In [None]:
# https://stackoverflow.com/questions/38931566
def background_gradient(s, m=None, M=None, cmap='Reds', low=0, high=0):
    """Build per-cell ``background-color`` CSS for a whole DataFrame.

    Intended to be passed to ``Styler.apply(..., axis=None)``, which hands the
    complete frame to this function and expects a same-shaped frame of CSS
    strings back.

    Relies on the notebook-level imports ``colors`` (matplotlib.colors),
    ``plt`` (matplotlib.pyplot) and ``pd`` (pandas) being available when the
    function is called.

    Parameters
    ----------
    s : DataFrame
        Values to colour (assumed numeric).
    m, M : scalar, optional
        Values mapped to the bottom / top of the colour scale. Default to the
        overall minimum / maximum of ``s``.
    cmap : str
        Name of the matplotlib colormap.
    low, high : float
        Fractions of ``M - m`` by which the normalisation range is extended
        below ``m`` / above ``M`` (compresses the colours that are used).

    Returns
    -------
    DataFrame
        Same index and columns as ``s``; each cell is a string of the form
        ``'background-color: #rrggbb'``.
    """
    if m is None:
        m = s.min().min()
    if M is None:
        M = s.max().max()
    rng = M - m
    norm = colors.Normalize(m - (rng * low), M + (rng * high))

    # plt.get_cmap is available in old and new matplotlib releases, whereas
    # plt.cm.get_cmap has been removed from recent ones.
    cm = plt.get_cmap(cmap)

    # Normalise and colour-map the whole 2-D array at once; the result has one
    # RGBA quadruple per cell.
    rgba = cm(norm(s.values))
    css = [
        ['background-color: %s' % colors.rgb2hex(cell) for cell in row]
        for row in rgba
    ]

    # Build the frame on s's own labels so the styles always line up with the
    # data being styled.
    return pd.DataFrame(css, index=s.index, columns=s.columns)

In [None]:
%load_ext autoreload
%autoreload 2
from matplotlib import colors
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import math

In [None]:
from scipy.stats import truncnorm

# https://stackoverflow.com/questions/36894191/how-to-get-a-normal-distribution-within-a-range-in-numpy
def get_truncated_normal(mean, sd, size, low, upp, random_state=None):
    """Draw samples from a normal distribution truncated to ``[low, upp]``.

    Parameters
    ----------
    mean, sd : float
        Location and scale of the underlying (untruncated) normal.
    size : int or tuple of int
        Number / shape of samples to draw.
    low, upp : float
        Lower and upper truncation bounds, in the same units as ``mean``.
    random_state : optional
        Seed or generator forwarded to ``rvs`` so that draws can be
        reproduced. ``None`` (the default) keeps the previous behaviour of
        using scipy's global random state.

    Returns
    -------
    numpy.ndarray
        The sampled values.
    """
    # truncnorm expects its bounds in standard-deviation units relative to loc.
    a = (low - mean) / sd
    b = (upp - mean) / sd
    return truncnorm(a, b, loc=mean, scale=sd).rvs(size, random_state=random_state)

In [None]:
# Simulation parameters: class size, then mean / standard deviation of the
# actual (A) and estimated (E) grade distributions.
num_students = 30

mean_A = 80
std_A = 20

mean_E = 75
std_E = 25

# Grades are drawn from normals truncated to [0, 100] (rather than a plain
# np.random.normal draw) so that no value falls outside the grading scale.
grades_A = get_truncated_normal(mean_A, std_A, num_students, 0, 100)
grades_E = get_truncated_normal(mean_E, std_E, num_students, 0, 100)

# Apply the seaborn theme before the figure is created so the new axes are
# styled on the first run, not only on a re-run of this cell.
sns.set(color_codes=True)
fig, ax = plt.subplots(2, 2, figsize=(30, 15))

# Left column: actual grades; right column: estimated grades.
# Top row: histogram + KDE; bottom row: histogram + rug plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# histplot / kdeplot / rugplot are the suggested replacements.
ax[0][0].set_title("Actual grades PDF")
sns.distplot(grades_A, kde=True, rug=False, ax=ax[0][0])
ax[0][1].set_title("Estimate grades PDF")
sns.distplot(grades_E, kde=True, rug=False, ax=ax[0][1])
sns.distplot(grades_A, kde=False, rug=True, ax=ax[1][0])
sns.distplot(grades_E, kde=False, rug=True, ax=ax[1][1])

In [None]:
# Split the 0-100 grade scale into `num_bins` equal-width buckets.
num_bins = 10
bins = np.linspace(0, 100, num=num_bins + 1)

# pd.cut maps every grade to the interval (bucket) that contains it; the
# lowest edge is included so that a grade equal to the first edge still gets a bucket.
cut_A, cut_E = (
    pd.cut(grades, bins=bins, include_lowest=True, right=True)
    for grades in (grades_A, grades_E)
)

# Show each actual grade next to the bucket it was assigned to.
pd.DataFrame({'grades': grades_A, 'cut': cut_A})


In [None]:
# One frame per grade set; reset_index() exposes the row position as an
# 'index' column.
df_A, df_E = (
    pd.DataFrame({'bucket': buckets}).reset_index()
    for buckets in (cut_A, cut_E)
)

In [None]:
# Pair each row's actual bucket with its estimated bucket by joining the two
# frames on the shared 'index' column.
merged_df = df_A.merge(df_E, on='index', suffixes=('_actual', '_estimated'))

# Reference only (not executed here): converting a multi-level DataFrame
# into a 2-D array, from https://stackoverflow.com/a/43921476/768439
#   m, n = len(merged_df.index.levels[0]), len(merged_df.index.levels[1])
#   trans_matrix = merged_df.values.reshape(m, n)


In [None]:
# Count rows for every (actual bucket, estimated bucket) pair.
# observed=False is passed explicitly: the later reshape into an m x n matrix
# needs a row for every combination of bucket categories, including those that
# never occur, and newer pandas releases change the default to observed=True.
merged_df = merged_df.groupby(
    ['bucket_actual', 'bucket_estimated'], observed=False
).count()
merged_df

In [None]:
# Turn the two-level (actual, estimated) count table into a plain 2-D numpy
# array: one row per level-0 value, one column per level-1 value.
# Technique from https://stackoverflow.com/a/43921476/768439
m, n = map(len, merged_df.index.levels[:2])
trans_matrix = merged_df.values.reshape((m, n))

In [None]:
# Display the raw m x n count matrix.
merged_df.values.reshape((m, n))

In [None]:
# Normalise the counts by the number of students so every cell holds the
# fraction of the class in that (actual, estimated) bucket pair.
total = num_students

# Bucket labels: 5, 15, ..., i.e. the midpoints of the 10-wide grade buckets.
axis = np.linspace(5, 105, num_bins + 1)[:-1]

# Vectorised replacement for the per-cell applymap: divide, round to two
# decimals, and turn any missing count into 0. Built in one chain from
# trans_matrix so re-running the cell always gives the same result.
trans_df = (
    pd.DataFrame(trans_matrix, columns=axis, index=axis)
    .div(total)
    .round(2)
    .fillna(0)
)
trans_df

In [None]:
# Label the axes: rows are actual buckets, columns are estimated buckets.
trans_df.index.name = 'Actual'
trans_df.columns.name = 'Estimated'

# Render the table with a caption and a colour gradient computed over the
# whole frame (axis=None) by background_gradient.
(
    trans_df.style
    .set_caption("Transition matrix")
    .apply(background_gradient, high=1, axis=None)
)