In [None]:
# set a random seed for repeatable results, if you wish.
# Unneeded due to 50 runs of each model, do not run in most circumstances
from numpy.random import seed
import tensorflow as tf

SEED = 7567  # single value shared by every library seeded below

seed(SEED)
# TensorFlow 2.x no longer exposes a top-level `tensorflow.set_random_seed`
# (importing it raises ImportError); the global seed is set via tf.random.
tf.random.set_seed(SEED)

In [None]:
# import all libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from datagen import *
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# for GPU usage, may need to remove for your setup
# Build a TF1-compatibility session whose GPU options have allow_growth
# enabled; `config` and `sess` stay available at module level.
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
sess = tf.compat.v1.Session(config=config)

In [None]:
# Run to use LaTeX fonts in matplotlib plots. Currently not working.
#plt.rcParams.update({
#    "text.usetex": True,
#    "font.family": "sans-serif",
#    "font.sans-serif": ["Computer Modern Sans Serif"]})
# for Palatino and other serif fonts use:
#plt.rcParams.update({
#    "text.usetex": True,
#    "font.family": "serif",
#    "font.serif": ["Computer Modern Roman"],
#})

In [None]:
# Settings for the statistics sweep; edit these, then run this cell.
# NOTE: the cell below (which defines collect_statistics) must be run first.

# number of runs per configuration; results are averaged over these runs
num_runs = 50
# dataset sizes (number of generated rows) to try
dataset_sizes = [2500, 5000, 10000, 20000, 50000]
# batch size used when fitting the neural network
batch_size = 32
# training lengths (number of epochs) to try
epochs = [25, 50, 100, 200]
# Keras training output: 0 = silent, 1 = progress bar, 2 = one line per epoch
verbose = 0

model, data_matrix = collect_statistics(
    num_runs=num_runs,
    num_rowss=dataset_sizes,
    batch_size=batch_size,
    epochss=epochs,
    verbose=verbose,
)

In [None]:
# defining function for collecting statistics, cell for running is above.
# Shared preprocessing objects. collect_statistics (re)fits them on every run
# and predict reuses them in their fitted state, so they live at module level.

def _one_hot_column(column_index):
    """Return a ColumnTransformer that one-hot encodes one column and passes the rest through."""
    return ColumnTransformer(
        transformers=[('encoder', OneHotEncoder(), [column_index])],
        remainder='passthrough',
    )


# Label-encodes feature column 0 (presumably gender - TODO confirm against datagen).
le = LabelEncoder()

# The three transformers are applied one after another. Each one places its
# one-hot columns first and the untouched columns after them, so indices 7 and
# 18 refer to positions in the already-transformed array, not the raw features.
ct = _one_hot_column(2)
ct2 = _one_hot_column(7)
ct3 = _one_hot_column(18)

# Min-max scalers for the feature matrix and the target column.
scalarX = MinMaxScaler()
scalarY = MinMaxScaler()

# TARS: fully connected regression network,
# 29 input features -> 50 -> 30 -> 50 (all ReLU) -> 1 linear output.
TARS = tf.keras.models.Sequential([
    tf.keras.layers.Dense(input_dim=29, units=50, activation='relu'),
    tf.keras.layers.Dense(units=30, activation='relu'),
    tf.keras.layers.Dense(units=50, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='linear'),
])
# Trained with Adam on mean squared error; MSE is also reported as a metric.
TARS.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=['mse'],
)

def collect_statistics(num_runs, num_rowss, batch_size, epochss, verbose):
    """Train and score TARS repeatedly over a grid of epoch counts and dataset sizes.

    For every (epochs, num_rows) combination, ``num_runs`` runs are performed.
    Each run generates a new dataset with ``datagen``, refits the module-level
    encoders and scalers on it, resets TARS, trains on an 80 % split and
    evaluates on the remaining 20 %. A summary line with the run averages is
    printed per combination.

    Parameters
    ----------
    num_runs : int
        Number of runs per configuration (must be >= 1).
    num_rowss : iterable of int
        Dataset sizes passed to ``datagen``.
    batch_size : int
        Batch size passed to ``TARS.fit``.
    epochss : iterable of int
        Epoch counts passed to ``TARS.fit``.
    verbose : int
        Verbosity passed to ``TARS.fit``.

    Returns
    -------
    tuple
        ``(TARS, matrix)`` where ``matrix[i][j]`` is the list of per-run
        results for ``epochss[i]`` and ``num_rowss[j]``. Each result is
        ``[error, explained_variance, mse]``: ``error`` is the first value
        returned by ``TARS.evaluate`` on the scaled test split, the other two
        are computed on targets transformed back to the original scale.

    Raises
    ------
    ValueError
        If ``num_runs`` is smaller than 1 (no runs to average).
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    matrix = []
    for epochs in epochss:
        epoch_row = []
        for num_rows in num_rowss:
            loss_array = []
            for _ in range(num_runs):
                dframe = datagen(num_rows)
                # First column is skipped, last column is the target.
                x = dframe.iloc[:, 1:-1].values
                # reshape(-1, 1) does not depend on datagen returning exactly num_rows rows.
                y = dframe.iloc[:, -1].values.reshape(-1, 1)

                x[:, 0] = le.fit_transform(x[:, 0])
                x = np.array(ct.fit_transform(x))
                x = np.array(ct2.fit_transform(x))
                x = np.array(ct3.fit_transform(x))

                # NOTE(review): the scalers are fitted on the full dataset before
                # the split, so test rows influence the scaling. Left unchanged
                # because predict() reuses these fitted objects.
                x = scalarX.fit_transform(x)
                y = scalarY.fit_transform(y)
                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

                _reset_model(TARS)
                TARS.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=verbose)
                error = TARS.evaluate(x_test, y_test, verbose=0)[0]

                # Score on the original target scale; verbose=0 keeps the
                # per-run prediction from printing progress output.
                y_pred = scalarY.inverse_transform(TARS.predict(x_test, verbose=0))
                y_test = scalarY.inverse_transform(y_test)
                explained_variance = 1 - np.var(y_test - y_pred) / np.var(y_test)
                mse = np.mean((y_test - y_pred) ** 2)
                loss_array.append([error, explained_variance, mse])

            mean_error, mean_explained, mean_mse = np.mean(loss_array, axis=0)
            print(f"Epoch number: {epochs}, num_rows: {num_rows}, model loss: {mean_error}, explained variance: {mean_explained*100}%, mse: {mean_mse}")
            epoch_row.append(loss_array)
        matrix.append(epoch_row)

    return TARS, matrix


def _reset_model(model):
    """Re-initialise the weights of ``model`` and give it a fresh optimizer."""
    for layer in model.layers:
        if hasattr(layer, 'kernel_initializer') and hasattr(layer, 'bias_initializer'):
            old_weights, old_biases = layer.get_weights()
            layer.set_weights([
                layer.kernel_initializer(shape=old_weights.shape),
                layer.bias_initializer(shape=old_biases.shape),
            ])
    # Resetting the weights alone keeps the optimizer's accumulated state from
    # the previous run. Recompiling creates a new optimizer so each run starts
    # clean. Mirrors the module-level TARS.compile(...) call; keep in sync.
    model.compile(optimizer="adam", loss="mean_squared_error", metrics=['mse'])

PLOTTING VARIABLES/DATA BELOW

In [None]:
# Render a diagram of the TARS architecture.

plot_options = dict(
    show_shapes=True,        # annotate each layer with its tensor shapes
    show_dtype=False,
    show_layer_names=False,
    rankdir="TB",            # top-to-bottom layout
    expand_nested=True,
    dpi=96,
)
tf.keras.utils.plot_model(TARS, **plot_options)

In [None]:
# Generate a simulated dataset and display it.
SAMPLE_ROWS = 20000  # number of rows requested from datagen
dframe = datagen(SAMPLE_ROWS)
dframe

In [None]:
# Histogram of the 'gpadifference' column of the generated data.

hist_style = dict(bins='auto', color='#0504aa', alpha=0.7, rwidth=0.85)
n, bins, patches = plt.hist(x=dframe['gpadifference'], **hist_style)

# Horizontal grid lines only, then axis labels and title.
plt.grid(axis='y', alpha=0.75)
plt.xlabel('GPAs')
plt.ylabel('Frequency')
plt.title('GPA Difference')

In [None]:
#multi-graph matrix display of all data

from pandas.plotting import scatter_matrix
led = LabelEncoder()
categorical_cols = ['gender', 'teacher_cred', 'disability', 'accomadation']
# Encode a copy: assigning into `dframe` itself would replace its string labels
# with integers, breaking later cells that filter on the raw labels (e.g.
# disability == 'Autism') and making this cell non-idempotent.
le_dframe = dframe.copy()
le_dframe[categorical_cols] = le_dframe[categorical_cols].apply(led.fit_transform)
# The first column is left out of the plot.
scatter_matrix(le_dframe.iloc[:, 1:], alpha=0.2, figsize=(12, 12), diagonal='kde')

In [None]:
# Mean GPA difference of students with autism, grouped by teacher credentials.
# Requires `dframe` to still hold the raw string labels in `disability`.
autism_gain_by_cred = (
    dframe.query("disability == 'Autism'")
    # Select the column before aggregating: averaging the whole frame fails on
    # pandas >= 2.0 because of the non-numeric (string) columns.
    .groupby('teacher_cred')['gpadifference']
    .mean()
)
# The former positional `data` argument was dropped: the name is not defined
# anywhere in the notebook, and Series.plot.bar takes no such argument.
autism_gain_by_cred.plot.bar(figsize=(5,5), color = 'red')
plt.xlabel('Teacher Credentials')
plt.ylabel('Predicted GPA Gain')
plt.xticks(rotation=0)
plt.title('Predicted GPA Gain by Credentials of Teacher')
plt.savefig("pattern.svg", format="svg", dpi=300, bbox_inches='tight')

In [None]:
# Example use of predict() for an individual student.
# Run the cell below (which defines predict) before running this one.
#
# Input order: gender, age, teacher_cred, class size, disability,
# and optionally an accommodation as a sixth value.
#   - with an accommodation: reports the expected GPA difference
#   - without one: reports the predicted most effective accommodation and plots it
sample_student = np.array([["Female", "15", "Bachelor's", "25", "ADHD"]])
predict(sample_student)

In [None]:
# predict function for individual students 
# If given all variables, returns expected GPA difference
# If not given accomodation, returns predicted most effective accomodation and plots

def predict(sample_student):
    """Predict the GPA difference for one student, per accommodation.

    Uses the module-level ``le``, ``ct``, ``ct2``, ``ct3``, ``scalarX``,
    ``scalarY`` and ``TARS`` in their current (already fitted/trained) state.

    Parameters
    ----------
    sample_student : 2-D array-like of str
        ``[[gender, age, teacher_cred, class size, disability]]``, optionally
        with an accommodation as a sixth value. Only the first row is used.

    Returns
    -------
    None
        If the last value is a known accommodation, the predicted GPA
        difference for it is printed. Otherwise every known accommodation is
        evaluated, a bar chart of the predictions is drawn and the best one is
        printed.
    """
    accomodation_list = ["Materials in Braille", "Text to Speech Devices", "Breakout Corner", "Use of Toy in Class", "Bigger Print Materials", "Isolated Workstation", "Tutoring Sessions", "Book Buddy", "Use of Calculator on Tests", "AAC Devices", "Special Education Classroom"]

    features = np.asarray(sample_student)[0]
    if features[-1] in accomodation_list:
        # The accommodation is already part of the input: evaluate only that
        # one and strip it from the base features, otherwise it would be
        # appended a second time and the row would have one column too many.
        accomodation_list = [features[-1]]
        features = features[:-1]

    gpadiffs = []
    for accomodation in accomodation_list:
        # One row: base features followed by the candidate accommodation.
        row = np.array([np.append(features, accomodation)])
        row[:, 0] = le.transform(row[:, 0])
        row = np.array(ct.transform(row))
        row = np.array(ct2.transform(row))
        row = np.array(ct3.transform(row))
        row = scalarX.transform(row)
        # Predict once per candidate and map back to the original GPA scale.
        gpadiffs.append(scalarY.inverse_transform(TARS.predict(row))[0][0])

    if len(gpadiffs) > 1:
        # argmax returns the first maximum and needs no sentinel start value.
        bestaccom = accomodation_list[int(np.argmax(gpadiffs))]
        fig = plt.figure(figsize=(7,7))
        ax = fig.add_axes([0,0,1,1])
        plt.xticks(rotation=45)
        ax.bar(accomodation_list, gpadiffs)
        print(f"{bestaccom} is the predicted best accomodation.")
    else:
        print(f"Predicted GPA Difference for this student is: {gpadiffs[0]}.")