# Month-level Student Performance Prediction
- This tidy version is intended for publication and excludes the exploratory data analysis (EDA).

## Set up 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from collections import Counter
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# import sys
# sys.version
# # 3.7.6
# # pd.__version__
# # # '1.1.4'
# # np.__version__
# # '1.18.5'

In [None]:
# Constants
# - analysis parameters
# -- calendar months covered by the analysis, in chronological order
ORDER_MONTH = [
    '2018-08', '2018-09', '2018-10', '2018-11', '2018-12',
    '2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06', '2019-07',
]
# -- log columns dropped during preprocessing
VARS_REDUNDANT = ['is_downgrade', 'is_upgrade']
# -- switch guarding the by-month by-individual cells
RUN_BY_IND_MONTH = True
# -- the first month is left out, as decided in the exploratory data analysis
MONTH_EXCLUDED = ['2018-08']

# - directories
PATH_INPUT = '/kaggle/input/learning-activity-public-dataset-by-junyi-academy/'
PATH_PREPROCESSED_INPUT = '../input/junyi-preprocessed/'
PATH_OUTPUT = '/kaggle/working/'

# - files
# -- problem log with the raw timestamp (preprocessed parquet)
FILE_LOG_FULL = os.path.join(
    PATH_PREPROCESSED_INPUT,
    'Log_Problem_raw_timestamp.parquet.gzip',
)
# -- user information table
FILE_USER = os.path.join(PATH_INPUT, 'Info_UserData.csv')

In [None]:
# Load the two input tables:
# - the problem log (parquet)
df_log = pd.read_parquet(path=FILE_LOG_FULL)
# - the user information (csv)
df_user = pd.read_csv(filepath_or_buffer=FILE_USER)

In [None]:
# Preprocessing
# - remove the columns listed in VARS_REDUNDANT
df_log = df_log.drop(labels=VARS_REDUNDANT, axis='columns')
# - derive a 'year-month' grouping key from the first 7 characters of the
#   timestamp string (the grouping cells assume these look like 'YYYY-MM')
df_log['year_month'] = df_log['timestamp_TW'].str.slice(0, 7)

In [None]:
# attach each user's grade to every log row (left join keeps all log rows)
df_user_grade = df_user[['uuid', 'user_grade']]
df_log = df_log.merge(df_user_grade, on='uuid', how='left')

## Descriptive Statistics: By-month By-individual 

This should be the basis for "student performance prediction".

### By-month by-individual average absolute accuracy (AAA)

$$AAA = \frac{\text{# correct attempts in a month}}{\text{# attempts in a month}}$$

In [None]:
if RUN_BY_IND_MONTH:
    # one row per (month, student): number of attempts, AAA and the student's grade
    grouped_month_user = df_log.groupby(by=['year_month', 'uuid'])
    df_by_month_ind = grouped_month_user.agg(
        n_logs=('is_correct', 'count'),
        accuracy=('is_correct', 'mean'),
        user_grade=('user_grade', 'first'),
    ).reset_index()
    # drop the months listed in MONTH_EXCLUDED (currently only the first month)
    is_excluded_month = df_by_month_ind['year_month'].isin(MONTH_EXCLUDED)
    df_by_month_ind = df_by_month_ind[~is_excluded_month]

In [None]:
if RUN_BY_IND_MONTH:
    # minimum number of logs a student needs in a month for that month to be kept
    MIN_LOGS_MONTH = 15
    # - keep only the user-month rows that reach the threshold
    has_enough_logs = df_by_month_ind['n_logs'] >= MIN_LOGS_MONTH
    df_by_month_ind = df_by_month_ind[has_enough_logs]

# Recorded number of distinct students, i.e. df_by_month_ind.uuid.nunique():
# - before the removal: 70683
# - after the removal:  53995

In [None]:
if RUN_BY_IND_MONTH:
    # per student, count the distinct months still present in `df_by_month_ind`
    df_user_active_months = (
        df_by_month_ind
        .groupby(by=['uuid'])['year_month']
        .nunique()
        .to_frame(name='n_active_months')
    )
    # attach that count to every user-month row
    df_by_month_ind = df_by_month_ind.merge(df_user_active_months, on='uuid')

In [None]:
if RUN_BY_IND_MONTH:
    # print (n_active_months, n_students) pairs in ascending order of n_active_months
    # Note: only 317 users were recorded as active across all 11 months
    counts_active_months = Counter(df_user_active_months['n_active_months'])
    print(sorted(counts_active_months.items(), key=lambda pair: pair[0]))

### By-month by-individual relative average accuracy (RAA)
$$ RAA = z(AAA) = \frac{\text{AAA} - \text{mean AAA of the same grade in the same month}}{\text{SD of AAA of the same grade in the same month}} $$

In [None]:
if RUN_BY_IND_MONTH:
    # reference mean and SD of `is_correct` for every (month, grade) group
    # NOTE(review): both statistics are taken over the individual log rows, not
    # over the per-student AAA values -- confirm this is the intended reference
    # distribution for the RAA z-score.
    df_by_month_accurcy = (
        df_log
        .groupby(['year_month', 'user_grade'])['is_correct']
        .agg(month_accuracy_mean='mean', month_accuracy_sd='std')
    )

In [None]:
if RUN_BY_IND_MONTH:
    # bring the (month, grade) reference statistics onto every user-month row
    df_by_month_ind = df_by_month_ind.merge(df_by_month_accurcy, on=['year_month', 'user_grade'])
    # standardise: (accuracy - group mean) / group SD
    deviation_from_group = df_by_month_ind['accuracy'].sub(df_by_month_ind['month_accuracy_mean'])
    df_by_month_ind['relative_accuracy'] = deviation_from_group.div(df_by_month_ind['month_accuracy_sd'])

## Predict Last Month RAA

1. Work on the students who have 11 active months (n = 317) first.

Predict the student’s performance in the last month  
- Features (X):  
    - Student grade  
    - Avg (across 10 months):
        - AVG_ACC: Accuracy; "Is_correct"
        - AVG_RT: RT; "total_sec_taken"
        - AVG_LEVEL: Level; "Level"
    - Avg 1st derivative (across 9 differences):
        - AVG_DIF_ACC: slope of accuracy; "avg_acc"
        - AVG_DIF_RT: slope of RT; "avg_rt"
        - AVG_DIF_LEVEL: slope of level; "avg_level"      
- Labels (y):   
    - RAA of the last month  
- Model:  
    - Linear regression  
- Evaluate Model performance:
    - k-fold R^2 (use 5 folds to evaluate model performance)
    - k-fold MSE (use 5 folds to evaluate model performance)
- Evaluate feature significance
    - generalized likelihood ratio test (GLRT)

## Prepare the df which contains the features and the label

In [None]:
# get the df with active users only
# - a user counts as fully active when they have an active month for every
#   analysed month, i.e. every month of ORDER_MONTH that is not in
#   MONTH_EXCLUDED (11 with the current constants); deriving the number keeps
#   this filter in sync if either constant changes
# - NOTE: `df_user_active_months` only exists when RUN_BY_IND_MONTH is True
N_MONTHS_ANALYSED = len([month for month in ORDER_MONTH if month not in MONTH_EXCLUDED])
list_user_active = df_user_active_months[df_user_active_months['n_active_months'] == N_MONTHS_ANALYSED].index
# - keep the logs of those users, still leaving out the excluded months
df_log_user_active = df_log[df_log.uuid.isin(list_user_active) & ~df_log['year_month'].isin(MONTH_EXCLUDED)]

In [None]:
# the label month is the latest month present in the active users' logs
month_last = df_log_user_active['year_month'].max()
# the feature months are all remaining months that are not excluded
month_past = pd.Series(ORDER_MONTH)
is_not_feature_month = month_past.eq(month_last) | month_past.isin(MONTH_EXCLUDED)
month_past = month_past[~is_not_feature_month]

In [None]:
# create the features based on the past months and demo
# - create the df of by-month ACC/RT/level (one row per month and student)
is_past_month = df_log_user_active['year_month'].isin(month_past)
df_past_by_month_feature = (
    df_log_user_active[is_past_month]
    .groupby(['year_month', 'uuid'])
    .agg(month_acc_mean=('is_correct', 'mean'),
         month_rt_mean=('total_sec_taken', 'mean'),
         month_level_mean=('level', 'mean'))
    .reset_index()
    # put each student's rows next to each other in chronological order
    # ('YYYY-MM' strings sort in calendar order)
    .sort_values(['uuid', 'year_month'])
    .reset_index(drop=True)
)
# - create the vars of diff
# -- month-over-month change WITHIN each student. The groupby output is ordered
#    by (year_month, uuid), so differencing consecutive rows of that frame would
#    subtract one student's value from another student's value in the same month.
# -- the first month of every student has no predecessor and stays NaN; the
#    `mean` below skips NaN, so each average is taken over the real differences
#    only (9 per student for 10 past months).
# -- NOTE: metrics recorded in comments further down this notebook were produced
#    before this correction and need to be regenerated.
df_month_dif = df_past_by_month_feature.groupby('uuid')[
    ['month_acc_mean', 'month_rt_mean', 'month_level_mean']
].diff()
df_past_by_month_feature['dif_acc'] = df_month_dif['month_acc_mean']
df_past_by_month_feature['dif_rt'] = df_month_dif['month_rt_mean']
df_past_by_month_feature['dif_level'] = df_month_dif['month_level_mean']

# create the by-ind average features across the past months
df_past_feature = df_past_by_month_feature.groupby(['uuid']).agg(
    avg_acc=('month_acc_mean', 'mean'),
    avg_rt=('month_rt_mean', 'mean'),
    avg_level=('month_level_mean', 'mean'),
    avg_dif_acc=('dif_acc', 'mean'),
    avg_dif_rt=('dif_rt', 'mean'),
    avg_dif_level=('dif_level', 'mean'),
).reset_index()
# - add user grade
df_past_feature = pd.merge(df_past_feature, df_user[['uuid', 'user_grade']], on='uuid', how='left')

In [None]:
# create the label
# - pick the last-month RAA rows of the fully active users ...
is_label_row = (df_by_month_ind['year_month'] == month_last) & df_by_month_ind['uuid'].isin(list_user_active)
df_label = df_by_month_ind.loc[is_label_row, ['uuid', 'relative_accuracy']]
# - ... and attach them to the feature table under the name `last_RAA`
df_past_feature = df_past_feature.merge(df_label, on='uuid').rename(columns={'relative_accuracy': 'last_RAA'})

## Train the model with multiple linear regression

In [None]:
# preview the first five rows of the feature/label table
df_past_feature.head(n=5)

In [None]:
from sklearn.linear_model import LinearRegression as lm
from sklearn import preprocessing
# import sklearn
# sklearn.__version__
# # '0.23.2'

In [None]:
# feature matrix: every column except the label and the user id
mX = df_past_feature.drop(columns=["last_RAA", "uuid"])
# label vector: RAA of the last month
y = df_past_feature["last_RAA"]

print('X shape is = ', mX.shape)
print('y shape is = ', y.shape)

#### Min-max transformation


In [None]:
# rescale every feature column with a min-max scaler fitted on the full feature matrix
scaler_min_max = preprocessing.MinMaxScaler()
mX = scaler_min_max.fit(mX).transform(mX)

#### Train the model

In [None]:
# ordinary least-squares regression with an intercept, fitted on all examples
model_linear = lm(fit_intercept=True).fit(mX, y)

In [None]:
# scatter the in-sample predictions against the observed labels
y_pred = model_linear.predict(mX)
plt.figure(figsize=(8, 8))
g = sns.scatterplot(x=y, y=y_pred)
g.set_xlabel('Ground-truth Last-Month RAA')
g.set_ylabel('Predicted Last-Month RAA')
plt.show()

### Evaluate the model with k-fold cross-validation

In [None]:
from sklearn.model_selection import KFold, cross_val_score
# cross-validated R^2
# - 5 shuffled folds with a fixed seed, so the split is reproducible
folds = KFold(n_splits=5, shuffle=True, random_state=100)
list_r2_kfold = cross_val_score(estimator=model_linear, X=mX, y=y, scoring='r2', cv=folds)
mean_r2_kfold = list_r2_kfold.mean()
pct_r2_kfold = round(mean_r2_kfold*100, 2)
print("The average model R^2 across 5 folds is: "+ str(pct_r2_kfold)+"%")

# Recorded output:
# The average model R^2 across 5 folds is: 41.55%

In [None]:
# cross-validated MSE on the same folds
# - the scorer returns the negated MSE, so the sign is flipped before averaging
list_mse_kfold = cross_val_score(estimator=model_linear, X=mX, y=y, scoring='neg_mean_squared_error', cv=folds)
mean_mse_kfold = np.mean(np.negative(list_mse_kfold))
print("The average model MSE across 5 folds is: "+ str(round(mean_mse_kfold,2)))

# Recorded output:
# The average model MSE across 5 folds is: 0.05

### Evaluate feature significance
- generalized likelihood ratio test (GLRT)

In [None]:
from scipy.stats import chi2

In [None]:
# Per-coefficient significance test of the linear model:
# for every coefficient, theta_j^2 / Var(theta_j) is compared with the 95%
# quantile of a chi-square distribution with 1 degree of freedom.

# number of examples
n = len(y)
# number of features
m = mX.shape[1]
# prepend the bias term (a column of ones)
mX_bias = np.concatenate([np.ones((n, 1)), mX], axis=1)
names_features = ["bias","avg_acc","avg_rt","avg_level","avg_dif_acc","avg_dif_rt","avg_dif_level","user_grade"]
# the names are hard-coded, so make sure they still match the design matrix
if len(names_features) != m + 1:
    raise ValueError("names_features must hold the bias term plus one name per column of mX")

# plain ndarray view of the label so all matrix algebra below stays in numpy
y_arr = np.asarray(y, dtype=float)

# (X^T X)^-1 is needed for both theta hat and its covariance: invert once
mXtX_inv = np.linalg.inv(mX_bias.T @ mX_bias)

# theta hat
# - (X^T X)^-1 X^T y
theta_hat = mXtX_inv @ mX_bias.T @ y_arr
print("theta hat = ")
print(np.round(theta_hat,3))

# sigma squared hat
# - (1/N) (y - X theta)^T (y - X theta)
residual = y_arr - mX_bias @ theta_hat
scalar_sigma_squared_hat = (1 / n) * (residual @ residual)

# significance test
cov_theta_hat = scalar_sigma_squared_hat * mXtX_inv
# the critical value does not depend on the coefficient: compute it once
threshold = chi2.ppf(0.95, df = 1)
for i_theta, name_theta in enumerate(names_features):
    theta = theta_hat[i_theta]
    # estimated variance of this coefficient
    nu_j_squared = cov_theta_hat[i_theta, i_theta]
    stat = (theta**2 / nu_j_squared)

    if stat > threshold:
        print("H1 is supported, i.e.,"+name_theta+" is a significant feature for this model.")
    else:
        print("H0 is supported, i.e.,"+name_theta+" is not a significant feature for this model.")

    p_value = chi2.sf(stat, df = 1)
    print("p value of "+name_theta+ " = "+str(p_value))
# theta hat = 
# [-0.817  1.377 -0.004 -0.333  0.179 -0.049  0.049  0.024]
# H1 is supported, i.e.,bias is a significant feature for this model.
# p value of bias = 2.2129523776122797e-19
# H1 is supported, i.e.,avg_acc is a significant feature for this model.
# p value of avg_acc = 1.5038429466665508e-26
# H0 is supported, i.e.,avg_rt is not a significant feature for this model.
# p value of avg_rt = 0.9744408936539256
# H1 is supported, i.e.,avg_level is a significant feature for this model.
# p value of avg_level = 0.015551833768520163
# H0 is supported, i.e.,avg_dif_acc is not a significant feature for this model.
# p value of avg_dif_acc = 0.17366994391436194
# H0 is supported, i.e.,avg_dif_rt is not a significant feature for this model.
# p value of avg_dif_rt = 0.7142351840297816
# H0 is supported, i.e.,avg_dif_level is not a significant feature for this model.
# p value of avg_dif_level = 0.7285586745395831
# H0 is supported, i.e.,user_grade is not a significant feature for this model.
# p value of user_grade = 0.7546338569087025