# Digital Education & Learning analytics - Project part. 2

In [223]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

%matplotlib inline

In [2]:
df = pd.read_csv('OutputTable.csv')
df.fillna(0, inplace=True)
df.query('SubmissionNumber > 0', inplace=True)
df['NVideoAndForum'] = df['NVideoEvents'].add(df['NForumEvents'])
df.head()
df.columns

Index(['ProblemID', 'UserID', 'SubmissionNumber', 'TimeStamp', 'TimeSinceLast',
       'Grade', 'GradeDiff', 'NVideoEvents', 'NForumEvents', 'NumberOfPosts',
       'TimeSpentOnForum', 'DurationOfVideoActivity', 'RewatchingScore',
       'NumberOfThreadCreated', 'LastVideoEvent', 'NumberVideoWatched',
       'NumberOfSlowPlay', 'NumberOfThreadViews', 'LastForumEvent',
       'NumberOfVideoInteractions', 'AverageVideoTimeDiffs', 'NumberOfUpvotes',
       'NVideoAndForum'],
      dtype='object')

In [164]:
def aggregate_df(df):
    aggregations = {
        'SubmissionNumber':{
            'TotalNumberOfSubmissions': 'last'
        },
        'TimeSinceLast':{
            'log_AvgTimeBwSubs': lambda x: np.log(np.mean(x))
        },
        'Grade':{ 
            'GradeDiff': lambda x: x.iloc[-1] - x.iloc[0] if len(x)>1 else x.iloc[0]
        },
        'NVideoEvents':{
            'NVideoEvents': 'sum'
        },    
        'NForumEvents':{
            'NForumEvents': 'sum'
        },
        'NumberOfPosts':{
            'NumberOfPosts': 'sum'
        },
        'TimeSpentOnForum':{
            'log_TimeSpentOnForum': lambda x: np.log(np.sum(x)+1)
        },
        'DurationOfVideoActivity':{
            'DurationOfVideoActivity': 'sum'
        },
        'RewatchingScore':{
            'RewatchingScore': 'sum'
        },
        'NumberOfThreadCreated':{
            'NumberOfThreadCreated': 'sum'
        },
        'NumberVideoWatched':{
            'NumberVideoWatched': 'sum'
        },
        'NumberOfThreadViews':{
            'NumberOfThreadViews': 'sum'
        },
        'NumberOfVideoInteractions':{
            'NumberOfVideoInteractions': 'sum'
        },
        'NumberOfUpvotes':{
            'NumberOfUpvotes': 'sum'
        },
        'NVideoAndForum': {
            'TotalNVideoAndForum': 'sum',
            'ActivityRate': lambda x: np.sum(x !=0) / len(x)
        }
    }
    data = df.groupby(by=['UserID', 'ProblemID'], as_index=True).agg(aggregations)
    data.columns = data.columns.droplevel()
    #data.query('TotalNumberOfSubmissions > 1', inplace=True)
    #data.query('TotalNVideoAndForum > 0', inplace=True)
    return data

In [168]:
data = aggregate_df(df)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,log_TimeSpentOnForum,log_AvgTimeBwSubs,GradeDiff,NumberVideoWatched,NumberOfPosts,NVideoEvents,NumberOfThreadViews,TotalNVideoAndForum,ActivityRate,DurationOfVideoActivity,NumberOfThreadCreated,RewatchingScore,NForumEvents,NumberOfUpvotes,TotalNumberOfSubmissions,NumberOfVideoInteractions
UserID,ProblemID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0009e7ff2e6bad5d80640eeb61cd6e0d,1,0.0,4.59512,12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0
0009e7ff2e6bad5d80640eeb61cd6e0d,3,0.0,5.55296,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
0009e7ff2e6bad5d80640eeb61cd6e0d,4,0.0,4.276666,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0
0009e7ff2e6bad5d80640eeb61cd6e0d,5,0.0,5.034786,13.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0.0
0009e7ff2e6bad5d80640eeb61cd6e0d,8,0.0,4.158883,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0


In [169]:
y = data.GradeDiff.as_matrix()
X = data.drop('GradeDiff', axis=1).as_matrix()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [170]:
def rmse_scorer(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
rmse = make_scorer(rmse_scorer)

## linear regression

In [171]:
reg = linear_model.LinearRegression()
reg.fit(X, y)
y_preds = reg.predict(X_test)

In [198]:
scores = cross_val_score(reg, X_train, y_train, cv=5, scoring=rmse)
scores

array([ 39.92075481,  39.90071365,  38.99189496,  38.78470736,  39.36556329])

## random forest

In [222]:
rf = RandomForestRegressor()
param_grid_rf = { 
    'n_estimators': [100, 200, 700],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [1,3,5,7, 10]
}
CV_rf = GridSearchCV(rf, param_grid=param_grid_rf, cv=5, verbose=2, scoring=rmse)
CV_rf.fit(X, y)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] n_estimators=100, max_features=auto, max_depth=1 ................
[CV] . n_estimators=100, max_features=auto, max_depth=1, total=   0.3s
[CV] n_estimators=100, max_features=auto, max_depth=1 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] . n_estimators=100, max_features=auto, max_depth=1, total=   0.2s
[CV] n_estimators=100, max_features=auto, max_depth=1 ................
[CV] . n_estimators=100, max_features=auto, max_depth=1, total=   0.3s
[CV] n_estimators=100, max_features=auto, max_depth=1 ................
[CV] . n_estimators=100, max_features=auto, max_depth=1, total=   0.2s
[CV] n_estimators=100, max_features=auto, max_depth=1 ................
[CV] . n_estimators=100, max_features=auto, max_depth=1, total=   0.3s
[CV] n_estimators=200, max_features=auto, max_depth=1 ................
[CV] . n_estimators=200, max_features=auto, max_depth=1, total=   0.5s
[CV] n_estimators=200, max_features=auto, max_depth=1 ................
[CV] . n_estimators=200, max_features=auto, max_depth=1, total=   0.5s
[CV] n_estimators=200, max_features=auto, max_depth=1 ................
[CV] . n_estimators=200, max_features=auto, max_depth=1, total=   0.5s
[CV] n_estimators=200, max_features=auto, max_depth=1 ................
[CV] .

[Parallel(n_jobs=1)]: Done 225 out of 225 | elapsed:  5.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 200, 700], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [1, 3, 5, 7, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(rmse_scorer), verbose=2)

In [239]:
print('best score:', CV_rf.best_score_, '\nwith parameters:', CV_rf.best_params_)

best score: 38.2949561977 
with parameters: {'n_estimators': 200, 'max_features': 'log2', 'max_depth': 1}


## MLP regressor

In [None]:
mlp = MLPRegressor()
param_grid_mlp = {
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [1000],
    'power_t': [0.5],
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0, 0.0001, 0.001, 0.01, 0.1],
    'batch_size': ['auto', 100, 200, 500],
    'early_stopping': [True, False]
}
CV_mlp = GridSearchCV(mlp, param_grid=param_grid_mlp, cv=5, verbose=1, scoring=rmse)
X_scaled = preprocessing.minmax_scale(X)
CV_mlp.fit(X_scaled,y)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


In [238]:
print('best score:', CV_mlp.best_score_, '\nwith parameters:', CV_mlp.best_params_)

best score: 244.973838408 
with parameters: {'learning_rate': 'invscaling', 'power_t': 0.5, 'max_iter': 500}


## Import test data

In [214]:
test_df = pd.read_csv('OutputTable_test.csv')
test_df.fillna(0, inplace=True)
test_df.query('SubmissionNumber > 0', inplace=True)
test_df['NVideoAndForum'] = test_df['NVideoEvents'].add(test_df['NForumEvents'])

In [215]:
test_data = aggregate_df(test_df)
test_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,log_TimeSpentOnForum,log_AvgTimeBwSubs,GradeDiff,NumberVideoWatched,NumberOfPosts,NVideoEvents,NumberOfThreadViews,TotalNVideoAndForum,ActivityRate,DurationOfVideoActivity,NumberOfThreadCreated,RewatchingScore,NForumEvents,NumberOfUpvotes,TotalNumberOfSubmissions,NumberOfVideoInteractions
UserID,ProblemID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
000143a7618ac9cae91b0bf7d059c1fd,4,0.0,5.209486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0
000143a7618ac9cae91b0bf7d059c1fd,5,0.0,8.222636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,0.0
000143a7618ac9cae91b0bf7d059c1fd,6,0.0,6.791783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0
000143a7618ac9cae91b0bf7d059c1fd,7,0.0,6.280396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0
0019bbc70f71e4620273237a1e99bfde,1,0.0,13.3367,0.0,14.0,0.0,196.0,0.0,196.0,0.5,977978.0,0.0,2.0,0.0,0.0,2,89.0


In [216]:
X_test = test_data.drop('GradeDiff', axis=1).as_matrix()
preds = CV_rf.predict(X_test)

In [218]:
test_data['OveralGradeDiff'] = preds
test_data[test_data['OveralGradeDiff'] > 100] = 100
test_data[test_data['OveralGradeDiff'] < 0] = 0
test_data.reset_index(inplace=True)
test_data['uniqRowID'] = test_data.UserID.astype(str) + '_' + test_data.ProblemID.astype(str)

In [219]:
classifier_template = pd.read_csv('data/regression_template.csv')
kaggle_submission = pd.merge(test_data, classifier_template, on='uniqRowID')
kaggle_submission = kaggle_submission[['uniqRowID', 'OveralGradeDiff']]

In [220]:
kaggle_submission.shape

(3767, 2)

In [221]:
kaggle_submission.to_csv('data/regression_results.csv', index=False)