# Digital Education & Learning Analytics - Project Part 2

In [38]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
# Load the table, replace missing values with 0, keep only rows whose
# SubmissionNumber is positive, and add the combined video + forum event count.
# Built as one chained expression (no in-place mutation) so `df` is produced in
# a single step.
df = (
    pd.read_csv('OutputTable.csv')
    .fillna(0)
    .query('SubmissionNumber > 0')
    .assign(NVideoAndForum=lambda t: t['NVideoEvents'].add(t['NForumEvents']))
)
df.columns

Index(['ProblemID', 'UserID', 'SubmissionNumber', 'TimeStamp', 'TimeSinceLast',
       'Grade', 'GradeDiff', 'NVideoEvents', 'NForumEvents', 'NumberOfPosts',
       'TimeSpentOnForum', 'DurationOfVideoActivity', 'RewatchingScore',
       'NumberOfThreadCreated', 'LastVideoEvent', 'NumberVideoWatched',
       'NumberOfSlowPlay', 'NumberOfThreadViews', 'LastForumEvent',
       'NumberOfVideoInteractions', 'AverageVideoTimeDiffs', 'NumberOfUpvotes',
       'NVideoAndForum'],
      dtype='object')

In [14]:
def aggregate_df(df):
    """Aggregate submission rows into one row per (UserID, ProblemID) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'UserID', 'ProblemID', 'SubmissionNumber',
        'TimeSinceLast', 'Grade', 'NVideoAndForum' and every column listed in
        ``sum_columns`` below. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by (UserID, ProblemID), restricted to groups whose
        'TotalNumberOfSubmissions' is greater than 1, with the columns:

        * the columns in ``sum_columns``: per-group sums;
        * 'TotalNumberOfSubmissions': SubmissionNumber of the last row;
        * 'AvgTimeBwSubs': natural log of the mean of 'TimeSinceLast';
        * 'GradeDiff': Grade of the last row minus Grade of the first row;
        * 'TotalNVideoAndForum': sum of 'NVideoAndForum';
        * 'ActivityRate': fraction of rows with a non-zero 'NVideoAndForum'.
    """
    sum_columns = [
        'NVideoEvents',
        'NForumEvents',
        'NumberOfPosts',
        'TimeSpentOnForum',
        'DurationOfVideoActivity',
        'RewatchingScore',
        'NumberOfThreadCreated',
        'NumberVideoWatched',
        'NumberOfThreadViews',
        'NumberOfVideoInteractions',
        'NumberOfUpvotes',
    ]

    # "First" and "last" below are positional, so make the submission order
    # explicit instead of relying on the row order of the input. The sort is
    # stable: rows with equal SubmissionNumber keep their original order.
    ordered = df.sort_values('SubmissionNumber', kind='mergesort')
    grouped = ordered.groupby(by=['UserID', 'ProblemID'])

    # Each output column is computed separately. The nested-dict renaming form
    # of `.agg` raises SpecificationError on pandas >= 1.0.
    data = grouped[sum_columns].sum()
    data['TotalNumberOfSubmissions'] = grouped['SubmissionNumber'].last()
    # A group whose mean is 0 yields -inf here, exactly as log(mean) always did.
    data['AvgTimeBwSubs'] = np.log(grouped['TimeSinceLast'].mean())
    data['GradeDiff'] = grouped['Grade'].agg(lambda g: g.iloc[-1] - g.iloc[0])
    data['TotalNVideoAndForum'] = grouped['NVideoAndForum'].sum()
    data['ActivityRate'] = grouped['NVideoAndForum'].agg(
        lambda a: np.sum(a != 0) / len(a))

    # A grade difference needs at least two submissions.
    return data.query('TotalNumberOfSubmissions > 1')

In [16]:
# Collapse the submission rows into one row per (UserID, ProblemID) pair and
# preview the first few aggregated rows.
data = df.pipe(aggregate_df)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,NVideoEvents,NumberOfPosts,TimeSpentOnForum,NumberOfVideoInteractions,GradeDiff,NForumEvents,RewatchingScore,AvgTimeBwSubs,DurationOfVideoActivity,TotalNumberOfSubmissions,NumberOfThreadCreated,TotalNVideoAndForum,ActivityRate,NumberOfThreadViews,NumberVideoWatched,NumberOfUpvotes
UserID,ProblemID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0009e7ff2e6bad5d80640eeb61cd6e0d,1,0.0,0.0,0.0,0.0,12.5,0.0,0.0,4.59512,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0
0009e7ff2e6bad5d80640eeb61cd6e0d,4,0.0,0.0,0.0,0.0,100.0,0.0,0.0,4.276666,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0
0009e7ff2e6bad5d80640eeb61cd6e0d,5,0.0,0.0,0.0,0.0,13.333333,0.0,0.0,5.034786,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0
0009e7ff2e6bad5d80640eeb61cd6e0d,11,0.0,0.0,2328.0,0.0,44.444444,30.0,0.0,6.482087,0.0,15,0.0,30.0,0.133333,14.0,0.0,0.0
0009e7ff2e6bad5d80640eeb61cd6e0d,14,0.0,0.0,51.0,0.0,100.0,12.0,0.0,7.757829,0.0,22,0.0,12.0,0.227273,6.0,0.0,0.0


In [25]:
# Fixed seed so the train/test partition, and therefore the reported test MSE,
# is the same on every run.
RANDOM_STATE = 42

# `.values` replaces `as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
y = data['GradeDiff'].values
X = data.drop('GradeDiff', axis=1).values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

## Baseline model - linear regression

In [26]:
# Baseline: linear regression with default settings, fitted on the training split.
reg = linear_model.LinearRegression().fit(X_train, y_train)
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [40]:
# Mean squared error of the baseline on the held-out split.
y_pred = reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred)

1242.7413713240019

In [39]:
# Five-fold cross-validation over the full data set. The scorer reports the
# negated MSE, so every entry is <= 0 and values closer to 0 are better.
scores = cross_val_score(
    estimator=reg,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores

array([-1270.95980092, -1410.0532485 , -1256.8083732 , -1280.21961331,
       -1328.30232063])