In this notebook, we will finally create predictive features using the logs we cleaned in notebook 2.2. Our focus, for now, will be prediction using an aggregate, non-temporal representation of each student.

We start by importing the logs and the remaining tables that we consider relevant for feature engineering and extraction.

#### 1. Importing the relevant packages, setting global variables and importing the relevant files

In [2]:
#core data libraries
import pandas as pd
import numpy as np

#viz related tools
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import LogNorm, Normalize
from matplotlib.ticker import MaxNLocator
import matplotlib as mpl
from matplotlib import cm
import seaborn as sns

#tqdm to monitor progress
from tqdm.notebook import tqdm, trange
#hook tqdm into pandas (enables the progress_* methods); bars are labelled "Progress"
tqdm.pandas(desc="Progress")

#global plot theme: paper context, white grid, Calibri font, (20, 12) figure size, font scale of 2
sns.set_theme(context='paper', style='whitegrid', font='Calibri', rc={"figure.figsize":(20, 12)}, font_scale=2)
import warnings
#NOTE: this silences every warning raised from here on, not only the noisy ones
warnings.filterwarnings('ignore')
#render matplotlib figures inline in the notebook output
%matplotlib inline

In [3]:
#notebook-wide settings reused by the cells below

#share of each course's duration that will be considered (1 = 100%)
course_threshold = 1

#two-color palette for visualizations
nova_ims_colors = ['#BFD72F', '#5C666C']

#standard colors for student aggregates and course aggregates, respectively
student_color, course_color = '#474838', '#1B3D2F'

#standard continuous colormap
standard_cmap = 'viridis_r'

In [14]:
#student logs: identifier columns are read as object; the exported index column
#('Unnamed: 0') and any column that is entirely empty are discarded
student_logs = (
    pd.read_csv(
        '../Data/Modeling Stage/R_Gonz_cleaned_logs.csv',
        dtype=dict.fromkeys(['id', 'itemid', 'userid', 'course', 'cmid'], object),
    )
    .drop(columns='Unnamed: 0')
    .dropna(axis='columns', how='all')
)

#support table: same clean-up as the logs, with both timestamp columns parsed as dates
support_table = (
    pd.read_csv(
        '../Data/R_Gonz_support_table.csv',
        dtype=dict.fromkeys(['assign_id', 'courseid', 'userid'], object),
        parse_dates=['sup_time', 'startdate'],
    )
    .drop(columns='Unnamed: 0')
    .dropna(axis='columns', how='all')
)

#class duration table (one row per course): only the exported index column is dropped
class_list = (
    pd.read_csv(
        '../Data/Modeling Stage/R_Gonz_class_duration.csv',
        dtype={'course': object},
        parse_dates=['Start Date', 'End Date', 'cuttoff_point'],
    )
    .drop(columns='Unnamed: 0')
)

#targets table: loaded as-is, with identifier columns read as object
targets_table = pd.read_csv(
    '../Data/Modeling Stage/R_Gonz_targets_table.csv',
    dtype=dict.fromkeys(['userid', 'courseid'], object),
)

In [20]:
#inspect the cleaned student logs loaded above
student_logs

Unnamed: 0,id,time,userid,ip,course,module,cmid,action,url,info
0,262124.0,2014-08-25 12:03:21,68582.0,127.0.0.1,985.0,course,0.0,view,view.php?id=985,985
1,262137.0,2014-08-25 12:04:40,68582.0,127.0.0.1,985.0,folder,35183.0,view,view.php?id=35183,680
2,262161.0,2014-08-25 12:07:37,68582.0,127.0.0.1,985.0,course,0.0,view,view.php?id=985,985
3,262162.0,2014-08-25 12:07:43,68582.0,127.0.0.1,985.0,folder,35189.0,view,view.php?id=35189,685
4,268473.0,2014-08-26 10:23:43,68582.0,127.0.0.1,985.0,course,0.0,view,view.php?id=985,985
...,...,...,...,...,...,...,...,...,...,...
7886985,46941392.0,2015-07-09 09:06:31,78085.0,127.0.0.1,4873.0,course,0.0,view,view.php?id=4873,4873
7886986,46941411.0,2015-07-09 09:06:50,78085.0,127.0.0.1,4873.0,course,0.0,recent,recent.php?id=4873,4873
7886987,46941443.0,2015-07-09 09:08:09,78085.0,127.0.0.1,4873.0,forum,273852.0,view forum,view.php?id=273852,9459
7886988,46944179.0,2015-07-09 10:56:34,5356.0,127.0.0.1,4873.0,course,0.0,view,view.php?id=4873,4873


In [21]:
#inspect the support table loaded above
support_table

Unnamed: 0,assign_id,courseid,startdate,userid,sup_time,mandatory_status,delivered,assignment_mark
0,588.0,18.0,2014-09-03 22:00:00,17881.0,2014-11-04 16:22:27,1.0,1,1.000000
1,588.0,18.0,2014-09-03 22:00:00,18204.0,2014-11-04 16:22:36,1.0,1,1.000000
2,588.0,18.0,2014-09-03 22:00:00,18541.0,2014-11-04 16:22:45,1.0,1,1.000000
3,588.0,18.0,2014-09-03 22:00:00,22888.0,2014-11-04 16:22:51,1.0,1,1.000000
4,588.0,18.0,2014-09-03 22:00:00,23037.0,2014-11-04 16:22:58,1.0,1,1.000000
...,...,...,...,...,...,...,...,...
221357,275678.0,5909.0,2015-07-04 22:00:00,78349.0,2015-07-28 20:06:44,0.0,1,0.592105
221358,275678.0,5909.0,2015-07-04 22:00:00,78352.0,2015-07-28 20:08:25,0.0,1,0.907895
221359,275678.0,5909.0,2015-07-04 22:00:00,78439.0,2015-07-28 20:11:42,0.0,1,0.355263
221360,275678.0,5909.0,2015-07-04 22:00:00,78442.0,2015-07-28 20:13:11,0.0,1,0.592105


In [22]:
#inspect the class duration table loaded above
class_list

Unnamed: 0,course,Users per course,Start Date,End Date,Course duration,cuttoff_point
0,1000.0,16,2014-08-31,2015-01-19,140 days 20:15:02.849999872,2014-08-24
1,1002.0,22,2014-09-28,2015-03-04,156 days 11:08:48.800000,2014-09-21
2,1010.0,58,2014-09-11,2014-12-22,101 days 11:07:01.200000,2014-09-04
3,1013.0,76,2014-09-23,2015-02-02,131 days 11:58:50,2014-09-16
4,1020.0,72,2015-01-11,2015-06-04,143 days 18:38:06.100000,2015-01-04
...,...,...,...,...,...,...
557,984.0,14,2014-08-31,2015-01-27,148 days 19:20:00.750000128,2014-08-24
558,985.0,22,2014-08-24,2015-02-25,184 days 09:31:20.900000,2014-08-17
559,992.0,17,2015-02-09,2015-05-12,91 days 09:20:18.500000,2015-02-02
560,993.0,63,2014-09-11,2015-01-12,122 days 21:53:08.800000,2014-09-04


In [23]:
#inspect the targets table loaded above
targets_table

Unnamed: 0,courseid,userid,Grade Mandatory,Grade Optional,final_mark
0,1000.0,17391.0,0.500000,,10.000000
1,1000.0,21448.0,0.469512,0.0,4.695122
2,1000.0,21844.0,0.469512,,9.390244
3,1000.0,22429.0,0.451220,,9.024390
4,1000.0,22535.0,0.256098,,5.121951
...,...,...,...,...,...
30505,999.0,74108.0,0.316667,0.0,3.166667
30506,999.0,74173.0,0.311667,0.0,3.116667
30507,999.0,74256.0,0.328333,0.0,3.283333
30508,999.0,8491.0,0.380000,0.0,3.800000
