# Prepare Dataset

### GDrive setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd 'drive/My Drive/Colab Notebooks/4th year project/OULAD dataset/Deadline'

/content/drive/My Drive/Colab Notebooks/4th year project/OULAD dataset/Deadline


In [None]:
ls

[0m[01;34m120[0m/  [01;34m150[0m/  [01;34m180[0m/  [01;34m210[0m/  [01;34m240[0m/  [01;34m270[0m/  [01;34m30[0m/  [01;34m60[0m/  [01;34m90[0m/


## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Every CSV this notebook reads, in the same order as the names they are bound to below.
csv_paths = [
  '../../OULAD dataset/anonymisedOUDataSet/assessments.csv',
  '../../OULAD dataset/anonymisedOUDataSet/courses.csv',
  '../../OULAD dataset/anonymisedOUDataSet/studentAssessment.csv',
  '../../OULAD dataset/anonymisedOUDataSet/studentInfo.csv',
  '../../OULAD dataset/anonymisedOUDataSet/studentRegistration.csv',
  '../../OULAD dataset/anonymisedOUDataSet/studentVle.csv',
  '../../OULAD dataset/anonymisedOUDataSet/vle.csv',
  '../../OULAD dataset/StudentAssessmentDaysDeadline.csv',
]

# Load each file once; the unpacking order must match `csv_paths`.
(assessments_df,
 courses_df,
 student_assessments_df,
 student_info_df,
 student_registration_df,
 student_vle_df,
 vle_df,
 sa_days_deadline_df) = map(pd.read_csv, csv_paths)

# Default figure size for every seaborn/matplotlib plot (11.7 x 8.27 in is A4 landscape).
figure_size = (11.7,8.27)
sns.set(rc={'figure.figsize':figure_size})

In [None]:
assessments_df['code_module'].value_counts()

FFF    52
BBB    42
DDD    35
GGG    30
CCC    20
EEE    15
AAA    12
Name: code_module, dtype: int64

In [None]:
assessments_df['code_presentation'].value_counts()

2014B    57
2014J    57
2013J    53
2013B    39
Name: code_presentation, dtype: int64

### Dates for each assessment

#### Course Presentation for AAA
* 2013J
* 2014J

Assessments are the same each year

In [None]:
assessments_df.loc[assessments_df['code_module']=="AAA"].code_presentation.value_counts()

2013J    6
2014J    6
Name: code_presentation, dtype: int64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="AAA") & (assessments_df['code_presentation']=="2013J")].sort_values('date').date

0     19.0
1     54.0
2    117.0
3    166.0
4    215.0
5      NaN
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="AAA") & (assessments_df['code_presentation']=="2014J")].sort_values('date').date

6      19.0
7      54.0
8     117.0
9     166.0
10    215.0
11      NaN
Name: date, dtype: float64

#### Course Presentation for BBB
* 2013B
* 2013J
* 2014B
* 2014J


2013B, 2013J and 2014B share the same assessment structure (12 assessments, with slightly shifted dates); 2014J differs, with only 6 assessments

In [None]:
assessments_df.loc[assessments_df['code_module']=="BBB"].code_presentation.value_counts()

2013J    12
2014B    12
2013B    12
2014J     6
Name: code_presentation, dtype: int64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="BBB") & (assessments_df['code_presentation']=="2013B")].sort_values('date').date

17     19.0
18     47.0
12     54.0
13     89.0
19     89.0
14    124.0
20    124.0
15    159.0
21    159.0
16    187.0
22    187.0
23      NaN
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="BBB") & (assessments_df['code_presentation']=="2013J")].sort_values('date').date

29     19.0
30     47.0
24     54.0
25     96.0
31     96.0
26    131.0
32    131.0
27    166.0
33    166.0
28    208.0
34    208.0
35      NaN
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="BBB") & (assessments_df['code_presentation']=="2014B")].sort_values('date').date

41     12.0
42     40.0
36     47.0
37     82.0
43     82.0
38    117.0
44    117.0
39    152.0
45    152.0
40    194.0
46    194.0
47      NaN
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="BBB") & (assessments_df['code_presentation']=="2014J")].sort_values('date').date

48     19.0
49     54.0
50    110.0
51    152.0
52    201.0
53      NaN
Name: date, dtype: float64

#### Course Presentation for CCC

* 2014B
* 2014J

Assessments are the same in each presentation, with minor variations in the assessment dates

In [None]:
assessments_df.loc[assessments_df['code_module']=="CCC"].code_presentation.value_counts()

2014J    10
2014B    10
Name: code_presentation, dtype: int64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="CCC") & (assessments_df['code_presentation']=="2014B")].sort_values('date').date

54     18.0
58     32.0
55     67.0
59    102.0
56    137.0
60    151.0
61    200.0
57    207.0
62      NaN
63      NaN
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="CCC") & (assessments_df['code_presentation']=="2014J")].sort_values('date').date

64     18.0
68     32.0
65     67.0
69    109.0
66    144.0
70    158.0
71    207.0
67    214.0
72      NaN
73      NaN
Name: date, dtype: float64

#### Course Presentation for DDD

* 2013B
* 2013J
* 2014B
* 2014J

2013B has a different assessment structure (14 assessments); the other presentations share a similar structure (7 assessments each) with small variations in dates

In [None]:
assessments_df.loc[assessments_df['code_module']=="DDD"].code_presentation.value_counts()

2013B    14
2013J     7
2014J     7
2014B     7
Name: code_presentation, dtype: int64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="DDD") & (assessments_df['code_presentation']=="2013B")].sort_values('date').date

74     23.0
81     25.0
75     51.0
82     53.0
76     79.0
83     81.0
77    114.0
84    116.0
78    149.0
85    151.0
79    170.0
86    200.0
80    206.0
87    240.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="DDD") & (assessments_df['code_presentation']=="2013J")].sort_values('date').date

88     25.0
89     53.0
90     88.0
91    123.0
92    165.0
93    207.0
94    261.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="DDD") & (assessments_df['code_presentation']=="2014B")].sort_values('date').date

95      25.0
96      53.0
97      74.0
98     116.0
99     158.0
100    200.0
101    241.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="DDD") & (assessments_df['code_presentation']=="2014J")].sort_values('date').date

102     20.0
103     41.0
104     62.0
105    111.0
106    146.0
107    195.0
108      NaN
Name: date, dtype: float64

#### Course Presentation for EEE

* 2013J
* 2014B
* 2014J

Assessments are the same in each presentation, with small variations in dates

In [None]:
assessments_df.loc[assessments_df['code_module']=="EEE"].code_presentation.value_counts()

2013J    5
2014J    5
2014B    5
Name: code_presentation, dtype: int64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="EEE") & (assessments_df['code_presentation']=="2013J")].sort_values('date').date

109     33.0
110     68.0
111    124.0
112    159.0
113    235.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="EEE") & (assessments_df['code_presentation']=="2014B")].sort_values('date').date

114     33.0
115     68.0
116    117.0
117    152.0
118    228.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="EEE") & (assessments_df['code_presentation']=="2014J")].sort_values('date').date

119     33.0
120     68.0
121    131.0
122    166.0
123    235.0
Name: date, dtype: float64

#### Course Presentation for FFF

* 2013B
* 2013J
* 2014B
* 2014J

Same assessment structure, with a slight change of dates from one presentation to the next

In [None]:
assessments_df.loc[assessments_df['code_module']=="FFF"].code_presentation.value_counts()

2013J    13
2014J    13
2014B    13
2013B    13
Name: code_presentation, dtype: int64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="FFF") & (assessments_df['code_presentation']=="2013B")].sort_values('date').date

131     19.0
132     47.0
133     89.0
134    131.0
135    166.0
124    222.0
125    222.0
126    222.0
127    222.0
128    222.0
129    222.0
130    222.0
136    222.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="FFF") & (assessments_df['code_presentation']=="2013J")].sort_values('date').date

144     19.0
145     47.0
146     96.0
147    131.0
148    173.0
137    236.0
138    236.0
139    236.0
140    236.0
141    236.0
142    236.0
143    236.0
149    236.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="FFF") & (assessments_df['code_presentation']=="2014B")].sort_values('date').date

157     24.0
158     52.0
159     87.0
160    129.0
161    171.0
150    227.0
151    227.0
152    227.0
153    227.0
154    227.0
155    227.0
156    227.0
162    227.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="FFF") & (assessments_df['code_presentation']=="2014J")].sort_values('date').date

170     24.0
171     52.0
172     94.0
173    136.0
174    199.0
163    241.0
164    241.0
165    241.0
166    241.0
167    241.0
168    241.0
169    241.0
175    241.0
Name: date, dtype: float64

#### Course Presentation for GGG

* 2013J
* 2014B
* 2014J

Same assessment structure, with a slight change of dates from one presentation to the next

In [None]:
assessments_df.loc[assessments_df['code_module']=="GGG"].code_presentation.value_counts()

2013J    10
2014J    10
2014B    10
Name: code_presentation, dtype: int64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="GGG") & (assessments_df['code_presentation']=="2013J")].sort_values('date').date

182     61.0
183    124.0
184    173.0
176    229.0
177    229.0
178    229.0
179    229.0
180    229.0
181    229.0
185    229.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="GGG") & (assessments_df['code_presentation']=="2014B")].sort_values('date').date

192     61.0
193    117.0
194    166.0
186    222.0
187    222.0
188    222.0
189    222.0
190    222.0
191    222.0
195    222.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['code_module']=="GGG") & (assessments_df['code_presentation']=="2014J")].sort_values('date').date

202     61.0
203    124.0
204    173.0
196    229.0
197    229.0
198    229.0
199    229.0
200    229.0
201    229.0
205    229.0
Name: date, dtype: float64

In [None]:
assessments_df.loc[(assessments_df['assessment_type'] != 'Exam')]

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,AAA,2013J,1752,TMA,19.0,10.0
1,AAA,2013J,1753,TMA,54.0,20.0
2,AAA,2013J,1754,TMA,117.0,20.0
3,AAA,2013J,1755,TMA,166.0,20.0
4,AAA,2013J,1756,TMA,215.0,30.0
...,...,...,...,...,...,...
200,GGG,2014J,37442,CMA,229.0,0.0
201,GGG,2014J,37443,CMA,229.0,0.0
202,GGG,2014J,37435,TMA,61.0,0.0
203,GGG,2014J,37436,TMA,124.0,0.0


### Merge student_info with another dataframe

In [None]:
def merge_df(student_info_df, other_df, on, how='inner'):
  """Join ``student_info_df`` (left side) with ``other_df`` (right side).

  Args:
    student_info_df: left-hand DataFrame of the join.
    other_df: right-hand DataFrame of the join.
    on: column name, or list of names, to join on; forwarded to ``pd.merge``.
    how: join type forwarded to ``pd.merge``; defaults to ``'inner'``.

  Returns:
    The merged DataFrame produced by ``pd.merge``.
  """
  return pd.merge(left=student_info_df, right=other_df, how=how, on=on)

### Join assessment data with individual student assessments

In [None]:
def get_score_data(assessments_df, student_assessments_df, score_deadline):
  """Average each student's non-exam scores for assessments due by a deadline.

  Neither input frame is modified.

  Args:
    assessments_df: assessment metadata; must contain ``id_assessment``,
      ``code_module``, ``code_presentation``, ``assessment_type``, ``date``
      and ``weight``.
    student_assessments_df: one row per submitted assessment; must contain
      ``id_assessment``, ``id_student``, ``is_banked`` and ``date_submitted``
      alongside the numeric columns to average (e.g. ``score``).
    score_deadline: only assessments whose ``date`` is <= this value are used.

  Returns:
    DataFrame indexed by ``(id_student, code_module, code_presentation)``
    holding the mean of every remaining numeric column.
  """
  # `weight` is not needed after the join; drop() returns a new frame, so the
  # caller's assessments_df keeps its column.
  assessment_info = assessments_df.drop(columns=['weight'])

  # merge each student assessment with its assessment metadata
  combined_assessment = pd.merge(student_assessments_df, assessment_info, on='id_assessment')
  combined_assessment = combined_assessment.drop(columns=['is_banked', 'date_submitted'])

  # keep everything that is not the final exam and was due on or before the deadline
  is_not_exam = combined_assessment['assessment_type'] != 'Exam'
  due_by_deadline = combined_assessment['date'] <= score_deadline
  combined_assessment = combined_assessment[is_not_exam & due_by_deadline]

  # numeric_only=True skips the string column `assessment_type`; without it
  # pandas >= 2.0 raises TypeError instead of silently dropping the column.
  grouped_student_assessments = (
      combined_assessment
      .groupby(['id_student', 'code_module', 'code_presentation'])
      .mean(numeric_only=True)
  )

  # the mean of an id or of a due date carries no meaning, so discard both
  return grouped_student_assessments.drop(columns=['id_assessment', 'date'])

### Get the number of clicks by a deadline

In [None]:
# click data
def inner_merge(left_df,right_df,right_cols,on_cols):
    """Inner-join ``left_df`` with a column subset of ``right_df``.

    Args:
        left_df: left-hand DataFrame.
        right_df: right-hand DataFrame; only ``right_cols`` take part.
        right_cols: columns of ``right_df`` to keep (must include ``on_cols``).
        on_cols: column name(s) to join on.

    Returns:
        The joined DataFrame with fully duplicated rows removed.
    """
    joined = left_df.merge(right_df[right_cols], on = on_cols)
    deduplicated = joined.drop_duplicates()
    return deduplicated


def get_click_data(student_vle_df,vle_df,click_deadline):
  """Build per-student click features from VLE interactions up to a deadline.

  Each interaction row is tagged with the ``activity_type`` of its site, rows
  dated after ``click_deadline`` are discarded, and ``sum_click`` is averaged
  per student and activity type. The result is pivoted so that every activity
  type becomes its own column (0 where a student has no rows for that type),
  and a ``mean_click`` column holding the mean across those activity columns
  is appended.

  Args:
    student_vle_df: interaction rows. Assumed to contain ``id_site``,
      ``code_module``, ``code_presentation``, ``id_student``, ``date`` and
      ``sum_click`` -- TODO confirm against the CSV schema.
    vle_df: site metadata providing ``id_site``, ``code_module``,
      ``code_presentation`` and ``activity_type``.
    click_deadline: interactions with ``date`` <= this value are kept.

  Returns:
    DataFrame with one row per (``code_module``, ``code_presentation``,
    ``id_student``), one column per activity type, plus ``mean_click``.
  """

  def clicks_from(clicks, deadline):
    """Return the mean ``sum_click`` per student and activity type up to ``deadline``."""
    temp = clicks[clicks['date'] <= deadline]
    # `date` has served its purpose as a filter and must not be averaged
    temp = temp.drop('date', axis = 1)
    temp = temp.groupby(['code_module','code_presentation','id_student','activity_type']).mean()
    # embed the deadline in the value column's name
    temp = temp.rename(columns = {'sum_click': f'sum_click{deadline} mean'})
    temp = temp.reset_index() # this fills out the missing columns for merging later 
    return temp

  # attach each site's activity_type to the interaction rows
  clicks = inner_merge(student_vle_df,
                        vle_df,
                        ['id_site','code_module','code_presentation','activity_type'],
                        ['id_site','code_module','code_presentation'],
                      )

  # the site id is only a join key; it is not a feature
  clicks = clicks.drop(['id_site'], axis = 1)

  # NOTE(review): disabled alternative that would total the clicks instead of averaging them
  #clicks = clicks.groupby(['code_module','code_presentation','id_student','activity_type']).sum().reset_index()

  # one row per student, one column per activity type; absent combinations become 0
  sum_click_df = pd.pivot_table(data = clicks_from(clicks,click_deadline), 
                              index = ['code_module','code_presentation','id_student'],
                              columns = 'activity_type', 
                              values = [f'sum_click{click_deadline} mean'],
                              fill_value = 0,
                              ).reset_index()

  # get rid of multi index: passing `values` as a list yields two-level columns,
  # so rebuild a flat frame from the key columns plus the activity-type columns
  sum_click_df = pd.concat([sum_click_df['code_module'],
                            sum_click_df['code_presentation'],
                            sum_click_df['id_student'], 
                            sum_click_df[f'sum_click{click_deadline} mean']], axis=1)

  # the pivot already has one row per key, so this mainly moves the keys into
  # the index, leaving only the activity-type columns as data
  temp = sum_click_df.groupby(['code_module','code_presentation','id_student']).mean(numeric_only=True)

  # mean across all activity-type columns (the 0-filled ones included)
  temp2 = pd.DataFrame()
  temp2['mean_click'] = temp.mean(axis=1)
  sum_click_df = pd.merge(sum_click_df,temp2,on = ['id_student','code_module','code_presentation'],how='left')

  return sum_click_df

In [None]:
def tidy_up_dataset(student_all_info):
  """Return a cleaned copy of the combined student table.

  Rows containing any null are removed, ``final_result`` labels are replaced
  by numeric codes (Withdrawn=0, Fail=1, Pass=2, Distinction=3) and withdrawn
  students are dropped. The input frame is left untouched.

  Args:
    student_all_info: DataFrame with a ``final_result`` column.

  Returns:
    A new DataFrame holding only complete rows of non-withdrawn students,
    with ``final_result`` numerically encoded. Labels outside the four known
    ones are passed through unchanged.
  """
  final_result_codes = {'Withdrawn':0,'Fail':1,'Pass':2,'Distinction':3}

  # remove rows with null; dropna() returns a new frame, so the caller's data
  # is not modified
  complete_rows = student_all_info.dropna()

  # replace final result string with numerical representation; an explicit
  # lookup avoids the deprecated silent downcasting of Series.replace while
  # still leaving unknown labels as they are
  encoded_result = complete_rows['final_result'].map(
      lambda label: final_result_codes.get(label, label)
  )
  encoded = complete_rows.assign(final_result=encoded_result)

  # remove withdrawn students
  return encoded[encoded['final_result'] != final_result_codes['Withdrawn']]

In [None]:
import os

# Day offsets at which a snapshot of the combined dataset is produced.
deadline_list = [30,60,90,120,150,180,210,240,270]

# A student can be enrolled in several module presentations, so every join
# has to use the full key rather than `id_student` alone.
student_key_cols = ['id_student','code_module','code_presentation']

for DEADLINE in deadline_list:
  # exist_ok makes the cell safe to re-run without a separate existence check
  os.makedirs(f'{DEADLINE}', exist_ok=True)

  # get_score_data receives a copy so that the shared assessments_df is never altered
  score_df = get_score_data(assessments_df.copy(), student_assessments_df, DEADLINE)
  # score_df is indexed by the student key; turn the index into columns so the
  # join matches module and presentation as well as the student id
  student_all_info = merge_df(student_info_df, score_df.reset_index(), on=student_key_cols)

  sum_click_df = get_click_data(student_vle_df, vle_df, DEADLINE)
  # join the click features onto the frame that already holds the scores
  # (joining onto student_info_df again would discard the score columns)
  student_all_info = merge_df(student_all_info, sum_click_df, on=student_key_cols, how='left')

  student_all_info = tidy_up_dataset(student_all_info)

  student_all_info.to_csv(f'{DEADLINE}/oulad_dataset_combined_{DEADLINE}.csv', index=False)