In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import Helpers as hlp

In [None]:
# load data


### Helper Functions

In [8]:
# function for rounding a float to nearest 0.25
# input: float
# output: a variable rounded to the nearest grade according to the Swiss grading scale

def round_to_nearest_grade(x):
    return round(x * 4) / 4

In [10]:
# obtain the corresponding data frames (df) from a given (index_list)

def get_student_data(index_list, df):
    test_df = pd.DataFrame()
    for i in index_list:
        df_temp = df[df['StudentID'] == i]
        test_df = pd.concat([test_df, df_temp])
    return test_df


In [1]:
# function needs to take as an input the dataframe (df) where entries need to be removed
# the number of entries to be removed, needs to be negative 

def remove_entries(df, x):
    prep_df = pd.DataFrame()
    index = df['StudentID'].unique()
    for i in index:
        df_temp = df[df['StudentID'] == i ]
        df_temp.iloc[x:,[3]] = np.NaN
        prep_df = pd.concat([prep_df, df_temp])
    return prep_df


### 1) Splitting into training and test sets

The challenge of splitting the data into training and test sets for the different learning algorithms is to maintain student-course integrity. The pre-implemented method *'train_test_split'* from scikit learn will not necessarily seperate the data such that the course selection per student will remain complete. This is solved by shuffleing and splitting only the 'unique_index_list' and then obtain the data and merge it into a dataframe.

In [None]:
unique_index_list = df['StudentID'].unique()

In [26]:
train_students, test_students = train_test_split(unique_index_list, test_size = 0.2, shuffle = True, random_state = 32)

In [27]:
len(train_students) #248 students for training the different recommendation approaches

248

In [28]:
len(test_students) #62 students for testing the recommendation results

62

In [29]:
train_df = hlp.get_student_data(train_students, df_clean)

In [30]:
train_df.head() #merged all indices that where randomly assigend to be training subjects to a dataframe

Unnamed: 0,StudentID,CourseID,Title,Grades,Grading_Schema
932,77163287,T107004,Introduction aux méthodes de la science politique,5.0,graded
933,77163287,T107008,Séminaire d'introduction aux méthodes de la sc...,4.75,passed
934,77163287,T107005,Introduction à la science politique,4.0,graded
935,77163287,T_J2D034,Droit constitutionnel I,4.25,graded
936,77163287,T107009,Séminaire d'introduction à la science politique,4.75,passed


In [31]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7893 entries, 932 to 7097
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   StudentID       7893 non-null   int64  
 1   CourseID        7893 non-null   object 
 2   Title           7893 non-null   object 
 3   Grades          7893 non-null   float64
 4   Grading_Schema  7893 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 370.0+ KB


In [32]:
len(train_df['StudentID'].unique()) 
# validating that the number of unique indices is the same as the number of training subjects assigned

248

In [33]:
test_df = hlp.get_student_data(test_students, df_clean)

In [34]:
len(test_df['StudentID'].unique())
# validating that the number of unique indices is the same as the number of test subjects assigned

62

In [35]:
train_df.to_csv('obtained_data/train_df.csv', index = False) #save the state of the training data

In [36]:
test_df.to_csv('obtained_data/true_test_df.csv', index = False) #save the state of true test data

# ONLY DO LAST 6
### 3) Prepping the test subjects for offline evaluation

In order to evaluate the different approaches offline, the true test set from the previous step is taken and some entries are removed. As the course selection per students starts with the first course they took and ends with the last course they took before graduating, hence the time progression of the students course selection is respected, one can approximate the in which semester of a test subject might be in, depending on how many courses have already been taken. <br>
<br>
The unique index list of the test set is first shuffled and split in half, one half will have only 6 entries removed, which will represent the recommendations for the last semester and the other half will have 12 entries removed to represent students that have just entered the third semester. This way one can test how well the mandatory workload is captured by the respective algorithms. Recommendations obtained are compared to the true course selection at a later step.

In [37]:
test_history_true = test_df.copy()  
test_ids = test_history_true['StudentID'].unique()
len(test_ids)

62

In [38]:
# split the test data 50/50
test_last_sem, test_third_sem = train_test_split(test_ids, test_size = 0.5, shuffle = True, random_state = 42)

In [39]:
test_df_last_sem = get_student_data(test_last_sem, test_history_true)
len(test_df_last_sem['StudentID'].unique())
# obtain student data for removing 6 entries

31

In [40]:
test_df_third_sem = get_student_data(test_third_sem, test_history_true)
len(test_df_third_sem['StudentID'].unique())
# obtain student data for removing 12 entries 

31

In [41]:
n_entries = -12
n_entries2 = -6
test_df_third_sem_NaN = remove_entries(test_df_third_sem , n_entries)
test_df_last_sem_NaN = remove_entries(test_df_last_sem, n_entries2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

In [42]:
test_df_last_sem_NaN.tail(7) #last 6 are removed

Unnamed: 0,StudentID,CourseID,Title,Grades,Grading_Schema
5605,7710809,T207039,Politiques internationales de l'environnement,5.0,graded
5606,7710809,T205011,Communication et langage,,graded
5607,7710809,T205023,Stratification et mobilité sociale,,graded
5608,7710809,T205004,Sociologie de la consommation,,graded
5609,7710809,T205019,Migrations et relations à l'altérité,,graded
5610,7710809,T208014,Histoire économique et sociale des pays en voi...,,graded
5611,7710809,T_J2P232,Politique extérieure de la Suisse II,,graded


In [43]:
test_df_third_sem_NaN.tail(13) # last 12 courses are removed

Unnamed: 0,StudentID,CourseID,Title,Grades,Grading_Schema
4807,77380063,T207000,Séminaire de préparation du PdR,5.0,passed
4808,77380063,T207033,Projet de recherche en sciences politiques,,graded
4809,77380063,T207012,Comportement électoral,,graded
4810,77380063,T207035,Mobilisation politique,,passed
4811,77380063,T207037,Politique suisse,,graded
4812,77380063,T207036,Politique comparée,,graded
4813,77380063,T207034,Administration et politiques publiques,,graded
4814,77380063,T207059,Politique extérieure de la Suisse,,graded
4815,77380063,T207038,Théorie politique,,graded
4816,77380063,T205000,Sociologie des organisations,,graded


In [44]:
test_set_complete = pd.concat([test_df_third_sem_NaN,test_df_last_sem_NaN])
len(test_set_complete['StudentID'].unique())

62

In [45]:
test_set_complete.to_csv('obtained_data/complete_test_set_entries_removed.csv', index = False) #saving state of the complete test set

In [46]:
test_df_third_sem_NaN.to_csv('obtained_data/test_df_third_sem_NaN.csv', index = False) 
test_df_last_sem_NaN.to_csv('obtained_data/test_df_last_sem_NaN.csv', index = False)
# saving states of test subjects individually