# <center> Pre-processing and training</center>

## Import packages

In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split

## Load the data

We load the dataset produced by the previous steps; `df_student` contains the cleaned original dataset.

In [160]:
# Read the cleaned student dataset produced by the previous steps into a DataFrame.
df_student = pd.read_csv('./data/data_student_cleaned.csv')

We call the `info` method to see a summary of the data:

In [162]:
# Print each column's name, non-null count and dtype to see which columns still need encoding.
df_student.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 34 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      1038 non-null   object
 1   sex         1038 non-null   object
 2   age         1038 non-null   int64 
 3   address     1038 non-null   object
 4   famsize     1038 non-null   object
 5   Pstatus     1038 non-null   object
 6   Medu        1038 non-null   int64 
 7   Fedu        1038 non-null   int64 
 8   Mjob        1038 non-null   object
 9   Fjob        1038 non-null   object
 10  reason      1038 non-null   object
 11  guardian    1038 non-null   object
 12  traveltime  1038 non-null   int64 
 13  studytime   1038 non-null   int64 
 14  failures    1038 non-null   int64 
 15  schoolsup   1038 non-null   object
 16  famsup      1038 non-null   object
 17  paid        1038 non-null   object
 18  activities  1038 non-null   object
 19  nursery     1038 non-null   object
 20  higher  

## Encoding categorical data

Some ordinal features are already encoded as numeric:

- Medu: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education
- Fedu: 0 - none, 1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education
- traveltime: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour
- studytime: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours
- famrel: from 1 - very bad to 5 - excellent
- freetime: from 1 - very low to 5 - very high
- goout: from 1 - very low to 5 - very high
- Dalc: from 1 - very low to 5 - very high
- Walc: from 1 - very low to 5 - very high
- health: from 1 - very bad to 5 - very good

Let's extract the remaining categorical features

In [164]:
# Keep only the columns pandas still stores with dtype `object`;
# these are the categorical features that have not been encoded yet.
object_dtypes = ['object']
df_feature_obj = df_student.select_dtypes(include=object_dtypes)

# Preview the first rows of the selected columns.
df_feature_obj.head()

Unnamed: 0,school,sex,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,course
0,GP,F,U,GT3,A,at_home,teacher,course,mother,yes,no,no,no,yes,yes,no,no,math
1,GP,F,U,GT3,T,at_home,other,course,father,no,yes,no,no,no,yes,yes,no,math
2,GP,F,U,LE3,T,at_home,other,other,mother,yes,no,yes,no,yes,yes,yes,no,math
3,GP,F,U,GT3,T,health,services,home,mother,no,yes,yes,yes,yes,yes,yes,yes,math
4,GP,F,U,GT3,T,other,other,home,father,no,yes,yes,no,yes,yes,no,no,math


Yes/No variables typed as object are converted to binary 0/1 integer variables.

In [166]:
bool_features = ['schoolsup','famsup','paid','activities','nursery','higher','internet','romantic']

# Encode the yes/no answers as 1/0.
# replace() on object columns only yields an integer dtype through pandas'
# silent downcasting, which is deprecated (FutureWarning in pandas 2.2); without
# it the columns would stay `object`. The explicit cast pins the dtype to int64
# and raises if a column contains anything other than the expected yes/no values.
yes_no_codes = {'yes': 1, 'no': 0}
df_student[bool_features] = (
    df_student[bool_features]
    .replace(yes_no_codes)
    .astype('int64')
)
df_student[bool_features].head()

Unnamed: 0,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,1,0,0,0,1,1,0,0
1,0,1,0,0,0,1,1,0
2,1,0,1,0,1,1,1,0
3,0,1,1,1,1,1,1,1
4,0,1,1,0,1,1,0,0


Features with only 2 levels are present in the df_student DataFrame:


- sex: student's sex ('F' - female or 'M' - male)
- school: student's school ('GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)
- address: student's home address type ('U' - urban or 'R' - rural)
- famsize: family size ('LE3' - less or equal to 3 or 'GT3' - greater than 3)
- Pstatus: parent's cohabitation status ('T' - living together or 'A' - apart)
- course: course ('math' or 'portuguese')

We also encode these variables using 0/1 values.

In [168]:
# Two-level categorical columns to turn into a single 0/1 indicator each.
cols = ["sex", "address", "Pstatus", "famsize", "school", "course"]

# drop_first=True drops the first level of every column, leaving one indicator
# column per feature when the feature has exactly two levels.
binary_dummies = pd.get_dummies(df_student[cols], drop_first=True)

# The indicator columns carry generated names, so they are written back over the
# original columns by position (same order as `cols`).
# NOTE(review): this relies on every column in `cols` having exactly two levels
# in the data; otherwise the column counts would not line up.
df_student[cols] = binary_dummies.astype(int)
df_student.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,course
0,0,0,18,1,0,0,4,4,at_home,teacher,...,3,4,1,1,3,6,5,6,6,0
1,0,0,17,1,0,1,1,1,at_home,other,...,3,3,1,1,3,4,5,5,6,0
2,0,0,15,1,1,1,1,1,at_home,other,...,3,2,2,3,3,10,7,8,10,0
3,0,0,15,1,0,1,4,2,health,services,...,2,2,1,1,5,2,15,14,15,0
4,0,0,16,1,0,1,3,3,other,other,...,3,2,1,2,5,4,6,10,10,0


In [169]:
# Inspect the dtypes after the 0/1 encoding; columns still listed as `object` have not been encoded yet.
df_student.dtypes

school         int32
sex            int32
age            int64
address        int32
famsize        int32
Pstatus        int32
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup      int64
famsup         int64
paid           int64
activities     int64
nursery        int64
higher         int64
internet       int64
romantic       int64
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
course         int32
dtype: object

Four remaining object variables should be encoded:

- Mjob: mother's job ('teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
- Fjob: father's job ('teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
- reason: reason to choose this school (close to 'home', school 'reputation', 'course' preference or 'other')
- guardian: student's guardian ('mother', 'father' or 'other')

In [171]:
# Nominal features with more than two levels: one-hot encode them.
nominals = ["Mjob", "Fjob", "reason", "guardian"]

# One 0/1 column per level, named `<feature>_<level>`; the first level of each
# feature is dropped (drop_first=True) and acts as the reference category.
dummy_variable = pd.get_dummies(
    df_student[nominals],
    prefix=nominals,
    drop_first=True,
).astype(int)

# Swap the original string columns for their indicator columns, which are
# appended after the remaining columns.
df_student = pd.concat(
    [df_student.drop(columns=nominals), dummy_variable],
    axis=1,
)
df_student.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,...,Mjob_teacher,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_home,reason_other,reason_reputation,guardian_mother,guardian_other
0,0,0,18,1,0,0,4,4,2,2,...,0,0,0,0,1,0,0,0,1,0
1,0,0,17,1,0,1,1,1,1,2,...,0,0,1,0,0,0,0,0,0,0
2,0,0,15,1,1,1,1,1,1,2,...,0,0,1,0,0,0,1,0,1,0
3,0,0,15,1,0,1,4,2,1,3,...,0,0,0,1,0,1,0,0,1,0
4,0,0,16,1,0,1,3,3,1,2,...,0,0,1,0,0,1,0,0,0,0


In [172]:
# Check the dtypes again to see whether any non-numeric column remains after one-hot encoding.
df_student.dtypes

school               int32
sex                  int32
age                  int64
address              int32
famsize              int32
Pstatus              int32
Medu                 int64
Fedu                 int64
traveltime           int64
studytime            int64
failures             int64
schoolsup            int64
famsup               int64
paid                 int64
activities           int64
nursery              int64
higher               int64
internet             int64
romantic             int64
famrel               int64
freetime             int64
goout                int64
Dalc                 int64
Walc                 int64
health               int64
absences             int64
G1                   int64
G2                   int64
G3                   int64
course               int32
Mjob_health          int32
Mjob_other           int32
Mjob_services        int32
Mjob_teacher         int32
Fjob_health          int32
Fjob_other           int32
Fjob_services        int32
F

## Feature engineering

We propose to add another feature representing the rate of improvement or decline in grades between the first-period grade (G1) and the second-period grade (G2).
This rate is defined as $(G2 - G1)/G1$.


In [174]:
# Relative change from G1 to G2, rounded to two decimals.
# A G1 of 0 makes the ratio undefined: plain division would produce +/-inf
# (or NaN for 0/0), and those values would end up in the saved dataset.
# Zero denominators are therefore masked to NaN before dividing, and the
# undefined results are given a neutral trend of 0.0.
# NOTE(review): 0.0 is a modelling choice for rows with G1 == 0 - revisit if
# those rows should rather be dropped or imputed differently.
first_grade = df_student['G1'].where(df_student['G1'] != 0)  # NaN where G1 == 0
df_student['grade_trend'] = (
    (df_student['G2'] - df_student['G1'])
    .div(first_grade)
    .round(2)
    .fillna(0.0)
)

# G1 and G2 are now summarised by grade_trend, so the raw columns are removed.
df_student = df_student.drop(columns=['G1', 'G2'])
df_student.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,traveltime,studytime,...,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,reason_home,reason_other,reason_reputation,guardian_mother,guardian_other,grade_trend
0,0,0,18,1,0,0,4,4,2,2,...,0,0,0,1,0,0,0,1,0,0.2
1,0,0,17,1,0,1,1,1,1,2,...,0,1,0,0,0,0,0,0,0,0.0
2,0,0,15,1,1,1,1,1,1,2,...,0,1,0,0,0,1,0,1,0,0.14
3,0,0,15,1,0,1,4,2,1,3,...,0,0,1,0,1,0,0,1,0,-0.07
4,0,0,16,1,0,1,3,3,1,2,...,0,1,0,0,1,0,0,0,0,0.67


## Train/Test Split

In [176]:
# The final grade G3 is the target; every other column is a feature.
y = df_student['G3']
X = df_student.drop(columns='G3')

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=47,
)

## Scale the data

In [178]:
# Columns that get standardised (zero mean, unit variance) ...
numeric_features = ['age','failures','absences']
# ... and ordinal scales that get min-max scaled.
ordinal_features = ['Medu', 'Fedu', 'traveltime', 'studytime', 'famrel','freetime', 'goout', 'Dalc', 'Walc', 'health']

# One scaler per feature group.
scaler_std = StandardScaler()
scaler_mm = MinMaxScaler()

# Learn the scaling parameters from the training rows only and apply them,
# so no information from the test set leaks into the scalers.
X_train_scaled_num = scaler_std.fit_transform(X_train[numeric_features])
X_train_scaled_ord = scaler_mm.fit_transform(X_train[ordinal_features])
X_train_scaled = np.hstack((X_train_scaled_num, X_train_scaled_ord))

# Apply the already-fitted scalers to the test rows.
X_test_scaled_num = scaler_std.transform(X_test[numeric_features])
X_test_scaled_ord = scaler_mm.transform(X_test[ordinal_features])
X_test_scaled = np.hstack((X_test_scaled_num, X_test_scaled_ord))

# NOTE(review): X_train_scaled / X_test_scaled hold only the 13 columns listed
# above (numeric first, then ordinal). The 0/1 encoded columns and grade_trend
# are not part of these arrays - confirm this is intended before modelling.

## Save the new DataFrame

In [180]:
# save the data to a new csv file
# Save the encoded and feature-engineered data to its own csv file.
# Writing to './data/data_student_cleaned.csv' would overwrite the input that
# is read at the top of this notebook, so a second Restart & Run All would start
# from already-encoded data and fail on the missing nominal columns.
# Later steps should read the pre-processed data from this file.
df_student.to_csv('./data/data_student_preprocessed.csv',index=False)