## Feature Engineering and Model Training Pipeline for Combined Datasets

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../../data/interim/student-full-encoded.csv")
df.head()

Unnamed: 0,absences,studytime,freetime,health,goout,Dalc,Walc,higher_yes,schoolsup_yes,romantic_yes,...,Mjob_other,Mjob_services,Mjob_teacher,reason_home,reason_other,reason_reputation,internet_yes,sex_M,subject_por,at_risk
0,6,2,3,3,4,1,1,True,True,False,...,False,False,False,False,False,False,False,False,False,1.0
1,4,2,3,3,3,1,1,True,False,False,...,False,False,False,False,False,False,True,False,False,1.0
2,10,2,3,3,2,2,3,True,True,False,...,False,False,False,False,True,False,True,False,False,0.0
3,2,3,2,5,2,1,1,True,False,True,...,False,False,False,True,False,False,True,False,False,0.0
4,4,2,3,5,2,1,2,True,False,False,...,True,False,False,True,False,False,False,False,False,0.0


In [3]:
df.info()
df["at_risk"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 662 entries, 0 to 661
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   absences           662 non-null    int64  
 1   studytime          662 non-null    int64  
 2   freetime           662 non-null    int64  
 3   health             662 non-null    int64  
 4   goout              662 non-null    int64  
 5   Dalc               662 non-null    int64  
 6   Walc               662 non-null    int64  
 7   higher_yes         662 non-null    bool   
 8   schoolsup_yes      662 non-null    bool   
 9   romantic_yes       662 non-null    bool   
 10  paid_yes           662 non-null    bool   
 11  guardian_mother    662 non-null    bool   
 12  guardian_other     662 non-null    bool   
 13  Mjob_health        662 non-null    bool   
 14  Mjob_other         662 non-null    bool   
 15  Mjob_services      662 non-null    bool   
 16  Mjob_teacher       662 non

at_risk
0.0    285
1.0    137
Name: count, dtype: int64

### Binning Numeric Values

In [4]:
df["absences_bin"] = pd.cut(df["absences"], bins=[-1, 0, 3, 10, 50], labels=["none", "low", "medium", "high"])
df["goout_binned"] = pd.cut(df["goout"], bins=[-1, 2, 4, 5], labels=["low", "medium", "high"])

Alcohol Score

In [5]:
df["alc_total"] = df["Dalc"] + df["Walc"]

In [6]:
df["alc_level"] = pd.cut(df["alc_total"], bins=[0, 3, 5, 10], labels=["low", "moderate", "high"])

Interaction features

In [7]:
df["study_absence_ratio"] = df["studytime"] / (df["absences"] + 1)
df["alc_goout"] = df["alc_total"] * df["goout"]

### One hot Encoding Categorical Variables

In [9]:
df = pd.get_dummies(df, columns=["absences_bin", "goout_binned", "alc_level"], drop_first=True)

In [10]:
df.head()

Unnamed: 0,absences,studytime,freetime,health,goout,Dalc,Walc,higher_yes,schoolsup_yes,romantic_yes,...,alc_total,study_absence_ratio,alc_goout,absences_bin_low,absences_bin_medium,absences_bin_high,goout_binned_medium,goout_binned_high,alc_level_moderate,alc_level_high
0,6,2,3,3,4,1,1,True,True,False,...,2,0.285714,8,False,True,False,True,False,False,False
1,4,2,3,3,3,1,1,True,False,False,...,2,0.4,6,False,True,False,True,False,False,False
2,10,2,3,3,2,2,3,True,True,False,...,5,0.181818,10,False,True,False,False,False,True,False
3,2,3,2,5,2,1,1,True,False,True,...,2,1.0,4,True,False,False,False,False,False,False
4,4,2,3,5,2,1,2,True,False,False,...,3,0.4,6,False,True,False,False,False,False,False


### Save the engineered dataset

In [11]:
df.to_csv("../../data/interim/student-final-engineered.csv", index=False)