In [15]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [16]:
# Load the raw CSV; this file uses ';' rather than ',' as its field separator.
df = pd.read_csv(
    "../data/student-mat.csv",
    sep=";",
)

# Quick sanity check: frame dimensions plus the first few rows.
(df.shape, df.head())


((395, 33),
   school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  ...  \
 0     GP   F   18       U     GT3       A     4     4  at_home   teacher  ...   
 1     GP   F   17       U     GT3       T     1     1  at_home     other  ...   
 2     GP   F   15       U     LE3       T     1     1  at_home     other  ...   
 3     GP   F   15       U     GT3       T     4     2   health  services  ...   
 4     GP   F   16       U     GT3       T     3     3    other     other  ...   
 
   famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
 0      4        3      4     1     1      3        6   5   6   6  
 1      5        3      3     1     1      3        4   5   5   6  
 2      4        3      2     2     3      3       10   7   8  10  
 3      3        2      2     1     1      5        2  15  14  15  
 4      4        3      2     1     2      5        4   6  10  10  
 
 [5 rows x 33 columns])

In [17]:
# One regression target per grade column, taken straight from the frame.
y1 = df.loc[:, "G1"]
y2 = df.loc[:, "G2"]
# G3 is divided by 20 and then multiplied by 10 (a net factor of 0.5),
# presumably mapping a 0-20 grade onto 0-10 -- TODO confirm.
# G1 and G2 are left on their original scale.
y3 = df["G3"].div(20).mul(10)

# Peek at the first rows of every target.
tuple(target.head() for target in (y1, y2, y3))


(0     5
 1     5
 2     7
 3    15
 4     6
 Name: G1, dtype: int64,
 0     6
 1     5
 2     8
 3    14
 4    10
 Name: G2, dtype: int64,
 0    3.0
 1    3.0
 2    5.0
 3    7.5
 4    5.0
 Name: G3, dtype: float64)

In [18]:
# Feature frames for the three stages: every later stage keeps the grade
# columns that precede its own target.
X_base = df.drop(["G1", "G2", "G3"], axis="columns")  # no grade columns
X_g1 = df.drop(["G2", "G3"], axis="columns")  # keeps G1
X_g2 = df.drop(["G3"], axis="columns")  # keeps G1 and G2

(X_base.shape, X_g1.shape, X_g2.shape)

((395, 30), (395, 31), (395, 32))

In [19]:
# Columns that are one-hot encoded by the preprocessors built from this list.
categorical_cols = [
    "school",
    "sex",
    "address",
    "famsize",
    "Pstatus",
    "Mjob",
    "Fjob",
    "reason",
    "guardian",
    "schoolsup",
    "famsup",
    "paid",
    "activities",
    "nursery",
    "higher",
    "internet",
    "romantic",
]

In [20]:
# Numeric columns used by the first stage (no grade columns among them).
numeric_cols_stage1 = [
    "age", "Medu", "Fedu",
    "traveltime", "studytime", "failures",
    "famrel", "freetime", "goout",
    "Dalc", "Walc", "health", "absences",
]

# One-hot encode the categorical columns (categories not seen during fit are
# tolerated rather than raising) and forward the numeric columns untouched.
preprocessor_stage1 = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols_stage1),
    ]
)

In [21]:
# Stage 2 uses the stage-1 numeric columns plus G1.
numeric_cols_stage2 = [*numeric_cols_stage1, "G1"]

# Same encoding scheme as stage 1, with the extended numeric column list.
preprocessor_stage2 = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols_stage2),
    ]
)

In [22]:
# Stage 3 uses the stage-2 numeric columns (which include G1) plus G2.
numeric_cols_stage3 = [*numeric_cols_stage2, "G2"]

# Same encoding scheme again, now with both earlier grades passed through.
preprocessor_stage3 = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols_stage3),
    ]
)

In [23]:
# Every stage is split with the same settings: 20% held out, fixed seed.
# All three feature frames come from the same `df`, so with an identical seed
# the splits should select the same rows for each stage.
split_options = {"test_size": 0.2, "random_state": 42}

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X_base, y1, **split_options
)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X_g1, y2, **split_options
)
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X_g2, y3, **split_options
)

In [24]:
# Stage-1 model: stage-1 preprocessing followed by a linear regression,
# trained on the stage-1 training split to predict y1.
g1_model = Pipeline(
    steps=[
        ("prep", preprocessor_stage1),
        ("model", LinearRegression()),
    ]
).fit(X1_train, y1_train)

# fit() returns the pipeline itself, so this displays the fitted estimator.
g1_model


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [25]:
# Stage-2 model: stage-2 preprocessing (which passes G1 through) followed by
# a linear regression, trained on the stage-2 training split to predict y2.
g2_model = Pipeline(
    steps=[
        ("prep", preprocessor_stage2),
        ("model", LinearRegression()),
    ]
).fit(X2_train, y2_train)

# fit() returns the pipeline itself, so this displays the fitted estimator.
g2_model


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [26]:
# Stage-3 model: stage-3 preprocessing (which passes G1 and G2 through)
# followed by a linear regression, trained on the stage-3 training split to
# predict y3.
g3_model = Pipeline(
    steps=[
        ("prep", preprocessor_stage3),
        ("model", LinearRegression()),
    ]
).fit(X3_train, y3_train)

# fit() returns the pipeline itself, so this displays the fitted estimator.
g3_model


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [27]:
# Score the stage-3 model on its held-out split. Predict once and reuse the
# result for both metrics instead of running the pipeline twice.
y3_pred = g3_model.predict(X3_test)

# Displayed as (mean absolute error, R^2), both measured on the scale of y3.
mean_absolute_error(y3_test, y3_pred), r2_score(y3_test, y3_pred)

(0.8233328098573761, 0.7241341236974019)

In [28]:
import os
import pickle

# Persist each fitted pipeline as a pickle under ../models, creating the
# directory first if it does not exist yet.
os.makedirs("../models", exist_ok=True)

for model_name, fitted_model in (
    ("g1_model", g1_model),
    ("g2_model", g2_model),
    ("g3_model", g3_model),
):
    # The context manager closes (and thereby flushes) the file even if
    # pickle.dump raises; a bare open(...) would leave the handle dangling.
    with open(f"../models/{model_name}.pkl", "wb") as model_file:
        pickle.dump(fitted_model, model_file)
