In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('student-por.csv', sep=';')

In [4]:
df.shape

(649, 33)

In [5]:
display(df.head())

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


#### Preprocess

In [6]:
target_column = 'G3'
X = df.drop(columns=[target_column, "address", "G1", "G2"])
y = (df[target_column] >= 10).astype(int)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [8]:
# import matplotlib.pyplot as plt

# df.hist(figsize=(20, 15))
# plt.show()

In [9]:
# import seaborn as sns
# sns.boxplot(data=df, x="freetime", y="G3")

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

category_columns = ["sex", "Mjob", "Fjob", "reason","guardian", "schoolsup", "famsup", "paid","activities", "nursery", "higher", "internet", "romantic", ]
scaling_columns = ["age", "Medu", "Fedu", "traveltime", "studytime", "failures", "famrel", "freetime", "goout", "Dalc", "health", "absences",]

transformer = ColumnTransformer([
    ("new", OneHotEncoder(handle_unknown="ignore", sparse_output=False), category_columns),
    ("scaler", StandardScaler(), scaling_columns)
])

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
pipeline = Pipeline([
    ("transformer", transformer),
    ("classifier", DecisionTreeClassifier())
])



In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "classifier": [DecisionTreeClassifier()],
        "classifier__max_depth": [3, 5, 10, 15, None],
        "classifier__min_samples_leaf": [1, 2, 5, 10]
    },
    {
        "classifier": [RandomForestClassifier()],
        "classifier__n_estimators": [10, 50, 100],
        "classifier__min_samples_leaf": [1, 2, 5, 10]
    },
    {
        "classifier": [GaussianNB()],
        "classifier__var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6]
    }
]

In [26]:
model_search = GridSearchCV(pipeline, param_grid, return_train_score=True, n_jobs=-1)
model_search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...lassifier())])
,param_grid,"[{'classifier': [DecisionTreeClassifier()], 'classifier__max_depth': [3, 5, ...], 'classifier__min_samples_leaf': [1, 2, ...]}, {'classifier': [RandomForestClassifier()], 'classifier__min_samples_leaf': [1, 2, ...], 'classifier__n_estimators': [10, 50, ...]}, ...]"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,transformers,"[('new', ...), ('scaler', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
model_search.best_params_

{'classifier': RandomForestClassifier(),
 'classifier__min_samples_leaf': 5,
 'classifier__n_estimators': 50}

In [28]:
model_search.best_score_

np.float64(0.851046271735927)