Importamos las librerías necesarias:

In [33]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import set_config
set_config(display="diagram")

Leemos el CSV:

In [34]:
# Load the student-performance dataset from the notebook's working directory.
df = pd.read_csv("student_performance.csv")
print("Tamaño del dataset:", df.shape)

# Preview the first rows using the notebook's rich table rendering.
print("\nVista general del dataset:")
display(df.head())

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own: wrapping it in print() appends a stray "None"
# line after the report.
print("\nInformacion general del dataset:")
df.info()

Tamaño del dataset: (1044, 17)

Vista general del dataset:


Unnamed: 0,sex,age,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,traveltime,studytime,failures,internet,romantic,goout,Walc,health,approved
0,F,18,GT3,A,4,4,at_home,teacher,2,2,0,no,no,4,1,3,0
1,F,17,GT3,T,1,1,at_home,other,1,2,0,yes,no,3,1,3,0
2,F,15,LE3,T,1,1,at_home,other,1,2,3,yes,no,2,3,3,1
3,F,15,GT3,T,4,2,health,services,1,3,0,yes,yes,2,1,5,1
4,F,16,GT3,T,3,3,other,other,1,2,0,no,no,2,2,5,1



Informacion general del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sex         1044 non-null   object
 1   age         1044 non-null   int64 
 2   famsize     1044 non-null   object
 3   Pstatus     1044 non-null   object
 4   Medu        1044 non-null   int64 
 5   Fedu        1044 non-null   int64 
 6   Mjob        1044 non-null   object
 7   Fjob        1044 non-null   object
 8   traveltime  1044 non-null   int64 
 9   studytime   1044 non-null   int64 
 10  failures    1044 non-null   int64 
 11  internet    1044 non-null   object
 12  romantic    1044 non-null   object
 13  goout       1044 non-null   int64 
 14  Walc        1044 non-null   int64 
 15  health      1044 non-null   int64 
 16  approved    1044 non-null   int64 
dtypes: int64(10), object(7)
memory usage: 138.8+ KB
None


Se separan los datos de forma estratificada según la variable objetivo: 80% para entrenar y 20% para probar.

In [35]:
# The label column; every remaining column is used as a feature.
target_col = "approved"
y = df[target_col]
X = df.drop(columns=target_col)

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# repeatable, and stratify=y asks train_test_split to preserve the class
# balance of the target in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.20,
)
print(f"Entrenamiento: {X_train.shape}, Prueba: {X_test.shape}")

Entrenamiento: (835, 16), Prueba: (209, 16)


Identificamos las columnas numéricas y categóricas:

In [36]:
# Partition the feature names by dtype so that each group can be routed to
# its own preprocessing branch: int64/float64 columns are treated as numeric,
# object/category columns as categorical.
num_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)

# Show both lists so the assignment can be checked by eye.
print("Numéricas:", num_cols)
print("Categóricas:", cat_cols)

Numéricas: ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'goout', 'Walc', 'health']
Categóricas: ['sex', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'internet', 'romantic']


Preprocesamiento:

In [37]:
# Numeric branch: standardise each numeric column.
numeric_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encode into a dense array. With
# handle_unknown="ignore", a category not seen during fit does not raise at
# transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Route each column group to its branch; remainder="drop" discards any column
# that is listed in neither num_cols nor cat_cols.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ],
    remainder="drop",
)

Se obtienen 5 árboles de decisión con max_depth de 2, 4, 6, 8 y 10, con criterio "gini"

In [38]:
# Tree depths to compare, and the list that collects one record per depth.
depths = [2, 4, 6, 8, 10]
results_gini = []

for d in depths:
    # Only max_depth changes between runs; criterion and seed stay fixed.
    clf = DecisionTreeClassifier(max_depth=d, criterion="gini", random_state=42)
    # The same `preprocessor` object is plugged into every pipeline, so each
    # fit below refits it on the training data.
    pipe = Pipeline(steps=[("preproc", preprocessor), ("clf", clf)])
    # Fit on the training partition, then score on the held-out test rows.
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results_gini.append(
        {
            "criterion": "gini",
            "max_depth": d,
            "accuracy": acc,
        }
    )
    print(f"[gini] max_depth={d} -> Accuracy: {acc:.4f}")

# Summarise the sweep as a table, one row per depth in ascending order.
df_gini = pd.DataFrame(results_gini)
print("\nTabla de accuracies (criterio=\'gini\'):")
display(df_gini.sort_values("max_depth").reset_index(drop=True))

[gini] max_depth=2 -> Accuracy: 0.7799
[gini] max_depth=4 -> Accuracy: 0.7751
[gini] max_depth=6 -> Accuracy: 0.7799
[gini] max_depth=8 -> Accuracy: 0.7608
[gini] max_depth=10 -> Accuracy: 0.7129

Tabla de accuracies (criterio='gini'):


Unnamed: 0,criterion,max_depth,accuracy
0,gini,2,0.779904
1,gini,4,0.77512
2,gini,6,0.779904
3,gini,8,0.760766
4,gini,10,0.712919
