In [1]:
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display, Markdown

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    ConfusionMatrixDisplay, RocCurveDisplay
)
from sklearn.inspection import permutation_importance


In [2]:
display(Markdown("# HW06 — Деревья решений и ансамбли"))


# HW06 — Деревья решений и ансамбли

In [4]:
DATA_PATH = "S06-hw-dataset-04.csv"

df = pd.read_csv("S06-hw-dataset-04.csv")

display(Markdown("## Данные: первичный анализ"))
display(df.head())
df.info()


## Данные: первичный анализ

Unnamed: 0,id,f01,f02,f03,f04,f05,f06,f07,f08,f09,...,f52,f53,f54,f55,f56,f57,f58,f59,f60,target
0,1,-1.25021,1.423474,-0.225004,-4.023138,-0.832729,-0.550874,1.77209,2.76169,-0.69875,...,10.938269,0.501178,1.600001,0.314212,1.209735,1.355697,-5.338924,1.153944,-0.153934,0
1,2,0.074328,0.376429,0.212831,-0.502074,2.017405,0.625496,1.943785,1.24203,-0.52409,...,7.775262,-4.550195,6.272586,-0.932162,-0.228543,1.73522,-3.827828,0.292165,0.27372,0
2,3,0.638481,0.060968,0.74676,2.479653,-0.292858,-0.078139,-2.918423,-0.013186,1.009135,...,-4.448447,-9.593179,-3.093519,0.029321,0.605511,0.829103,-0.085985,2.891408,0.766221,0
3,4,1.712916,-1.350969,-0.256473,1.622074,-0.445141,0.911932,-3.440345,1.505192,-1.104348,...,-1.619072,-3.237479,-5.474038,-1.582475,0.198137,3.823409,0.880395,1.14861,0.136732,0
4,5,0.905676,-0.206545,-0.068806,4.086026,-1.010045,-0.772644,-4.207688,2.506104,1.589143,...,-2.396844,-10.540129,-5.532811,-1.231203,0.000119,4.298572,-1.558235,0.924673,0.111668,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 62 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      25000 non-null  int64  
 1   f01     25000 non-null  float64
 2   f02     25000 non-null  float64
 3   f03     25000 non-null  float64
 4   f04     25000 non-null  float64
 5   f05     25000 non-null  float64
 6   f06     25000 non-null  float64
 7   f07     25000 non-null  float64
 8   f08     25000 non-null  float64
 9   f09     25000 non-null  float64
 10  f10     25000 non-null  float64
 11  f11     25000 non-null  float64
 12  f12     25000 non-null  float64
 13  f13     25000 non-null  float64
 14  f14     25000 non-null  float64
 15  f15     25000 non-null  float64
 16  f16     25000 non-null  float64
 17  f17     25000 non-null  float64
 18  f18     25000 non-null  float64
 19  f19     25000 non-null  float64
 20  f20     25000 non-null  float64
 21  f21     25000 non-null  float64
 22

In [5]:
display(Markdown("### Базовые статистики"))
df.describe()


### Базовые статистики

Unnamed: 0,id,f01,f02,f03,f04,f05,f06,f07,f08,f09,...,f52,f53,f54,f55,f56,f57,f58,f59,f60,target
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,...,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,12500.5,-0.000386,-0.004872,0.003202,0.335329,-0.000563,-0.010118,0.001182,0.957385,-0.004658,...,-0.166251,-3.387506,1.749793,-0.013017,-0.001383,0.893365,-0.909479,0.00057,-0.000754,0.0492
std,7217.022701,1.001623,0.995606,1.004367,3.207537,0.993965,1.002172,2.432162,1.830223,1.01347,...,4.234741,4.331576,5.31866,1.001594,0.996409,2.445185,1.962618,0.99432,0.997167,0.21629
min,1.0,-4.370993,-4.087073,-4.103875,-13.249937,-4.118778,-3.895974,-8.883224,-8.132548,-4.068933,...,-20.021141,-18.33229,-20.336666,-4.349216,-4.119472,-9.508509,-7.919287,-4.038312,-3.812255,0.0
25%,6250.75,-0.680165,-0.6751,-0.675426,-1.750048,-0.669764,-0.674374,-1.647977,-0.21726,-0.688278,...,-2.897904,-6.278403,-1.775889,-0.689962,-0.676191,-0.735473,-2.226959,-0.666367,-0.665861,0.0
50%,12500.5,0.001859,-0.000247,0.013272,0.403483,-0.001309,-0.005994,-0.011349,0.963009,0.000414,...,-0.396946,-3.462072,1.931851,-0.020933,-0.004193,0.888535,-0.923354,0.004381,0.00242,0.0
75%,18750.25,0.679702,0.659523,0.683437,2.486453,0.672299,0.652629,1.65868,2.167758,0.68104,...,2.344956,-0.57854,5.473886,0.6613,0.673722,2.51679,0.395648,0.666474,0.665918,0.0
max,25000.0,4.208888,3.984564,3.793442,15.28825,4.020733,4.279607,9.538525,9.321099,4.261349,...,20.717964,18.818764,20.688069,4.338337,3.902131,11.880651,6.77898,3.834922,4.012639,1.0


In [6]:
display(Markdown("### Распределение таргета"))
target_dist = df["target"].value_counts(normalize=True)
display(target_dist)


### Распределение таргета

target
0    0.9508
1    0.0492
Name: proportion, dtype: float64

In [10]:
X = df.drop(columns=["target", "id"])
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)


In [11]:

display(Markdown("""
## Train / Test split

- Фиксированный `random_state` нужен для воспроизводимости  
- `stratify=y` сохраняет дисбаланс классов
"""))



## Train / Test split

- Фиксированный `random_state` нужен для воспроизводимости  
- `stratify=y` сохраняет дисбаланс классов


In [12]:
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)

y_pred = dummy.predict(X_test)

dummy_metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred)
}

dummy_metrics


{'accuracy': 0.95088, 'f1': 0.0}

In [13]:
logreg_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

logreg_pipe.fit(X_train, y_train)

y_pred = logreg_pipe.predict(X_test)
y_proba = logreg_pipe.predict_proba(X_test)[:, 1]

logreg_metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred),
    "roc_auc": roc_auc_score(y_test, y_proba)
}

logreg_metrics


{'accuracy': 0.7792, 'f1': 0.2572658772874058, 'roc_auc': 0.8418608704517014}

In [14]:
dt = DecisionTreeClassifier(random_state=42, class_weight="balanced")

dt_params = {
    "max_depth": [3, 5, 7, None],
    "min_samples_leaf": [1, 5, 20]
}

dt_search = GridSearchCV(
    dt,
    dt_params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

dt_search.fit(X_train, y_train)
dt_best = dt_search.best_estimator_

dt_search.best_params_


{'max_depth': 5, 'min_samples_leaf': 20}

In [15]:
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1
)

rf_params = {
    "max_depth": [5, 10, None],
    "min_samples_leaf": [1, 5]
}

rf_search = GridSearchCV(
    rf,
    rf_params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)
rf_best = rf_search.best_estimator_

rf_search.best_params_


{'max_depth': None, 'min_samples_leaf': 1}

In [16]:
gb = GradientBoostingClassifier(random_state=42)

gb_params = {
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
    "n_estimators": [100, 200]
}

gb_search = GridSearchCV(
    gb,
    gb_params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)

gb_search.fit(X_train, y_train)
gb_best = gb_search.best_estimator_

gb_search.best_params_


{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}

In [17]:
def evaluate(model):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba)
    }

results = {
    "dummy": dummy_metrics,
    "logreg": logreg_metrics,
    "decision_tree": evaluate(dt_best),
    "random_forest": evaluate(rf_best),
    "gradient_boosting": evaluate(gb_best),
}

results_df = pd.DataFrame(results).T
results_df


Unnamed: 0,accuracy,f1,roc_auc
dummy,0.95088,0.0,
logreg,0.7792,0.257266,0.841861
decision_tree,0.88,0.370805,0.821813
random_forest,0.96832,0.524038,0.905668
gradient_boosting,0.97696,0.708502,0.900249


In [18]:
best_model_name = results_df["roc_auc"].idxmax()
best_model = {
    "decision_tree": dt_best,
    "random_forest": rf_best,
    "gradient_boosting": gb_best,
}[best_model_name]

display(Markdown(f"## Лучшая модель: **{best_model_name}**"))


## Лучшая модель: **random_forest**

In [19]:
perm = permutation_importance(
    best_model, X_test, y_test,
    n_repeats=10,
    random_state=42,
    scoring="roc_auc"
)

imp = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)
imp.head(10)


f54    0.011751
f53    0.007202
f25    0.006406
f04    0.005969
f58    0.004972
f33    0.004292
f41    0.004275
f38    0.003660
f50    0.003257
f27    0.003146
dtype: float64