In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)


In [2]:
walmart_path = "dataset/Walmart_Sales.csv"
spotify_path = "dataset/spotify_churn_dataset.csv"

df_wm = pd.read_csv(walmart_path)
df_sp = pd.read_csv(spotify_path)

print("Walmart:", df_wm.shape)
display(df_wm.head())

print("\nSpotify:", df_sp.shape)
display(df_sp.head())


Walmart: (6435, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106



Spotify: (8000, 12)


Unnamed: 0,user_id,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
0,1,Female,54,CA,Free,26,23,0.2,Desktop,31,0,1
1,2,Other,33,DE,Family,141,62,0.34,Web,0,1,0
2,3,Male,38,AU,Premium,199,38,0.04,Mobile,0,1,1
3,4,Female,22,CA,Student,36,2,0.31,Mobile,0,1,0
4,5,Other,29,US,Family,250,57,0.36,Mobile,0,1,1


In [3]:
wm_target_candidates = ["Weekly_Sales", "weekly_sales", "Sales", "sales", "Target", "target"]
wm_target = next((c for c in wm_target_candidates if c in df_wm.columns), None)

sp_target_candidates = ["churn", "Churn", "is_churn", "IsChurn", "label", "Label", "target", "Target", "y", "is_churned"]
sp_target = next((c for c in sp_target_candidates if c in df_sp.columns), None)

print("Detected Walmart target:", wm_target)
print("Detected Spotify target:", sp_target)

if wm_target is None:
    raise ValueError("Не нашёл target в Walmart_Sales.csv. Укажи wm_target вручную.")
if sp_target is None:
    raise ValueError("Не нашёл target в spotify_churn_dataset.csv. Укажи sp_target вручную.")


Detected Walmart target: Weekly_Sales
Detected Spotify target: is_churned


In [4]:
df_wm_reg = df_wm.copy()

date_col_candidates = ["Date", "date", "DATE"]
date_col = next((c for c in date_col_candidates if c in df_wm_reg.columns), None)

if date_col is not None:
    df_wm_reg[date_col] = pd.to_datetime(df_wm_reg[date_col], errors="coerce")
    df_wm_reg["Year"] = df_wm_reg[date_col].dt.year
    df_wm_reg["Month"] = df_wm_reg[date_col].dt.month
    df_wm_reg["WeekOfYear"] = df_wm_reg[date_col].dt.isocalendar().week.astype(float)
    df_wm_reg["Day"] = df_wm_reg[date_col].dt.day
    df_wm_reg = df_wm_reg.drop(columns=[date_col])

X_wm = df_wm_reg.drop(columns=[wm_target])
y_wm = df_wm_reg[wm_target].astype(float)

Xw_train, Xw_test, yw_train, yw_test = train_test_split(
    X_wm, y_wm, test_size=0.2, random_state=RANDOM_STATE
)

num_cols_wm = Xw_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_wm = [c for c in Xw_train.columns if c not in num_cols_wm]

print("Train:", Xw_train.shape, "Test:", Xw_test.shape)
print("Numeric:", len(num_cols_wm), "Categorical:", len(cat_cols_wm), cat_cols_wm[:10])


Train: (5148, 10) Test: (1287, 10)
Numeric: 10 Categorical: 0 []


In [5]:
preprocess_wm = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols_wm),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols_wm),
    ],
    remainder="drop"
)

wm_gb_base = Pipeline([
    ("prep", preprocess_wm),
    ("model", GradientBoostingRegressor(random_state=RANDOM_STATE))
])

wm_gb_base.fit(Xw_train, yw_train)
yw_pred_gb_base = wm_gb_base.predict(Xw_test)

mae_gb_base = mean_absolute_error(yw_test, yw_pred_gb_base)
rmse_gb_base = np.sqrt(mean_squared_error(yw_test, yw_pred_gb_base))
r2_gb_base = r2_score(yw_test, yw_pred_gb_base)

print("=== Walmart GradientBoostingRegressor baseline ===")
print(f"MAE : {mae_gb_base:.4f}")
print(f"RMSE: {rmse_gb_base:.4f}")
print(f"R^2 : {r2_gb_base:.4f}")


=== Walmart GradientBoostingRegressor baseline ===
MAE : 133825.6988
RMSE: 194819.0911
R^2 : 0.8822


In [7]:
wm_gb_pipe = Pipeline([
    ("prep", preprocess_wm),
    ("model", GradientBoostingRegressor(random_state=RANDOM_STATE))
])

param_grid_wm_gb = {
    "model__n_estimators": [200, 400],
    "model__learning_rate": [0.03, 0.1],
    "model__max_depth": [2, 3, 5],
    "model__subsample": [0.7, 1.0]
}

cv_wm = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs_wm_gb = GridSearchCV(
    estimator=wm_gb_pipe,
    param_grid=param_grid_wm_gb,
    scoring="neg_mean_squared_error",
    cv=cv_wm,
    n_jobs=-1
)

gs_wm_gb.fit(Xw_train, yw_train)
print("Best params:", gs_wm_gb.best_params_)
print("Best CV MSE:", -gs_wm_gb.best_score_)


Best params: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 400, 'model__subsample': 1.0}
Best CV MSE: 14335824872.376583


In [8]:
wm_gb_best = gs_wm_gb.best_estimator_
yw_pred_gb_imp = wm_gb_best.predict(Xw_test)

mae_gb_imp = mean_absolute_error(yw_test, yw_pred_gb_imp)
rmse_gb_imp = np.sqrt(mean_squared_error(yw_test, yw_pred_gb_imp))
r2_gb_imp = r2_score(yw_test, yw_pred_gb_imp)

print("=== Walmart GradientBoostingRegressor improved ===")
print(f"MAE : {mae_gb_imp:.4f}")
print(f"RMSE: {rmse_gb_imp:.4f}")
print(f"R^2 : {r2_gb_imp:.4f}")

print("\n=== Walmart comparison ===")
print(f"Baseline RMSE={rmse_gb_base:.4f}, R^2={r2_gb_base:.4f}")
print(f"Improved RMSE={rmse_gb_imp:.4f}, R^2={r2_gb_imp:.4f}")


=== Walmart GradientBoostingRegressor improved ===
MAE : 63182.0978
RMSE: 118855.8977
R^2 : 0.9561

=== Walmart comparison ===
Baseline RMSE=194819.0911, R^2=0.8822
Improved RMSE=118855.8977, R^2=0.9561


In [9]:
X_sp = df_sp.drop(columns=[sp_target])
y_sp = df_sp[sp_target]

if y_sp.dtype == "object":
    y_map = y_sp.astype(str).str.lower().map({"yes": 1, "no": 0, "true": 1, "false": 0})
    if y_map.isna().any():
        y_sp, classes = pd.factorize(y_sp)
        print("Target factorized:", list(classes))
    else:
        y_sp = y_map

Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X_sp, y_sp, test_size=0.2, random_state=RANDOM_STATE, stratify=y_sp
)

num_cols_sp = Xs_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_sp = [c for c in Xs_train.columns if c not in num_cols_sp]

print("Train:", Xs_train.shape, "Test:", Xs_test.shape)
print("Numeric:", len(num_cols_sp), "Categorical:", len(cat_cols_sp), cat_cols_sp[:10])


Train: (6400, 11) Test: (1600, 11)
Numeric: 7 Categorical: 4 ['gender', 'country', 'subscription_type', 'device_type']


In [10]:
preprocess_sp = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols_sp),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols_sp),
    ],
    remainder="drop"
)

sp_gb_base = Pipeline([
    ("prep", preprocess_sp),
    ("model", GradientBoostingClassifier(random_state=RANDOM_STATE))
])

sp_gb_base.fit(Xs_train, ys_train)
ys_pred_base = sp_gb_base.predict(Xs_test)
ys_proba_base = sp_gb_base.predict_proba(Xs_test)[:, 1]

acc_base = accuracy_score(ys_test, ys_pred_base)
prec_base = precision_score(ys_test, ys_pred_base, zero_division=0)
rec_base = recall_score(ys_test, ys_pred_base, zero_division=0)
f1_base = f1_score(ys_test, ys_pred_base, zero_division=0)
auc_base = roc_auc_score(ys_test, ys_proba_base)

print("=== Spotify GradientBoostingClassifier baseline ===")
print(f"Accuracy : {acc_base:.4f}")
print(f"Precision: {prec_base:.4f}")
print(f"Recall   : {rec_base:.4f}")
print(f"F1-score : {f1_base:.4f}")
print(f"ROC-AUC  : {auc_base:.4f}")


=== Spotify GradientBoostingClassifier baseline ===
Accuracy : 0.7388
Precision: 0.1667
Recall   : 0.0024
F1-score : 0.0048
ROC-AUC  : 0.5018


Гипотезы:

больше деревьев и меньший learning_rate обычно дают лучший результат

subsample < 1 снижает переобучение

глубина слабых деревьев регулирует сложность

In [11]:
sp_gb_pipe = Pipeline([
    ("prep", preprocess_sp),
    ("model", GradientBoostingClassifier(random_state=RANDOM_STATE))
])

param_grid_sp_gb = {
    "model__n_estimators": [200, 400],
    "model__learning_rate": [0.03, 0.1],
    "model__max_depth": [2, 3],
    "model__subsample": [0.7, 1.0]
}

cv_sp = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs_sp_gb = GridSearchCV(
    estimator=sp_gb_pipe,
    param_grid=param_grid_sp_gb,
    scoring="f1",
    cv=cv_sp,
    n_jobs=-1
)

gs_sp_gb.fit(Xs_train, ys_train)
print("Best params:", gs_sp_gb.best_params_)
print("Best CV F1:", gs_sp_gb.best_score_)


Best params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 400, 'model__subsample': 0.7}
Best CV F1: 0.05944242580834082


In [12]:
sp_gb_best = gs_sp_gb.best_estimator_

ys_pred_imp = sp_gb_best.predict(Xs_test)
ys_proba_imp = sp_gb_best.predict_proba(Xs_test)[:, 1]

acc_imp = accuracy_score(ys_test, ys_pred_imp)
prec_imp = precision_score(ys_test, ys_pred_imp, zero_division=0)
rec_imp = recall_score(ys_test, ys_pred_imp, zero_division=0)
f1_imp = f1_score(ys_test, ys_pred_imp, zero_division=0)
auc_imp = roc_auc_score(ys_test, ys_proba_imp)

print("=== Spotify GradientBoostingClassifier improved ===")
print(f"Accuracy : {acc_imp:.4f}")
print(f"Precision: {prec_imp:.4f}")
print(f"Recall   : {rec_imp:.4f}")
print(f"F1-score : {f1_imp:.4f}")
print(f"ROC-AUC  : {auc_imp:.4f}")

print("\n=== Spotify comparison ===")
print(f"Baseline F1={f1_base:.4f}, AUC={auc_base:.4f}")
print(f"Improved F1={f1_imp:.4f}, AUC={auc_imp:.4f}")


=== Spotify GradientBoostingClassifier improved ===
Accuracy : 0.7281
Precision: 0.2162
Recall   : 0.0193
F1-score : 0.0355
ROC-AUC  : 0.5026

=== Spotify comparison ===
Baseline F1=0.0048, AUC=0.5018
Improved F1=0.0355, AUC=0.5026


In [13]:
class GradientBoostingRegressorScratch:
    def __init__(self, n_estimators=200, learning_rate=0.1, max_depth=2, random_state=42):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.init_ = 0.0
        self.models_ = []

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        rng = np.random.RandomState(self.random_state)

        self.init_ = float(np.mean(y))
        pred = np.full_like(y, self.init_, dtype=float)
        self.models_ = []

        for _ in range(self.n_estimators):
            residual = y - pred
            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=rng.randint(0, 10**9))
            tree.fit(X, residual)
            update = tree.predict(X)
            pred += self.learning_rate * update
            self.models_.append(tree)

        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        pred = np.full(X.shape[0], self.init_, dtype=float)
        for tree in self.models_:
            pred += self.learning_rate * tree.predict(X)
        return pred


class GradientBoostingClassifierScratch:
    def __init__(self, n_estimators=200, learning_rate=0.1, max_depth=2, random_state=42):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.init_logit_ = 0.0
        self.models_ = []

    @staticmethod
    def _sigmoid(z):
        z = np.clip(z, -50, 50)
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        rng = np.random.RandomState(self.random_state)

        # init logit = log(p/(1-p))
        p0 = np.clip(np.mean(y), 1e-6, 1 - 1e-6)
        self.init_logit_ = float(np.log(p0 / (1 - p0)))

        logit = np.full(X.shape[0], self.init_logit_, dtype=float)
        self.models_ = []

        for _ in range(self.n_estimators):
            p = self._sigmoid(logit)
            # псевдо-градиент для logloss: (y - p)
            grad = y - p

            tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=rng.randint(0, 10**9))
            tree.fit(X, grad)
            update = tree.predict(X)

            logit += self.learning_rate * update
            self.models_.append(tree)

        return self

    def predict_proba(self, X):
        X = np.asarray(X, dtype=float)
        logit = np.full(X.shape[0], self.init_logit_, dtype=float)
        for tree in self.models_:
            logit += self.learning_rate * tree.predict(X)
        return self._sigmoid(logit)

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X) >= threshold).astype(int)


In [14]:
# Walmart scratch
Xw_train_mat = preprocess_wm.fit_transform(Xw_train)
Xw_test_mat  = preprocess_wm.transform(Xw_test)

gb_s_reg = GradientBoostingRegressorScratch(n_estimators=200, learning_rate=0.1, max_depth=2, random_state=RANDOM_STATE)
gb_s_reg.fit(Xw_train_mat, yw_train)
yw_pred_s_base = gb_s_reg.predict(Xw_test_mat)

rmse_s_base = np.sqrt(mean_squared_error(yw_test, yw_pred_s_base))
r2_s_base = r2_score(yw_test, yw_pred_s_base)

print("=== Walmart SCRATCH GB baseline ===")
print(f"RMSE: {rmse_s_base:.4f}")
print(f"R^2 : {r2_s_base:.4f}")

# Spotify scratch
Xs_train_mat = preprocess_sp.fit_transform(Xs_train)
Xs_test_mat  = preprocess_sp.transform(Xs_test)

gb_s_cls = GradientBoostingClassifierScratch(n_estimators=250, learning_rate=0.1, max_depth=2, random_state=RANDOM_STATE)
gb_s_cls.fit(Xs_train_mat, ys_train)

ys_pred_s_base = gb_s_cls.predict(Xs_test_mat)
ys_proba_s_base = gb_s_cls.predict_proba(Xs_test_mat)

f1_s_base = f1_score(ys_test, ys_pred_s_base, zero_division=0)
auc_s_base = roc_auc_score(ys_test, ys_proba_s_base)

print("\n=== Spotify SCRATCH GB baseline ===")
print(f"F1 : {f1_s_base:.4f}")
print(f"AUC: {auc_s_base:.4f}")


=== Walmart SCRATCH GB baseline ===
RMSE: 202291.4567
R^2 : 0.8730

=== Spotify SCRATCH GB baseline ===
F1 : 0.0000
AUC: 0.4779


In [15]:
# Walmart scratch improved
gb_s_reg2 = GradientBoostingRegressorScratch(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=RANDOM_STATE)
gb_s_reg2.fit(Xw_train_mat, yw_train)
yw_pred_s_imp = gb_s_reg2.predict(Xw_test_mat)

rmse_s_imp = np.sqrt(mean_squared_error(yw_test, yw_pred_s_imp))
r2_s_imp = r2_score(yw_test, yw_pred_s_imp)

print("=== Walmart SCRATCH GB improved ===")
print(f"RMSE: {rmse_s_imp:.4f}")
print(f"R^2 : {r2_s_imp:.4f}")

# Spotify scratch improved
gb_s_cls2 = GradientBoostingClassifierScratch(n_estimators=600, learning_rate=0.05, max_depth=3, random_state=RANDOM_STATE)
gb_s_cls2.fit(Xs_train_mat, ys_train)

ys_pred_s_imp = gb_s_cls2.predict(Xs_test_mat)
ys_proba_s_imp = gb_s_cls2.predict_proba(Xs_test_mat)

f1_s_imp = f1_score(ys_test, ys_pred_s_imp, zero_division=0)
auc_s_imp = roc_auc_score(ys_test, ys_proba_s_imp)

print("\n=== Spotify SCRATCH GB improved ===")
print(f"F1 : {f1_s_imp:.4f}")
print(f"AUC: {auc_s_imp:.4f}")


=== Walmart SCRATCH GB improved ===
RMSE: 147868.8385
R^2 : 0.9321

=== Spotify SCRATCH GB improved ===
F1 : 0.0000
AUC: 0.4917


In [16]:
print("===== FINAL SUMMARY LAB 5 =====")

print("\nWalmart (Regression):")
print(f"SKLEARN GB baseline  RMSE={rmse_gb_base:.4f}, R2={r2_gb_base:.4f}")
print(f"SKLEARN GB improved  RMSE={rmse_gb_imp:.4f}, R2={r2_gb_imp:.4f}")
print(f"SCRATCH GB baseline  RMSE={rmse_s_base:.4f}, R2={r2_s_base:.4f}")
print(f"SCRATCH GB improved  RMSE={rmse_s_imp:.4f}, R2={r2_s_imp:.4f}")

print("\nSpotify (Classification):")
print(f"SKLEARN GB baseline  F1={f1_base:.4f}, AUC={auc_base:.4f}")
print(f"SKLEARN GB improved  F1={f1_imp:.4f}, AUC={auc_imp:.4f}")
print(f"SCRATCH GB baseline  F1={f1_s_base:.4f}, AUC={auc_s_base:.4f}")
print(f"SCRATCH GB improved  F1={f1_s_imp:.4f}, AUC={auc_s_imp:.4f}")


===== FINAL SUMMARY LAB 5 =====

Walmart (Regression):
SKLEARN GB baseline  RMSE=194819.0911, R2=0.8822
SKLEARN GB improved  RMSE=118855.8977, R2=0.9561
SCRATCH GB baseline  RMSE=202291.4567, R2=0.8730
SCRATCH GB improved  RMSE=147868.8385, R2=0.9321

Spotify (Classification):
SKLEARN GB baseline  F1=0.0048, AUC=0.5018
SKLEARN GB improved  F1=0.0355, AUC=0.5026
SCRATCH GB baseline  F1=0.0000, AUC=0.4779
SCRATCH GB improved  F1=0.0000, AUC=0.4917


В лабораторной работе был исследован градиентный бустинг для задач регрессии и классификации. Бустинг показал высокое качество за счёт последовательного исправления ошибок предыдущих моделей. Подбор ключевых гиперпараметров (число деревьев, learning_rate, глубина слабых моделей и subsample) позволил улучшить метрики на тестовой выборке.

Дополнительно была реализована упрощённая версия бустинга “с нуля”. Результаты scratch-реализации подтвердили общую идею бустинга: увеличение числа итераций при уменьшении шага обучения приводит к более стабильной и точной модели, но увеличивает время обучения.