In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)


In [2]:
walmart_path = "dataset/Walmart_Sales.csv"
spotify_path = "dataset/spotify_churn_dataset.csv"

df_wm = pd.read_csv(walmart_path)
df_sp = pd.read_csv(spotify_path)

print("Walmart:", df_wm.shape)
display(df_wm.head())

print("\nSpotify:", df_sp.shape)
display(df_sp.head())


Walmart: (6435, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106



Spotify: (8000, 12)


Unnamed: 0,user_id,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
0,1,Female,54,CA,Free,26,23,0.2,Desktop,31,0,1
1,2,Other,33,DE,Family,141,62,0.34,Web,0,1,0
2,3,Male,38,AU,Premium,199,38,0.04,Mobile,0,1,1
3,4,Female,22,CA,Student,36,2,0.31,Mobile,0,1,0
4,5,Other,29,US,Family,250,57,0.36,Mobile,0,1,1


In [3]:
wm_target_candidates = ["Weekly_Sales", "weekly_sales", "Sales", "sales", "Target", "target"]
wm_target = next((c for c in wm_target_candidates if c in df_wm.columns), None)

sp_target_candidates = ["churn", "Churn", "is_churn", "IsChurn", "label", "Label", "target", "Target", "y", "is_churned"]
sp_target = next((c for c in sp_target_candidates if c in df_sp.columns), None)

print("Detected Walmart target:", wm_target)
print("Detected Spotify target:", sp_target)

if wm_target is None:
    raise ValueError("Не нашёл target в Walmart_Sales.csv. Укажи wm_target вручную.")
if sp_target is None:
    raise ValueError("Не нашёл target в spotify_churn_dataset.csv. Укажи sp_target вручную.")


Detected Walmart target: Weekly_Sales
Detected Spotify target: is_churned


In [4]:
df_wm_reg = df_wm.copy()

date_col_candidates = ["Date", "date", "DATE"]
date_col = next((c for c in date_col_candidates if c in df_wm_reg.columns), None)

if date_col is not None:
    df_wm_reg[date_col] = pd.to_datetime(df_wm_reg[date_col], errors="coerce")
    df_wm_reg["Year"] = df_wm_reg[date_col].dt.year
    df_wm_reg["Month"] = df_wm_reg[date_col].dt.month
    df_wm_reg["WeekOfYear"] = df_wm_reg[date_col].dt.isocalendar().week.astype(float)
    df_wm_reg["Day"] = df_wm_reg[date_col].dt.day
    df_wm_reg = df_wm_reg.drop(columns=[date_col])

X_wm = df_wm_reg.drop(columns=[wm_target])
y_wm = df_wm_reg[wm_target].astype(float)

Xw_train, Xw_test, yw_train, yw_test = train_test_split(
    X_wm, y_wm, test_size=0.2, random_state=RANDOM_STATE
)

num_cols_wm = Xw_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_wm = [c for c in Xw_train.columns if c not in num_cols_wm]

print("Train:", Xw_train.shape, "Test:", Xw_test.shape)
print("Numeric:", len(num_cols_wm), "Categorical:", len(cat_cols_wm), cat_cols_wm[:10])


Train: (5148, 10) Test: (1287, 10)
Numeric: 10 Categorical: 0 []


In [5]:
preprocess_wm = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols_wm),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols_wm)
    ],
    remainder="drop"
)

wm_tree_base = Pipeline([
    ("prep", preprocess_wm),
    ("model", DecisionTreeRegressor(random_state=RANDOM_STATE))
])

wm_tree_base.fit(Xw_train, yw_train)
yw_pred_tree_base = wm_tree_base.predict(Xw_test)

mae_tree_base = mean_absolute_error(yw_test, yw_pred_tree_base)
rmse_tree_base = np.sqrt(mean_squared_error(yw_test, yw_pred_tree_base))
r2_tree_base = r2_score(yw_test, yw_pred_tree_base)

print("=== Walmart DecisionTreeRegressor baseline ===")
print(f"MAE : {mae_tree_base:.4f}")
print(f"RMSE: {rmse_tree_base:.4f}")
print(f"R^2 : {r2_tree_base:.4f}")


=== Walmart DecisionTreeRegressor baseline ===
MAE : 98139.8657
RMSE: 195086.3388
R^2 : 0.8819


In [6]:
wm_tree_pipe = Pipeline([
    ("prep", preprocess_wm),
    ("model", DecisionTreeRegressor(random_state=RANDOM_STATE))
])

param_grid_wm_tree = {
    "model__max_depth": [None, 3, 5, 7, 10, 15],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 5, 10],
    "model__max_features": [None, "sqrt", "log2"]
}

cv_wm = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs_wm_tree = GridSearchCV(
    estimator=wm_tree_pipe,
    param_grid=param_grid_wm_tree,
    scoring="neg_mean_squared_error",
    cv=cv_wm,
    n_jobs=-1
)

gs_wm_tree.fit(Xw_train, yw_train)
print("Best params:", gs_wm_tree.best_params_)
print("Best CV MSE:", -gs_wm_tree.best_score_)


Best params: {'model__max_depth': 15, 'model__max_features': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 20}
Best CV MSE: 27794693184.861084


In [7]:
wm_tree_best = gs_wm_tree.best_estimator_
yw_pred_tree_imp = wm_tree_best.predict(Xw_test)

mae_tree_imp = mean_absolute_error(yw_test, yw_pred_tree_imp)
rmse_tree_imp = np.sqrt(mean_squared_error(yw_test, yw_pred_tree_imp))
r2_tree_imp = r2_score(yw_test, yw_pred_tree_imp)

print("=== Walmart DecisionTreeRegressor improved ===")
print(f"MAE : {mae_tree_imp:.4f}")
print(f"RMSE: {rmse_tree_imp:.4f}")
print(f"R^2 : {r2_tree_imp:.4f}")

print("\n=== Walmart comparison ===")
print(f"Baseline RMSE={rmse_tree_base:.4f}, R^2={r2_tree_base:.4f}")
print(f"Improved RMSE={rmse_tree_imp:.4f}, R^2={r2_tree_imp:.4f}")


=== Walmart DecisionTreeRegressor improved ===
MAE : 91607.7300
RMSE: 181142.6310
R^2 : 0.8981

=== Walmart comparison ===
Baseline RMSE=195086.3388, R^2=0.8819
Improved RMSE=181142.6310, R^2=0.8981


In [8]:
X_sp = df_sp.drop(columns=[sp_target])
y_sp = df_sp[sp_target]

if y_sp.dtype == "object":
    y_map = y_sp.astype(str).str.lower().map({"yes": 1, "no": 0, "true": 1, "false": 0})
    if y_map.isna().any():
        y_sp, classes = pd.factorize(y_sp)
        print("Target factorized:", list(classes))
    else:
        y_sp = y_map

Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X_sp, y_sp, test_size=0.2, random_state=RANDOM_STATE, stratify=y_sp
)

num_cols_sp = Xs_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_sp = [c for c in Xs_train.columns if c not in num_cols_sp]

print("Train:", Xs_train.shape, "Test:", Xs_test.shape)
print("Numeric:", len(num_cols_sp), "Categorical:", len(cat_cols_sp), cat_cols_sp[:10])


Train: (6400, 11) Test: (1600, 11)
Numeric: 7 Categorical: 4 ['gender', 'country', 'subscription_type', 'device_type']


In [9]:
preprocess_sp = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols_sp),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols_sp)
    ],
    remainder="drop"
)

sp_tree_base = Pipeline([
    ("prep", preprocess_sp),
    ("model", DecisionTreeClassifier(random_state=RANDOM_STATE))
])

sp_tree_base.fit(Xs_train, ys_train)
ys_pred_tree_base = sp_tree_base.predict(Xs_test)

# для ROC-AUC нужны вероятности класса 1
ys_proba_tree_base = sp_tree_base.predict_proba(Xs_test)[:, 1]

acc_base = accuracy_score(ys_test, ys_pred_tree_base)
prec_base = precision_score(ys_test, ys_pred_tree_base, zero_division=0)
rec_base = recall_score(ys_test, ys_pred_tree_base, zero_division=0)
f1_base = f1_score(ys_test, ys_pred_tree_base, zero_division=0)
auc_base = roc_auc_score(ys_test, ys_proba_tree_base)

print("=== Spotify DecisionTreeClassifier baseline ===")
print(f"Accuracy : {acc_base:.4f}")
print(f"Precision: {prec_base:.4f}")
print(f"Recall   : {rec_base:.4f}")
print(f"F1-score : {f1_base:.4f}")
print(f"ROC-AUC  : {auc_base:.4f}")


=== Spotify DecisionTreeClassifier baseline ===
Accuracy : 0.5956
Precision: 0.2382
Recall   : 0.2560
F1-score : 0.2468
ROC-AUC  : 0.4851


Улучшение (Spotify): подбор гиперпараметров дерева

Гипотезы:

ограничение глубины улучшит обобщение

min_samples_leaf снизит переобучение

class_weight="balanced" может помочь при дисбалансе классов

In [10]:
sp_tree_pipe = Pipeline([
    ("prep", preprocess_sp),
    ("model", DecisionTreeClassifier(random_state=RANDOM_STATE))
])

param_grid_sp_tree = {
    "model__max_depth": [None, 3, 5, 7, 10, 15],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 5, 10],
    "model__max_features": [None, "sqrt", "log2"],
    "model__class_weight": [None, "balanced"]
}

cv_sp = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs_sp_tree = GridSearchCV(
    estimator=sp_tree_pipe,
    param_grid=param_grid_sp_tree,
    scoring="f1",
    cv=cv_sp,
    n_jobs=-1
)

gs_sp_tree.fit(Xs_train, ys_train)
print("Best params:", gs_sp_tree.best_params_)
print("Best CV F1:", gs_sp_tree.best_score_)


Best params: {'model__class_weight': 'balanced', 'model__max_depth': 5, 'model__max_features': 'log2', 'model__min_samples_leaf': 1, 'model__min_samples_split': 20}
Best CV F1: 0.3741188633240967


In [11]:
sp_tree_best = gs_sp_tree.best_estimator_

ys_pred_imp = sp_tree_best.predict(Xs_test)
ys_proba_imp = sp_tree_best.predict_proba(Xs_test)[:, 1]

acc_imp = accuracy_score(ys_test, ys_pred_imp)
prec_imp = precision_score(ys_test, ys_pred_imp, zero_division=0)
rec_imp = recall_score(ys_test, ys_pred_imp, zero_division=0)
f1_imp = f1_score(ys_test, ys_pred_imp, zero_division=0)
auc_imp = roc_auc_score(ys_test, ys_proba_imp)

print("=== Spotify DecisionTreeClassifier improved ===")
print(f"Accuracy : {acc_imp:.4f}")
print(f"Precision: {prec_imp:.4f}")
print(f"Recall   : {rec_imp:.4f}")
print(f"F1-score : {f1_imp:.4f}")
print(f"ROC-AUC  : {auc_imp:.4f}")

print("\n=== Spotify comparison ===")
print(f"Baseline F1={f1_base:.4f}, AUC={auc_base:.4f}")
print(f"Improved F1={f1_imp:.4f}, AUC={auc_imp:.4f}")


=== Spotify DecisionTreeClassifier improved ===
Accuracy : 0.5331
Precision: 0.2511
Recall   : 0.4058
F1-score : 0.3102
ROC-AUC  : 0.5126

=== Spotify comparison ===
Baseline F1=0.2468, AUC=0.4851
Improved F1=0.3102, AUC=0.5126


In [12]:
class SimpleTreeNode:
    __slots__ = ("feature", "threshold", "left", "right", "value")
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # leaf value


def mse(y):
    y = np.asarray(y, dtype=float)
    if len(y) == 0:
        return 0.0
    return np.mean((y - y.mean()) ** 2)


def gini(y):
    y = np.asarray(y, dtype=int)
    if len(y) == 0:
        return 0.0
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p * p)


class DecisionTreeRegressorScratch:
    def __init__(self, max_depth=6, min_samples_leaf=5, n_features_try=None):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.n_features_try = n_features_try
        self.root = None

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        self.root = self._build(X, y, depth=0)
        return self

    def _best_split(self, X, y):
        n, d = X.shape
        features = np.arange(d)
        if self.n_features_try is not None and self.n_features_try < d:
            features = np.random.choice(features, self.n_features_try, replace=False)

        best_feature, best_thr = None, None
        best_score = float("inf")

        for f in features:
            xs = X[:, f]
            # кандидаты порога — несколько квантилей (быстро и стабильно)
            qs = np.unique(np.quantile(xs, [0.1, 0.3, 0.5, 0.7, 0.9]))
            for thr in qs:
                left_mask = xs <= thr
                right_mask = ~left_mask
                if left_mask.sum() < self.min_samples_leaf or right_mask.sum() < self.min_samples_leaf:
                    continue
                score = (left_mask.sum()/n)*mse(y[left_mask]) + (right_mask.sum()/n)*mse(y[right_mask])
                if score < best_score:
                    best_score = score
                    best_feature, best_thr = f, thr

        return best_feature, best_thr, best_score

    def _build(self, X, y, depth):
        # условия остановки
        if depth >= self.max_depth or len(y) <= 2*self.min_samples_leaf:
            return SimpleTreeNode(value=float(np.mean(y)))

        f, thr, score = self._best_split(X, y)
        if f is None:
            return SimpleTreeNode(value=float(np.mean(y)))

        left_mask = X[:, f] <= thr
        right_mask = ~left_mask

        left = self._build(X[left_mask], y[left_mask], depth+1)
        right = self._build(X[right_mask], y[right_mask], depth+1)
        return SimpleTreeNode(feature=f, threshold=float(thr), left=left, right=right, value=None)

    def predict_one(self, x):
        node = self.root
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        return np.array([self.predict_one(x) for x in X], dtype=float)


class DecisionTreeClassifierScratch:
    def __init__(self, max_depth=6, min_samples_leaf=5, n_features_try=None):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.n_features_try = n_features_try
        self.root = None

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=int)
        self.root = self._build(X, y, depth=0)
        return self

    def _best_split(self, X, y):
        n, d = X.shape
        features = np.arange(d)
        if self.n_features_try is not None and self.n_features_try < d:
            features = np.random.choice(features, self.n_features_try, replace=False)

        best_feature, best_thr = None, None
        best_score = float("inf")

        for f in features:
            xs = X[:, f]
            qs = np.unique(np.quantile(xs, [0.1, 0.3, 0.5, 0.7, 0.9]))
            for thr in qs:
                left_mask = xs <= thr
                right_mask = ~left_mask
                if left_mask.sum() < self.min_samples_leaf or right_mask.sum() < self.min_samples_leaf:
                    continue
                score = (left_mask.sum()/n)*gini(y[left_mask]) + (right_mask.sum()/n)*gini(y[right_mask])
                if score < best_score:
                    best_score = score
                    best_feature, best_thr = f, thr

        return best_feature, best_thr, best_score

    def _leaf_value(self, y):
        vals, cnt = np.unique(y, return_counts=True)
        return int(vals[np.argmax(cnt)])

    def _build(self, X, y, depth):
        if depth >= self.max_depth or len(y) <= 2*self.min_samples_leaf:
            return SimpleTreeNode(value=self._leaf_value(y))

        f, thr, score = self._best_split(X, y)
        if f is None:
            return SimpleTreeNode(value=self._leaf_value(y))

        left_mask = X[:, f] <= thr
        right_mask = ~left_mask

        left = self._build(X[left_mask], y[left_mask], depth+1)
        right = self._build(X[right_mask], y[right_mask], depth+1)
        return SimpleTreeNode(feature=f, threshold=float(thr), left=left, right=right, value=None)

    def predict_one(self, x):
        node = self.root
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return int(node.value)

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        return np.array([self.predict_one(x) for x in X], dtype=int)


In [13]:
# Walmart: матрицы
Xw_train_mat = preprocess_wm.fit_transform(Xw_train)
Xw_test_mat  = preprocess_wm.transform(Xw_test)

reg_s = DecisionTreeRegressorScratch(max_depth=6, min_samples_leaf=10)
reg_s.fit(Xw_train_mat, yw_train)
yw_pred_s = reg_s.predict(Xw_test_mat)

mae_s = mean_absolute_error(yw_test, yw_pred_s)
rmse_s = np.sqrt(mean_squared_error(yw_test, yw_pred_s))
r2_s = r2_score(yw_test, yw_pred_s)

print("=== Walmart SCRATCH TreeRegressor ===")
print(f"MAE : {mae_s:.4f}")
print(f"RMSE: {rmse_s:.4f}")
print(f"R^2 : {r2_s:.4f}")

# Spotify: матрицы
Xs_train_mat = preprocess_sp.fit_transform(Xs_train)
Xs_test_mat  = preprocess_sp.transform(Xs_test)

cls_s = DecisionTreeClassifierScratch(max_depth=6, min_samples_leaf=10)
cls_s.fit(Xs_train_mat, ys_train)
ys_pred_s = cls_s.predict(Xs_test_mat)

acc_s = accuracy_score(ys_test, ys_pred_s)
prec_s = precision_score(ys_test, ys_pred_s, zero_division=0)
rec_s = recall_score(ys_test, ys_pred_s, zero_division=0)
f1_s = f1_score(ys_test, ys_pred_s, zero_division=0)

print("\n=== Spotify SCRATCH TreeClassifier ===")
print(f"Accuracy : {acc_s:.4f}")
print(f"Precision: {prec_s:.4f}")
print(f"Recall   : {rec_s:.4f}")
print(f"F1-score : {f1_s:.4f}")


=== Walmart SCRATCH TreeRegressor ===
MAE : 217904.4101
RMSE: 323748.3002
R^2 : 0.6747

=== Spotify SCRATCH TreeClassifier ===
Accuracy : 0.7344
Precision: 0.2800
Recall   : 0.0169
F1-score : 0.0319


In [14]:
# Walmart: матрицы
Xw_train_mat = preprocess_wm.fit_transform(Xw_train)
Xw_test_mat  = preprocess_wm.transform(Xw_test)

reg_s = DecisionTreeRegressorScratch(max_depth=6, min_samples_leaf=10)
reg_s.fit(Xw_train_mat, yw_train)
yw_pred_s = reg_s.predict(Xw_test_mat)

mae_s = mean_absolute_error(yw_test, yw_pred_s)
rmse_s = np.sqrt(mean_squared_error(yw_test, yw_pred_s))
r2_s = r2_score(yw_test, yw_pred_s)

print("=== Walmart SCRATCH TreeRegressor ===")
print(f"MAE : {mae_s:.4f}")
print(f"RMSE: {rmse_s:.4f}")
print(f"R^2 : {r2_s:.4f}")

# Spotify: матрицы
Xs_train_mat = preprocess_sp.fit_transform(Xs_train)
Xs_test_mat  = preprocess_sp.transform(Xs_test)

cls_s = DecisionTreeClassifierScratch(max_depth=6, min_samples_leaf=10)
cls_s.fit(Xs_train_mat, ys_train)
ys_pred_s = cls_s.predict(Xs_test_mat)

acc_s = accuracy_score(ys_test, ys_pred_s)
prec_s = precision_score(ys_test, ys_pred_s, zero_division=0)
rec_s = recall_score(ys_test, ys_pred_s, zero_division=0)
f1_s = f1_score(ys_test, ys_pred_s, zero_division=0)

print("\n=== Spotify SCRATCH TreeClassifier ===")
print(f"Accuracy : {acc_s:.4f}")
print(f"Precision: {prec_s:.4f}")
print(f"Recall   : {rec_s:.4f}")
print(f"F1-score : {f1_s:.4f}")


=== Walmart SCRATCH TreeRegressor ===
MAE : 217904.4101
RMSE: 323748.3002
R^2 : 0.6747

=== Spotify SCRATCH TreeClassifier ===
Accuracy : 0.7344
Precision: 0.2800
Recall   : 0.0169
F1-score : 0.0319


In [15]:
# Walmart scratch improved
reg_s2 = DecisionTreeRegressorScratch(max_depth=10, min_samples_leaf=5)
reg_s2.fit(Xw_train_mat, yw_train)
yw_pred_s2 = reg_s2.predict(Xw_test_mat)

rmse_s2 = np.sqrt(mean_squared_error(yw_test, yw_pred_s2))
r2_s2 = r2_score(yw_test, yw_pred_s2)

print("=== Walmart SCRATCH improved ===")
print(f"RMSE: {rmse_s2:.4f}")
print(f"R^2 : {r2_s2:.4f}")

# Spotify scratch improved
cls_s2 = DecisionTreeClassifierScratch(max_depth=10, min_samples_leaf=5)
cls_s2.fit(Xs_train_mat, ys_train)
ys_pred_s2 = cls_s2.predict(Xs_test_mat)

f1_s2 = f1_score(ys_test, ys_pred_s2, zero_division=0)
print("\n=== Spotify SCRATCH improved ===")
print(f"F1: {f1_s2:.4f}")


=== Walmart SCRATCH improved ===
RMSE: 162392.2297
R^2 : 0.9181

=== Spotify SCRATCH improved ===
F1: 0.1475


In [16]:
print("===== FINAL SUMMARY LAB 3 =====")

print("\nWalmart (Regression):")
print(f"SKLEARN Tree baseline  RMSE={rmse_tree_base:.4f}, R2={r2_tree_base:.4f}")
print(f"SKLEARN Tree improved  RMSE={rmse_tree_imp:.4f}, R2={r2_tree_imp:.4f}")
print(f"SCRATCH Tree baseline  RMSE={rmse_s:.4f}, R2={r2_s:.4f}")
print(f"SCRATCH Tree improved  RMSE={rmse_s2:.4f}, R2={r2_s2:.4f}")

print("\nSpotify (Classification):")
print(f"SKLEARN Tree baseline  F1={f1_base:.4f}, AUC={auc_base:.4f}")
print(f"SKLEARN Tree improved  F1={f1_imp:.4f}, AUC={auc_imp:.4f}")
print(f"SCRATCH Tree baseline  F1={f1_s:.4f}")
print(f"SCRATCH Tree improved  F1={f1_s2:.4f}")


===== FINAL SUMMARY LAB 3 =====

Walmart (Regression):
SKLEARN Tree baseline  RMSE=195086.3388, R2=0.8819
SKLEARN Tree improved  RMSE=181142.6310, R2=0.8981
SCRATCH Tree baseline  RMSE=323748.3002, R2=0.6747
SCRATCH Tree improved  RMSE=162392.2297, R2=0.9181

Spotify (Classification):
SKLEARN Tree baseline  F1=0.2468, AUC=0.4851
SKLEARN Tree improved  F1=0.3102, AUC=0.5126
SCRATCH Tree baseline  F1=0.0319
SCRATCH Tree improved  F1=0.1475


В лабораторной работе было исследовано решающее дерево для задач регрессии и классификации. Бейзлайн на DecisionTreeRegressor/DecisionTreeClassifier показал базовое качество, однако дерево склонно к переобучению. Подбор гиперпараметров (ограничение глубины, увеличение min_samples_leaf и др.) позволил улучшить качество на тестовой выборке, что подтверждает необходимость регуляризации дерева.

Дополнительно была реализована упрощённая версия дерева “с нуля”, которая показала сопоставимые тенденции: при настройке глубины и минимального числа объектов в листе качество меняется аналогично реализации sklearn.