В этой лабораторной:

Walmart → RandomForestRegressor

Spotify → RandomForestClassifier

Бейзлайн + улучшение через GridSearchCV

Реализация “с нуля”: bagging + random subspace

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)


In [3]:
walmart_path = "dataset/Walmart_Sales.csv"
spotify_path = "dataset/spotify_churn_dataset.csv"

df_wm = pd.read_csv(walmart_path)
df_sp = pd.read_csv(spotify_path)

print("Walmart:", df_wm.shape)
display(df_wm.head())

print("\nSpotify:", df_sp.shape)
display(df_sp.head())


Walmart: (6435, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106



Spotify: (8000, 12)


Unnamed: 0,user_id,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
0,1,Female,54,CA,Free,26,23,0.2,Desktop,31,0,1
1,2,Other,33,DE,Family,141,62,0.34,Web,0,1,0
2,3,Male,38,AU,Premium,199,38,0.04,Mobile,0,1,1
3,4,Female,22,CA,Student,36,2,0.31,Mobile,0,1,0
4,5,Other,29,US,Family,250,57,0.36,Mobile,0,1,1


In [7]:
wm_target_candidates = ["Weekly_Sales", "weekly_sales", "Sales", "sales", "Target", "target"]
wm_target = next((c for c in wm_target_candidates if c in df_wm.columns), None)

sp_target_candidates = ["churn", "Churn", "is_churn", "IsChurn", "label", "Label", "target", "Target", "y", "is_churned"]
sp_target = next((c for c in sp_target_candidates if c in df_sp.columns), None)

print("Detected Walmart target:", wm_target)
print("Detected Spotify target:", sp_target)

if wm_target is None:
    raise ValueError("Не нашёл target в Walmart_Sales.csv. Укажи wm_target вручную.")
if sp_target is None:
    raise ValueError("Не нашёл target в spotify_churn_dataset.csv. Укажи sp_target вручную.")


Detected Walmart target: Weekly_Sales
Detected Spotify target: is_churned


In [21]:
df_wm_reg = df_wm.copy()

date_col_candidates = ["Date", "date", "DATE"]
date_col = next((c for c in date_col_candidates if c in df_wm_reg.columns), None)

if date_col is not None:
    df_wm_reg[date_col] = pd.to_datetime(df_wm_reg[date_col], errors="coerce")
    df_wm_reg["Year"] = df_wm_reg[date_col].dt.year
    df_wm_reg["Month"] = df_wm_reg[date_col].dt.month
    df_wm_reg["WeekOfYear"] = df_wm_reg[date_col].dt.isocalendar().week.astype(float)
    df_wm_reg["Day"] = df_wm_reg[date_col].dt.day
    df_wm_reg = df_wm_reg.drop(columns=[date_col])

X_wm = df_wm_reg.drop(columns=[wm_target])
y_wm = df_wm_reg[wm_target].astype(float)

Xw_train, Xw_test, yw_train, yw_test = train_test_split(
    X_wm, y_wm, test_size=0.2, random_state=RANDOM_STATE
)

num_cols_wm = Xw_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_wm = [c for c in Xw_train.columns if c not in num_cols_wm]

print("Train:", Xw_train.shape, "Test:", Xw_test.shape)
print("Numeric:", len(num_cols_wm), "Categorical:", len(cat_cols_wm), cat_cols_wm[:10])


Train: (5148, 10) Test: (1287, 10)
Numeric: 10 Categorical: 0 []


In [22]:
preprocess_wm = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols_wm),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols_wm),
    ],
    remainder="drop"
)

wm_rf_base = Pipeline([
    ("prep", preprocess_wm),
    ("model", RandomForestRegressor(
        n_estimators=200,
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

wm_rf_base.fit(Xw_train, yw_train)
yw_pred_rf_base = wm_rf_base.predict(Xw_test)

mae_rf_base = mean_absolute_error(yw_test, yw_pred_rf_base)
rmse_rf_base = np.sqrt(mean_squared_error(yw_test, yw_pred_rf_base))
r2_rf_base = r2_score(yw_test, yw_pred_rf_base)

print("=== Walmart RandomForestRegressor baseline ===")
print(f"MAE : {mae_rf_base:.4f}")
print(f"RMSE: {rmse_rf_base:.4f}")
print(f"R^2 : {r2_rf_base:.4f}")


=== Walmart RandomForestRegressor baseline ===
MAE : 72426.6007
RMSE: 145546.2697
R^2 : 0.9342


In [23]:
wm_rf_pipe = Pipeline([
    ("prep", preprocess_wm),
    ("model", RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1))
])

param_grid_wm_rf = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_leaf": [1, 2, 5, 10],
    "model__max_features": ["sqrt", "log2", None],
}

cv_wm = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs_wm_rf = GridSearchCV(
    estimator=wm_rf_pipe,
    param_grid=param_grid_wm_rf,
    scoring="neg_mean_squared_error",
    cv=cv_wm,
    n_jobs=-1
)

gs_wm_rf.fit(Xw_train, yw_train)
print("Best params:", gs_wm_rf.best_params_)
print("Best CV MSE:", -gs_wm_rf.best_score_)


Best params: {'model__max_depth': None, 'model__max_features': None, 'model__min_samples_leaf': 2, 'model__n_estimators': 400}
Best CV MSE: 22336940909.247143


In [24]:
wm_rf_best = gs_wm_rf.best_estimator_
yw_pred_rf_imp = wm_rf_best.predict(Xw_test)

mae_rf_imp = mean_absolute_error(yw_test, yw_pred_rf_imp)
rmse_rf_imp = np.sqrt(mean_squared_error(yw_test, yw_pred_rf_imp))
r2_rf_imp = r2_score(yw_test, yw_pred_rf_imp)

print("=== Walmart RandomForestRegressor improved ===")
print(f"MAE : {mae_rf_imp:.4f}")
print(f"RMSE: {rmse_rf_imp:.4f}")
print(f"R^2 : {r2_rf_imp:.4f}")

print("\n=== Walmart comparison ===")
print(f"Baseline RMSE={rmse_rf_base:.4f}, R^2={r2_rf_base:.4f}")
print(f"Improved RMSE={rmse_rf_imp:.4f}, R^2={r2_rf_imp:.4f}")


=== Walmart RandomForestRegressor improved ===
MAE : 71019.4584
RMSE: 143123.3632
R^2 : 0.9364

=== Walmart comparison ===
Baseline RMSE=145546.2697, R^2=0.9342
Improved RMSE=143123.3632, R^2=0.9364


In [25]:
X_sp = df_sp.drop(columns=[sp_target])
y_sp = df_sp[sp_target]

if y_sp.dtype == "object":
    y_map = y_sp.astype(str).str.lower().map({"yes": 1, "no": 0, "true": 1, "false": 0})
    if y_map.isna().any():
        y_sp, classes = pd.factorize(y_sp)
        print("Target factorized:", list(classes))
    else:
        y_sp = y_map

Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X_sp, y_sp, test_size=0.2, random_state=RANDOM_STATE, stratify=y_sp
)

num_cols_sp = Xs_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_sp = [c for c in Xs_train.columns if c not in num_cols_sp]

print("Train:", Xs_train.shape, "Test:", Xs_test.shape)
print("Numeric:", len(num_cols_sp), "Categorical:", len(cat_cols_sp), cat_cols_sp[:10])


Train: (6400, 11) Test: (1600, 11)
Numeric: 7 Categorical: 4 ['gender', 'country', 'subscription_type', 'device_type']


In [26]:
preprocess_sp = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols_sp),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols_sp),
    ],
    remainder="drop"
)

sp_rf_base = Pipeline([
    ("prep", preprocess_sp),
    ("model", RandomForestClassifier(
        n_estimators=300,
        random_state=RANDOM_STATE,
        n_jobs=-1
    ))
])

sp_rf_base.fit(Xs_train, ys_train)
ys_pred_base = sp_rf_base.predict(Xs_test)
ys_proba_base = sp_rf_base.predict_proba(Xs_test)[:, 1]

acc_base = accuracy_score(ys_test, ys_pred_base)
prec_base = precision_score(ys_test, ys_pred_base, zero_division=0)
rec_base = recall_score(ys_test, ys_pred_base, zero_division=0)
f1_base = f1_score(ys_test, ys_pred_base, zero_division=0)
auc_base = roc_auc_score(ys_test, ys_proba_base)

print("=== Spotify RandomForestClassifier baseline ===")
print(f"Accuracy : {acc_base:.4f}")
print(f"Precision: {prec_base:.4f}")
print(f"Recall   : {rec_base:.4f}")
print(f"F1-score : {f1_base:.4f}")
print(f"ROC-AUC  : {auc_base:.4f}")


=== Spotify RandomForestClassifier baseline ===
Accuracy : 0.7419
Precision: 0.6000
Recall   : 0.0072
F1-score : 0.0143
ROC-AUC  : 0.5310


In [27]:
sp_rf_pipe = Pipeline([
    ("prep", preprocess_sp),
    ("model", RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))
])

param_grid_sp_rf = {
    "model__n_estimators": [300, 600],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_leaf": [1, 2, 5, 10],
    "model__max_features": ["sqrt", "log2", None],
    "model__class_weight": [None, "balanced"]
}

cv_sp = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs_sp_rf = GridSearchCV(
    estimator=sp_rf_pipe,
    param_grid=param_grid_sp_rf,
    scoring="f1",
    cv=cv_sp,
    n_jobs=-1
)

gs_sp_rf.fit(Xs_train, ys_train)
print("Best params:", gs_sp_rf.best_params_)
print("Best CV F1:", gs_sp_rf.best_score_)


Best params: {'model__class_weight': 'balanced', 'model__max_depth': 10, 'model__max_features': None, 'model__min_samples_leaf': 10, 'model__n_estimators': 300}
Best CV F1: 0.24528323972751895


In [28]:
sp_rf_best = gs_sp_rf.best_estimator_

ys_pred_imp = sp_rf_best.predict(Xs_test)
ys_proba_imp = sp_rf_best.predict_proba(Xs_test)[:, 1]

acc_imp = accuracy_score(ys_test, ys_pred_imp)
prec_imp = precision_score(ys_test, ys_pred_imp, zero_division=0)
rec_imp = recall_score(ys_test, ys_pred_imp, zero_division=0)
f1_imp = f1_score(ys_test, ys_pred_imp, zero_division=0)
auc_imp = roc_auc_score(ys_test, ys_proba_imp)

print("=== Spotify RandomForestClassifier improved ===")
print(f"Accuracy : {acc_imp:.4f}")
print(f"Precision: {prec_imp:.4f}")
print(f"Recall   : {rec_imp:.4f}")
print(f"F1-score : {f1_imp:.4f}")
print(f"ROC-AUC  : {auc_imp:.4f}")

print("\n=== Spotify comparison ===")
print(f"Baseline F1={f1_base:.4f}, AUC={auc_base:.4f}")
print(f"Improved F1={f1_imp:.4f}, AUC={auc_imp:.4f}")


=== Spotify RandomForestClassifier improved ===
Accuracy : 0.6356
Precision: 0.2735
Recall   : 0.2464
F1-score : 0.2592
ROC-AUC  : 0.5022

=== Spotify comparison ===
Baseline F1=0.0143, AUC=0.5310
Improved F1=0.2592, AUC=0.5022


In [29]:
# --- Минимальное scratch-дерево из ЛР3 (используем как базовый weak learner) ---

class SimpleTreeNode:
    __slots__ = ("feature", "threshold", "left", "right", "value")
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

def mse(y):
    y = np.asarray(y, dtype=float)
    if len(y) == 0:
        return 0.0
    return np.mean((y - y.mean()) ** 2)

def gini(y):
    y = np.asarray(y, dtype=int)
    if len(y) == 0:
        return 0.0
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p * p)

class DecisionTreeRegressorScratch:
    def __init__(self, max_depth=10, min_samples_leaf=5, n_features_try=None):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.n_features_try = n_features_try
        self.root = None

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        self.root = self._build(X, y, 0)
        return self

    def _best_split(self, X, y):
        n, d = X.shape
        features = np.arange(d)
        if self.n_features_try is not None and self.n_features_try < d:
            features = np.random.choice(features, self.n_features_try, replace=False)

        best_f, best_thr, best_score = None, None, float("inf")
        for f in features:
            xs = X[:, f]
            qs = np.unique(np.quantile(xs, [0.1, 0.3, 0.5, 0.7, 0.9]))
            for thr in qs:
                left = xs <= thr
                right = ~left
                if left.sum() < self.min_samples_leaf or right.sum() < self.min_samples_leaf:
                    continue
                score = (left.sum()/n)*mse(y[left]) + (right.sum()/n)*mse(y[right])
                if score < best_score:
                    best_f, best_thr, best_score = f, thr, score
        return best_f, best_thr

    def _build(self, X, y, depth):
        if depth >= self.max_depth or len(y) <= 2*self.min_samples_leaf:
            return SimpleTreeNode(value=float(np.mean(y)))
        f, thr = self._best_split(X, y)
        if f is None:
            return SimpleTreeNode(value=float(np.mean(y)))
        left = X[:, f] <= thr
        right = ~left
        return SimpleTreeNode(
            feature=f, threshold=float(thr),
            left=self._build(X[left], y[left], depth+1),
            right=self._build(X[right], y[right], depth+1)
        )

    def predict_one(self, x):
        node = self.root
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        return np.array([self.predict_one(x) for x in X], dtype=float)

class DecisionTreeClassifierScratch:
    def __init__(self, max_depth=10, min_samples_leaf=5, n_features_try=None):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.n_features_try = n_features_try
        self.root = None

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=int)
        self.root = self._build(X, y, 0)
        return self

    def _best_split(self, X, y):
        n, d = X.shape
        features = np.arange(d)
        if self.n_features_try is not None and self.n_features_try < d:
            features = np.random.choice(features, self.n_features_try, replace=False)

        best_f, best_thr, best_score = None, None, float("inf")
        for f in features:
            xs = X[:, f]
            qs = np.unique(np.quantile(xs, [0.1, 0.3, 0.5, 0.7, 0.9]))
            for thr in qs:
                left = xs <= thr
                right = ~left
                if left.sum() < self.min_samples_leaf or right.sum() < self.min_samples_leaf:
                    continue
                score = (left.sum()/n)*gini(y[left]) + (right.sum()/n)*gini(y[right])
                if score < best_score:
                    best_f, best_thr, best_score = f, thr, score
        return best_f, best_thr

    def _leaf_value(self, y):
        vals, cnt = np.unique(y, return_counts=True)
        return int(vals[np.argmax(cnt)])

    def _build(self, X, y, depth):
        if depth >= self.max_depth or len(y) <= 2*self.min_samples_leaf:
            return SimpleTreeNode(value=self._leaf_value(y))
        f, thr = self._best_split(X, y)
        if f is None:
            return SimpleTreeNode(value=self._leaf_value(y))
        left = X[:, f] <= thr
        right = ~left
        return SimpleTreeNode(
            feature=f, threshold=float(thr),
            left=self._build(X[left], y[left], depth+1),
            right=self._build(X[right], y[right], depth+1)
        )

    def predict_one(self, x):
        node = self.root
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return int(node.value)

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        return np.array([self.predict_one(x) for x in X], dtype=int)

# --- Random Forest scratch ---

class RandomForestRegressorScratch:
    def __init__(self, n_estimators=50, max_depth=10, min_samples_leaf=5, max_features="sqrt", random_state=42):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def _n_features_try(self, d):
        if self.max_features == "sqrt":
            return max(1, int(np.sqrt(d)))
        if self.max_features == "log2":
            return max(1, int(np.log2(d)))
        if self.max_features is None:
            return d
        if isinstance(self.max_features, int):
            return min(d, self.max_features)
        return d

    def fit(self, X, y):
        rng = np.random.RandomState(self.random_state)
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n, d = X.shape
        m = self._n_features_try(d)

        self.trees = []
        for _ in range(self.n_estimators):
            idx = rng.randint(0, n, size=n)  # bootstrap
            tree = DecisionTreeRegressorScratch(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                n_features_try=m
            )
            tree.fit(X[idx], y[idx])
            self.trees.append(tree)
        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        preds = np.stack([t.predict(X) for t in self.trees], axis=0)
        return preds.mean(axis=0)

class RandomForestClassifierScratch:
    def __init__(self, n_estimators=50, max_depth=10, min_samples_leaf=5, max_features="sqrt", random_state=42):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def _n_features_try(self, d):
        if self.max_features == "sqrt":
            return max(1, int(np.sqrt(d)))
        if self.max_features == "log2":
            return max(1, int(np.log2(d)))
        if self.max_features is None:
            return d
        if isinstance(self.max_features, int):
            return min(d, self.max_features)
        return d

    def fit(self, X, y):
        rng = np.random.RandomState(self.random_state)
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=int)
        n, d = X.shape
        m = self._n_features_try(d)

        self.trees = []
        for _ in range(self.n_estimators):
            idx = rng.randint(0, n, size=n)  # bootstrap
            tree = DecisionTreeClassifierScratch(
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                n_features_try=m
            )
            tree.fit(X[idx], y[idx])
            self.trees.append(tree)
        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        votes = np.stack([t.predict(X) for t in self.trees], axis=0)  # (T, n)
        # majority vote
        return (votes.mean(axis=0) >= 0.5).astype(int)


In [30]:
# Walmart: матрицы
Xw_train_mat = preprocess_wm.fit_transform(Xw_train)
Xw_test_mat  = preprocess_wm.transform(Xw_test)

rf_s_reg = RandomForestRegressorScratch(
    n_estimators=50, max_depth=10, min_samples_leaf=5, max_features="sqrt", random_state=RANDOM_STATE
)
rf_s_reg.fit(Xw_train_mat, yw_train)
yw_pred_s_base = rf_s_reg.predict(Xw_test_mat)

mae_s_base = mean_absolute_error(yw_test, yw_pred_s_base)
rmse_s_base = np.sqrt(mean_squared_error(yw_test, yw_pred_s_base))
r2_s_base = r2_score(yw_test, yw_pred_s_base)

print("=== Walmart SCRATCH RF baseline ===")
print(f"MAE : {mae_s_base:.4f}")
print(f"RMSE: {rmse_s_base:.4f}")
print(f"R^2 : {r2_s_base:.4f}")

# Spotify: матрицы
Xs_train_mat = preprocess_sp.fit_transform(Xs_train)
Xs_test_mat  = preprocess_sp.transform(Xs_test)

rf_s_cls = RandomForestClassifierScratch(
    n_estimators=80, max_depth=12, min_samples_leaf=5, max_features="sqrt", random_state=RANDOM_STATE
)
rf_s_cls.fit(Xs_train_mat, ys_train)
ys_pred_s_base = rf_s_cls.predict(Xs_test_mat)

acc_s_base = accuracy_score(ys_test, ys_pred_s_base)
prec_s_base = precision_score(ys_test, ys_pred_s_base, zero_division=0)
rec_s_base = recall_score(ys_test, ys_pred_s_base, zero_division=0)
f1_s_base = f1_score(ys_test, ys_pred_s_base, zero_division=0)

print("\n=== Spotify SCRATCH RF baseline ===")
print(f"Accuracy : {acc_s_base:.4f}")
print(f"Precision: {prec_s_base:.4f}")
print(f"Recall   : {rec_s_base:.4f}")
print(f"F1-score : {f1_s_base:.4f}")


=== Walmart SCRATCH RF baseline ===
MAE : 217874.7803
RMSE: 287625.8380
R^2 : 0.7432

=== Spotify SCRATCH RF baseline ===
Accuracy : 0.7412
Precision: 0.0000
Recall   : 0.0000
F1-score : 0.0000


In [31]:
# Walmart scratch improved
rf_s_reg2 = RandomForestRegressorScratch(
    n_estimators=120, max_depth=14, min_samples_leaf=3, max_features="sqrt", random_state=RANDOM_STATE
)
rf_s_reg2.fit(Xw_train_mat, yw_train)
yw_pred_s_imp = rf_s_reg2.predict(Xw_test_mat)

rmse_s_imp = np.sqrt(mean_squared_error(yw_test, yw_pred_s_imp))
r2_s_imp = r2_score(yw_test, yw_pred_s_imp)

print("=== Walmart SCRATCH RF improved ===")
print(f"RMSE: {rmse_s_imp:.4f}")
print(f"R^2 : {r2_s_imp:.4f}")

# Spotify scratch improved
rf_s_cls2 = RandomForestClassifierScratch(
    n_estimators=150, max_depth=14, min_samples_leaf=3, max_features="sqrt", random_state=RANDOM_STATE
)
rf_s_cls2.fit(Xs_train_mat, ys_train)
ys_pred_s_imp = rf_s_cls2.predict(Xs_test_mat)

f1_s_imp = f1_score(ys_test, ys_pred_s_imp, zero_division=0)
print("\n=== Spotify SCRATCH RF improved ===")
print(f"F1: {f1_s_imp:.4f}")


=== Walmart SCRATCH RF improved ===
RMSE: 213266.1312
R^2 : 0.8588

=== Spotify SCRATCH RF improved ===
F1: 0.0000


In [32]:
print("===== FINAL SUMMARY LAB 4 =====")

print("\nWalmart (Regression):")
print(f"SKLEARN RF baseline  RMSE={rmse_rf_base:.4f}, R2={r2_rf_base:.4f}")
print(f"SKLEARN RF improved  RMSE={rmse_rf_imp:.4f}, R2={r2_rf_imp:.4f}")
print(f"SCRATCH RF baseline  RMSE={rmse_s_base:.4f}, R2={r2_s_base:.4f}")
print(f"SCRATCH RF improved  RMSE={rmse_s_imp:.4f}, R2={r2_s_imp:.4f}")

print("\nSpotify (Classification):")
print(f"SKLEARN RF baseline  F1={f1_base:.4f}, AUC={auc_base:.4f}")
print(f"SKLEARN RF improved  F1={f1_imp:.4f}, AUC={auc_imp:.4f}")
print(f"SCRATCH RF baseline  F1={f1_s_base:.4f}")
print(f"SCRATCH RF improved  F1={f1_s_imp:.4f}")


===== FINAL SUMMARY LAB 4 =====

Walmart (Regression):
SKLEARN RF baseline  RMSE=145546.2697, R2=0.9342
SKLEARN RF improved  RMSE=143123.3632, R2=0.9364
SCRATCH RF baseline  RMSE=287625.8380, R2=0.7432
SCRATCH RF improved  RMSE=213266.1312, R2=0.8588

Spotify (Classification):
SKLEARN RF baseline  F1=0.0143, AUC=0.5310
SKLEARN RF improved  F1=0.2592, AUC=0.5022
SCRATCH RF baseline  F1=0.0000
SCRATCH RF improved  F1=0.0000


В лабораторной работе был исследован случайный лес для задач регрессии и классификации. По сравнению с одним решающим деревом, ансамбль деревьев показал более стабильные результаты за счёт бутстрапа и случайного выбора признаков, что снижает переобучение.

Подбор гиперпараметров (число деревьев, глубина, минимальный размер листа, число признаков) позволил улучшить качество моделей по выбранным метрикам. Реализация “с нуля” показала аналогичные тенденции: увеличение числа деревьев повышает устойчивость и качество, однако также увеличивает время обучения.