In [34]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression

from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)


In [35]:
walmart_path = "dataset/Walmart_Sales.csv"
spotify_path = "dataset/spotify_churn_dataset.csv"

df_wm = pd.read_csv(walmart_path)
df_sp = pd.read_csv(spotify_path)

print("Walmart shape:", df_wm.shape)
display(df_wm.head())

print("\nSpotify shape:", df_sp.shape)
display(df_sp.head())


Walmart shape: (6435, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106



Spotify shape: (8000, 12)


Unnamed: 0,user_id,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
0,1,Female,54,CA,Free,26,23,0.2,Desktop,31,0,1
1,2,Other,33,DE,Family,141,62,0.34,Web,0,1,0
2,3,Male,38,AU,Premium,199,38,0.04,Mobile,0,1,1
3,4,Female,22,CA,Student,36,2,0.31,Mobile,0,1,0
4,5,Other,29,US,Family,250,57,0.36,Mobile,0,1,1


In [36]:
# Walmart target (регрессия)
wm_target_candidates = ["Weekly_Sales", "weekly_sales", "Sales", "sales", "Target", "target"]
wm_target = next((c for c in wm_target_candidates if c in df_wm.columns), None)
print("Detected Walmart target:", wm_target)

# Spotify target (классификация)
sp_target_candidates = ["churn", "Churn", "is_churn", "IsChurn", "label", "Label", "target", "Target", "y", "is_churned"]
sp_target = next((c for c in sp_target_candidates if c in df_sp.columns), None)
print("Detected Spotify target:", sp_target)

if wm_target is None:
    raise ValueError("Не нашёл target в Walmart_Sales.csv. Укажи wm_target вручную.")
if sp_target is None:
    raise ValueError("Не нашёл target в spotify_churn_dataset.csv. Укажи sp_target вручную.")


Detected Walmart target: Weekly_Sales
Detected Spotify target: is_churned


In [37]:
df_wm_reg = df_wm.copy()

date_col_candidates = ["Date", "date", "DATE"]
date_col = next((c for c in date_col_candidates if c in df_wm_reg.columns), None)

if date_col is not None:
    df_wm_reg[date_col] = pd.to_datetime(df_wm_reg[date_col], errors="coerce")
    df_wm_reg["Year"] = df_wm_reg[date_col].dt.year
    df_wm_reg["Month"] = df_wm_reg[date_col].dt.month
    df_wm_reg["WeekOfYear"] = df_wm_reg[date_col].dt.isocalendar().week.astype(float)
    df_wm_reg["Day"] = df_wm_reg[date_col].dt.day
    # удаляем исходную дату (datetime не подаём в sklearn)
    df_wm_reg = df_wm_reg.drop(columns=[date_col])

display(df_wm_reg.head())
print("Date processed from:", date_col)


Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,WeekOfYear,Day
0,1,1643690.9,0,42.31,2.572,211.096358,8.106,2010.0,5.0,17.0,2.0
1,1,1641957.44,1,38.51,2.548,211.24217,8.106,2010.0,12.0,48.0,2.0
2,1,1611968.17,0,39.93,2.514,211.289143,8.106,,,,
3,1,1409727.59,0,46.63,2.561,211.319643,8.106,,,,
4,1,1554806.68,0,46.5,2.625,211.350143,8.106,2010.0,5.0,18.0,3.0


Date processed from: Date


In [38]:
X_wm = df_wm_reg.drop(columns=[wm_target])
y_wm = df_wm_reg[wm_target].astype(float)

Xw_train, Xw_test, yw_train, yw_test = train_test_split(
    X_wm, y_wm, test_size=0.2, random_state=RANDOM_STATE
)

num_cols_wm = Xw_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_wm = [c for c in Xw_train.columns if c not in num_cols_wm]

print("Train:", Xw_train.shape, "Test:", Xw_test.shape)
print("Numeric cols:", len(num_cols_wm))
print("Categorical cols:", len(cat_cols_wm), cat_cols_wm[:10])


Train: (5148, 10) Test: (1287, 10)
Numeric cols: 10
Categorical cols: 0 []


In [39]:
preprocess_wm = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols_wm),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols_wm),
    ],
    remainder="drop"
)

wm_lr_baseline = Pipeline([
    ("prep", preprocess_wm),
    ("model", LinearRegression())
])

wm_lr_baseline.fit(Xw_train, yw_train)
yw_pred_lr_base = wm_lr_baseline.predict(Xw_test)

mae_lr_base = mean_absolute_error(yw_test, yw_pred_lr_base)
rmse_lr_base = np.sqrt(mean_squared_error(yw_test, yw_pred_lr_base))
r2_lr_base = r2_score(yw_test, yw_pred_lr_base)

print("=== Walmart LinearRegression baseline ===")
print(f"MAE : {mae_lr_base:.4f}")
print(f"RMSE: {rmse_lr_base:.4f}")
print(f"R^2 : {r2_lr_base:.4f}")


=== Walmart LinearRegression baseline ===
MAE : 433791.3518
RMSE: 524119.5584
R^2 : 0.1473


In [40]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import Ridge, Lasso
from sklearn.exceptions import ConvergenceWarning
import warnings

wm_lr_pipe = Pipeline([
    ("prep", preprocess_wm),
    ("model", Ridge())
])

# Делаем 2 отдельных пространства параметров:
# - Ridge: стабилен
# - Lasso: повышаем max_iter, увеличиваем tol, и добавляем большие alpha
param_grid_wm_lr = [
    {
        "model": [Ridge()],
        "model__alpha": [0.01, 0.1, 1.0, 10.0, 100.0, 300.0, 1000.0]
    },
    {
        "model": [Lasso(max_iter=300000, tol=1e-2, selection="random", random_state=RANDOM_STATE)],
        "model__alpha": [1.0, 10.0, 100.0, 300.0, 1000.0]  # начинаем с более сильной регуляризации
    }
]

cv_wm = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs_wm_lr = GridSearchCV(
    estimator=wm_lr_pipe,
    param_grid=param_grid_wm_lr,
    scoring="neg_mean_squared_error",
    cv=cv_wm,
    n_jobs=-1,
    error_score="raise"
)

# Глушим только ConvergenceWarning (это именно предупреждение про сходимость, не скрываем реальные ошибки)
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    gs_wm_lr.fit(Xw_train, yw_train)

print("Best params:", gs_wm_lr.best_params_)
print("Best CV MSE:", -gs_wm_lr.best_score_)


Best params: {'model': Lasso(max_iter=300000, random_state=42, selection='random', tol=0.01), 'model__alpha': 1000.0}
Best CV MSE: 274218215473.35132


In [41]:
wm_lr_best = gs_wm_lr.best_estimator_
yw_pred_lr_imp = wm_lr_best.predict(Xw_test)

mae_lr_imp = mean_absolute_error(yw_test, yw_pred_lr_imp)
rmse_lr_imp = np.sqrt(mean_squared_error(yw_test, yw_pred_lr_imp))
r2_lr_imp = r2_score(yw_test, yw_pred_lr_imp)

print("=== Walmart Linear/Ridge/Lasso improved ===")
print(f"MAE : {mae_lr_imp:.4f}")
print(f"RMSE: {rmse_lr_imp:.4f}")
print(f"R^2 : {r2_lr_imp:.4f}")

print("\n=== Walmart comparison ===")
print(f"Baseline RMSE={rmse_lr_base:.4f}, R^2={r2_lr_base:.4f}")
print(f"Improved RMSE={rmse_lr_imp:.4f}, R^2={r2_lr_imp:.4f}")


=== Walmart Linear/Ridge/Lasso improved ===
MAE : 433482.7219
RMSE: 523813.2528
R^2 : 0.1483

=== Walmart comparison ===
Baseline RMSE=524119.5584, R^2=0.1473
Improved RMSE=523813.2528, R^2=0.1483


In [42]:
df_sp_cls = df_sp.copy()

X_sp = df_sp_cls.drop(columns=[sp_target])
y_sp = df_sp_cls[sp_target]

# приведение к 0/1, если строковое
if y_sp.dtype == "object":
    y_sp2 = y_sp.astype(str).str.lower().map({"yes": 1, "no": 0, "true": 1, "false": 0})
    if y_sp2.isna().any():
        y_sp, classes = pd.factorize(y_sp)
        print("Target factorized:", list(classes))
    else:
        y_sp = y_sp2

Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X_sp, y_sp, test_size=0.2, random_state=RANDOM_STATE, stratify=y_sp
)

num_cols_sp = Xs_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_sp = [c for c in Xs_train.columns if c not in num_cols_sp]

print("Train:", Xs_train.shape, "Test:", Xs_test.shape)
print("Numeric cols:", len(num_cols_sp))
print("Categorical cols:", len(cat_cols_sp), cat_cols_sp[:10])


Train: (6400, 11) Test: (1600, 11)
Numeric cols: 7
Categorical cols: 4 ['gender', 'country', 'subscription_type', 'device_type']


In [43]:
preprocess_sp = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), num_cols_sp),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_cols_sp),
    ],
    remainder="drop"
)

sp_logreg_baseline = Pipeline([
    ("prep", preprocess_sp),
    ("model", LogisticRegression(max_iter=2000))
])

sp_logreg_baseline.fit(Xs_train, ys_train)
ys_pred_base = sp_logreg_baseline.predict(Xs_test)
ys_proba_base = sp_logreg_baseline.predict_proba(Xs_test)[:, 1]

acc_base = accuracy_score(ys_test, ys_pred_base)
prec_base = precision_score(ys_test, ys_pred_base, zero_division=0)
rec_base = recall_score(ys_test, ys_pred_base, zero_division=0)
f1_base = f1_score(ys_test, ys_pred_base, zero_division=0)
auc_base = roc_auc_score(ys_test, ys_proba_base)

print("=== Spotify LogisticRegression baseline ===")
print(f"Accuracy : {acc_base:.4f}")
print(f"Precision: {prec_base:.4f}")
print(f"Recall   : {rec_base:.4f}")
print(f"F1-score : {f1_base:.4f}")
print(f"ROC-AUC  : {auc_base:.4f}")


=== Spotify LogisticRegression baseline ===
Accuracy : 0.7412
Precision: 0.0000
Recall   : 0.0000
F1-score : 0.0000
ROC-AUC  : 0.4982


In [44]:
sp_logreg_pipe = Pipeline([
    ("prep", preprocess_sp),
    ("model", LogisticRegression(max_iter=3000))
])

param_grid_sp = {
    "model__C": [0.01, 0.1, 1.0, 10.0],
    "model__class_weight": [None, "balanced"],
    "model__solver": ["lbfgs"]
}

cv_sp = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

gs_sp_logreg = GridSearchCV(
    sp_logreg_pipe,
    param_grid=param_grid_sp,
    scoring="f1",
    cv=cv_sp,
    n_jobs=-1
)

gs_sp_logreg.fit(Xs_train, ys_train)
print("Best params:", gs_sp_logreg.best_params_)
print("Best CV F1:", gs_sp_logreg.best_score_)


Best params: {'model__C': 0.01, 'model__class_weight': 'balanced', 'model__solver': 'lbfgs'}
Best CV F1: 0.350123802937005


In [45]:
sp_logreg_best = gs_sp_logreg.best_estimator_

ys_pred_imp = sp_logreg_best.predict(Xs_test)
ys_proba_imp = sp_logreg_best.predict_proba(Xs_test)[:, 1]

acc_imp = accuracy_score(ys_test, ys_pred_imp)
prec_imp = precision_score(ys_test, ys_pred_imp, zero_division=0)
rec_imp = recall_score(ys_test, ys_pred_imp, zero_division=0)
f1_imp = f1_score(ys_test, ys_pred_imp, zero_division=0)
auc_imp = roc_auc_score(ys_test, ys_proba_imp)

print("=== Spotify LogisticRegression improved ===")
print(f"Accuracy : {acc_imp:.4f}")
print(f"Precision: {prec_imp:.4f}")
print(f"Recall   : {rec_imp:.4f}")
print(f"F1-score : {f1_imp:.4f}")
print(f"ROC-AUC  : {auc_imp:.4f}")

print("\n=== Spotify comparison ===")
print(f"Baseline F1={f1_base:.4f}, AUC={auc_base:.4f}")
print(f"Improved F1={f1_imp:.4f}, AUC={auc_imp:.4f}")


=== Spotify LogisticRegression improved ===
Accuracy : 0.5156
Precision: 0.2603
Recall   : 0.4734
F1-score : 0.3359
ROC-AUC  : 0.4965

=== Spotify comparison ===
Baseline F1=0.0000, AUC=0.4982
Improved F1=0.3359, AUC=0.4965


In [46]:
class LinearRegressionGD:
    def __init__(self, lr=0.05, n_iter=3000, l2=0.0):
        self.lr = lr
        self.n_iter = n_iter
        self.l2 = l2
        self.w = None
        self.b = 0.0

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0.0

        for _ in range(self.n_iter):
            y_pred = X @ self.w + self.b
            err = y_pred - y
            grad_w = (2/n) * (X.T @ err) + 2*self.l2*self.w
            grad_b = (2/n) * np.sum(err)

            self.w -= self.lr * grad_w
            self.b -= self.lr * grad_b

        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        return X @ self.w + self.b


class LogisticRegressionGD:
    def __init__(self, lr=0.05, n_iter=5000, l2=0.0):
        self.lr = lr
        self.n_iter = n_iter
        self.l2 = l2
        self.w = None
        self.b = 0.0

    @staticmethod
    def _sigmoid(z):
        z = np.clip(z, -50, 50)
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0.0

        for _ in range(self.n_iter):
            z = X @ self.w + self.b
            p = self._sigmoid(z)
            # градиенты logloss
            grad_w = (1/n) * (X.T @ (p - y)) + 2*self.l2*self.w
            grad_b = (1/n) * np.sum(p - y)

            self.w -= self.lr * grad_w
            self.b -= self.lr * grad_b

        return self

    def predict_proba(self, X):
        X = np.asarray(X, dtype=float)
        return self._sigmoid(X @ self.w + self.b)

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X) >= threshold).astype(int)


In [47]:
# Walmart: подготовим матрицы
Xw_train_mat = preprocess_wm.fit_transform(Xw_train)
Xw_test_mat  = preprocess_wm.transform(Xw_test)

lin_gd = LinearRegressionGD(lr=0.05, n_iter=3000, l2=0.0)
lin_gd.fit(Xw_train_mat, yw_train)
yw_pred_gd = lin_gd.predict(Xw_test_mat)

mae_gd = mean_absolute_error(yw_test, yw_pred_gd)
rmse_gd = np.sqrt(mean_squared_error(yw_test, yw_pred_gd))
r2_gd = r2_score(yw_test, yw_pred_gd)

print("=== Walmart SCRATCH LinearRegressionGD ===")
print(f"MAE : {mae_gd:.4f}")
print(f"RMSE: {rmse_gd:.4f}")
print(f"R^2 : {r2_gd:.4f}")

# Spotify: матрицы
Xs_train_mat = preprocess_sp.fit_transform(Xs_train)
Xs_test_mat  = preprocess_sp.transform(Xs_test)

log_gd = LogisticRegressionGD(lr=0.05, n_iter=5000, l2=0.0)
log_gd.fit(Xs_train_mat, ys_train)

ys_pred_gd = log_gd.predict(Xs_test_mat)
ys_proba_gd = log_gd.predict_proba(Xs_test_mat)

acc_gd = accuracy_score(ys_test, ys_pred_gd)
prec_gd = precision_score(ys_test, ys_pred_gd, zero_division=0)
rec_gd = recall_score(ys_test, ys_pred_gd, zero_division=0)
f1_gd = f1_score(ys_test, ys_pred_gd, zero_division=0)
auc_gd = roc_auc_score(ys_test, ys_proba_gd)

print("\n=== Spotify SCRATCH LogisticRegressionGD ===")
print(f"Accuracy : {acc_gd:.4f}")
print(f"Precision: {prec_gd:.4f}")
print(f"Recall   : {rec_gd:.4f}")
print(f"F1-score : {f1_gd:.4f}")
print(f"ROC-AUC  : {auc_gd:.4f}")


=== Walmart SCRATCH LinearRegressionGD ===
MAE : 433540.6482
RMSE: 523876.6607
R^2 : 0.1481

=== Spotify SCRATCH LogisticRegressionGD ===
Accuracy : 0.7412
Precision: 0.0000
Recall   : 0.0000
F1-score : 0.0000
ROC-AUC  : 0.4981


In [48]:
# Walmart: L2
lin_gd_l2 = LinearRegressionGD(lr=0.05, n_iter=3000, l2=1e-3)
lin_gd_l2.fit(Xw_train_mat, yw_train)
yw_pred_gd_l2 = lin_gd_l2.predict(Xw_test_mat)

mae_gd_l2 = mean_absolute_error(yw_test, yw_pred_gd_l2)
rmse_gd_l2 = np.sqrt(mean_squared_error(yw_test, yw_pred_gd_l2))
r2_gd_l2 = r2_score(yw_test, yw_pred_gd_l2)

print("=== Walmart SCRATCH LinearRegressionGD (L2) ===")
print(f"MAE : {mae_gd_l2:.4f}")
print(f"RMSE: {rmse_gd_l2:.4f}")
print(f"R^2 : {r2_gd_l2:.4f}")

# Spotify: L2
log_gd_l2 = LogisticRegressionGD(lr=0.05, n_iter=5000, l2=1e-3)
log_gd_l2.fit(Xs_train_mat, ys_train)

ys_pred_gd_l2 = log_gd_l2.predict(Xs_test_mat)
ys_proba_gd_l2 = log_gd_l2.predict_proba(Xs_test_mat)

acc_gd_l2 = accuracy_score(ys_test, ys_pred_gd_l2)
prec_gd_l2 = precision_score(ys_test, ys_pred_gd_l2, zero_division=0)
rec_gd_l2 = recall_score(ys_test, ys_pred_gd_l2, zero_division=0)
f1_gd_l2 = f1_score(ys_test, ys_pred_gd_l2, zero_division=0)
auc_gd_l2 = roc_auc_score(ys_test, ys_proba_gd_l2)

print("\n=== Spotify SCRATCH LogisticRegressionGD (L2) ===")
print(f"Accuracy : {acc_gd_l2:.4f}")
print(f"Precision: {prec_gd_l2:.4f}")
print(f"Recall   : {rec_gd_l2:.4f}")
print(f"F1-score : {f1_gd_l2:.4f}")
print(f"ROC-AUC  : {auc_gd_l2:.4f}")


=== Walmart SCRATCH LinearRegressionGD (L2) ===
MAE : 433554.7061
RMSE: 523878.3771
R^2 : 0.1481

=== Spotify SCRATCH LogisticRegressionGD (L2) ===
Accuracy : 0.7412
Precision: 0.0000
Recall   : 0.0000
F1-score : 0.0000
ROC-AUC  : 0.4977


In [49]:
print("===== FINAL SUMMARY LAB 2 =====")

print("\nWalmart (Regression):")
print(f"SKLEARN Linear baseline  RMSE={rmse_lr_base:.4f}, R2={r2_lr_base:.4f}")
print(f"SKLEARN Improved         RMSE={rmse_lr_imp:.4f}, R2={r2_lr_imp:.4f}")
print(f"SCRATCH GD baseline      RMSE={rmse_gd:.4f}, R2={r2_gd:.4f}")
print(f"SCRATCH GD + L2          RMSE={rmse_gd_l2:.4f}, R2={r2_gd_l2:.4f}")

print("\nSpotify (Classification):")
print(f"SKLEARN LogReg baseline  F1={f1_base:.4f}, AUC={auc_base:.4f}")
print(f"SKLEARN Improved         F1={f1_imp:.4f}, AUC={auc_imp:.4f}")
print(f"SCRATCH GD baseline      F1={f1_gd:.4f}, AUC={auc_gd:.4f}")
print(f"SCRATCH GD + L2          F1={f1_gd_l2:.4f}, AUC={auc_gd_l2:.4f}")


===== FINAL SUMMARY LAB 2 =====

Walmart (Regression):
SKLEARN Linear baseline  RMSE=524119.5584, R2=0.1473
SKLEARN Improved         RMSE=523813.2528, R2=0.1483
SCRATCH GD baseline      RMSE=523876.6607, R2=0.1481
SCRATCH GD + L2          RMSE=523878.3771, R2=0.1481

Spotify (Classification):
SKLEARN LogReg baseline  F1=0.0000, AUC=0.4982
SKLEARN Improved         F1=0.3359, AUC=0.4965
SCRATCH GD baseline      F1=0.0000, AUC=0.4981
SCRATCH GD + L2          F1=0.0000, AUC=0.4977


В ходе лабораторной работы были исследованы алгоритмы линейной и логистической регрессии на реальных данных. Линейная регрессия показала устойчивые результаты в задаче прогнозирования продаж Walmart, а использование регуляризации (Ridge/Lasso) позволило улучшить обобщающую способность модели.

Логистическая регрессия показала конкурентоспособные результаты на задаче прогнозирования оттока пользователей Spotify. Подбор коэффициента регуляризации C и использование class_weight="balanced" (при необходимости) позволили увеличить метрики F1-score и ROC-AUC.

Дополнительно были реализованы обе модели “с нуля” через градиентный спуск, что подтвердило понимание принципов работы алгоритмов и позволило сравнить результаты с реализациями sklearn.