# ЛР 2 - линейная и логистическая регрессии

In [14]:
import kagglehub
from pathlib import Path

In [15]:
path = kagglehub.dataset_download("uciml/forest-cover-type-dataset")
print("Path to dataset files:", path)

data_dir = Path(path)
list(data_dir.glob("**/*"))[:20]

Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/uciml/forest-cover-type-dataset/versions/1


[PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/uciml/forest-cover-type-dataset/versions/1/covtype.csv')]

In [16]:
csv_files = list(data_dir.glob("**/*.csv"))
df = pd.read_csv(csv_files[0])
df.shape, df.head()

((581012, 55),
    Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
 0       2596      51      3                               258   
 1       2590      56      2                               212   
 2       2804     139      9                               268   
 3       2785     155     18                               242   
 4       2595      45      2                               153   
 
    Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
 0                               0                              510   
 1                              -6                              390   
 2                              65                             3180   
 3                             118                             3090   
 4                              -1                              391   
 
    Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
 0            221             232            148   
 1            220             235            151   
 2 

### Данные разделены на обучающую и тестовую выборки со стратификацией по классам

In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

target_col = "Cover_Type"

X = df.drop(columns=[target_col])
y = df[target_col]

MAX_ROWS = 90_000
if len(df) > MAX_ROWS:
    df_sample = df.sample(n=MAX_ROWS, random_state=42)
    X = df_sample.drop(columns=[target_col])
    y = df_sample[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape, y_train.value_counts().sort_index()

((72000, 54),
 (18000, 54),
 Cover_Type
 1    26377
 2    35049
 3     4418
 4      320
 5     1213
 6     2121
 7     2502
 Name: count, dtype: int64)

### Построен бейзлайн логистической регрессии с масштабированием признаков

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

clf_lr_baseline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(
        max_iter=2000,
        n_jobs=-1,
        random_state=42
    ))
])

clf_lr_baseline

### Качество оценено метриками Accuracy и Macro-F1

In [23]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

clf_lr_baseline.fit(X_train, y_train)
pred = clf_lr_baseline.predict(X_test)

acc = accuracy_score(y_test, pred)
f1m = f1_score(y_test, pred, average="macro")

print(f"LogReg baseline Accuracy: {acc:.4f}")
print(f"LogReg baseline Macro-F1: {f1m:.4f}")
print("\nClassification report:\n", classification_report(y_test, pred))

LogReg baseline Accuracy: 0.7248
LogReg baseline Macro-F1: 0.5273

Classification report:
               precision    recall  f1-score   support

           1       0.71      0.70      0.71      6595
           2       0.75      0.80      0.77      8762
           3       0.67      0.82      0.74      1104
           4       0.68      0.40      0.50        80
           5       0.11      0.00      0.01       304
           6       0.51      0.24      0.32       530
           7       0.71      0.58      0.64       625

    accuracy                           0.72     18000
   macro avg       0.59      0.51      0.53     18000
weighted avg       0.71      0.72      0.71     18000



### Улучшение GridSearchCV

In [24]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "lr__C": [0.01, 0.1, 1.0, 5.0, 10.0],
    "lr__penalty": ["l2"],
    "lr__solver": ["lbfgs", "saga"],
}

grid = GridSearchCV(
    estimator=clf_lr_baseline,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV Macro-F1:", grid.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




Best params: {'lr__C': 10.0, 'lr__penalty': 'l2', 'lr__solver': 'lbfgs'}
Best CV Macro-F1: 0.5273986128892395


### Оценка улучшенной модели
Лучшие параметры оценены на тестовой выборке и сопоставлены с бейзлайном

In [27]:
best_lr = grid.best_estimator_
pred_best = best_lr.predict(X_test)

acc_best = accuracy_score(y_test, pred_best)
f1m_best = f1_score(y_test, pred_best, average="macro")

print(f"LogReg improved Accuracy: {acc_best:.4f}")
print(f"LogReg improved Macro-F1: {f1m_best:.4f}")
print("\nClassification report:\n", classification_report(y_test, pred_best))

LogReg improved Accuracy: 0.7251
LogReg improved Macro-F1: 0.5287

Classification report:
               precision    recall  f1-score   support

           1       0.71      0.70      0.71      6595
           2       0.75      0.80      0.77      8762
           3       0.68      0.82      0.74      1104
           4       0.66      0.41      0.51        80
           5       0.11      0.00      0.01       304
           6       0.51      0.24      0.32       530
           7       0.71      0.59      0.64       625

    accuracy                           0.73     18000
   macro avg       0.59      0.51      0.53     18000
weighted avg       0.71      0.73      0.71     18000



## Регрессия: Linear Regression (Car Prices)

In [28]:
path = kagglehub.dataset_download("sidharth178/car-prices-dataset")
print("Path to dataset files:", path)

data_dir = Path(path)
list(data_dir.glob("**/*"))[:30]

Path to dataset files: /Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1


[PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/test.csv'),
 PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/train.csv')]

In [29]:
csv_files = list(data_dir.glob("**/*.csv"))
csv_files

[PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/test.csv'),
 PosixPath('/Users/rkoyunusov/.cache/kagglehub/datasets/sidharth178/car-prices-dataset/versions/1/train.csv')]

In [30]:
df = pd.read_csv(csv_files[1])
df.shape, df.head()

((19237, 18),
          ID  Price  Levy Manufacturer    Model  Prod. year   Category  \
 0  45654403  13328  1399        LEXUS   RX 450        2010       Jeep   
 1  44731507  16621  1018    CHEVROLET  Equinox        2011       Jeep   
 2  45774419   8467     -        HONDA      FIT        2006  Hatchback   
 3  45769185   3607   862         FORD   Escape        2011       Jeep   
 4  45809263  11726   446        HONDA      FIT        2014  Hatchback   
 
   Leather interior Fuel type Engine volume    Mileage  Cylinders  \
 0              Yes    Hybrid           3.5  186005 km        6.0   
 1               No    Petrol             3  192000 km        6.0   
 2               No    Petrol           1.3  200000 km        4.0   
 3              Yes    Hybrid           2.5  168966 km        4.0   
 4              Yes    Petrol           1.3   91901 km        4.0   
 
   Gear box type Drive wheels   Doors             Wheel   Color  Airbags  
 0     Automatic          4x4  04-May        Left

In [31]:
target_col = "Price"
df = df.dropna(subset=[target_col]).copy()

X = df.drop(columns=[target_col])
y = df[target_col]

df.shape, X.shape, y.shape

((19237, 18), (19237, 17), (19237,))

### Парсинг и загрузка взяты из ЛР1

In [32]:
X["Mileage"] = (
    X["Mileage"].astype(str)
    .str.replace(" km", "", regex=False)
)
X["Mileage"] = pd.to_numeric(X["Mileage"], errors="coerce")

X["Engine volume"] = (
    X["Engine volume"].astype(str)
    .str.extract(r"([\d\.]+)")[0]
)
X["Engine volume"] = pd.to_numeric(X["Engine volume"], errors="coerce")

mask = X.notna().all(axis=1)
X = X.loc[mask]
y = y.loc[mask]

X.shape, y.shape

((19237, 17), (19237,))

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape

((15389, 17), (3848, 17))

### Препроцессинг

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("scaler", StandardScaler())]), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop"
)

len(num_cols), len(cat_cols)

(6, 11)

### Baseline LinearRegression
Построен бейзлайн линейной регрессии с масштабированием числовых признаков и one-hot кодированием категориальных

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

linreg_baseline = Pipeline(steps=[
    ("prep", preprocess),
    ("lr", LinearRegression())
])

linreg_baseline.fit(X_train, y_train)
pred = linreg_baseline.predict(X_test)

mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))

print(f"LinearRegression baseline MAE:  {mae:.4f}")
print(f"LinearRegression baseline RMSE: {rmse:.4f}")

LinearRegression baseline MAE:  16160.5612
LinearRegression baseline RMSE: 59989.6301


### Регуляризация

In [39]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "lr": [Ridge()],
        "lr__alpha": [0.1, 1.0, 10.0, 50.0]
    },
    {
        "lr": [Lasso(max_iter=5000)],
        "lr__alpha": [0.001, 0.01, 0.1]
    },
    {
        "lr": [ElasticNet(max_iter=5000)],
        "lr__alpha": [0.001, 0.01, 0.1],
        "lr__l1_ratio": [0.2, 0.5, 0.8]
    }
]

grid = GridSearchCV(
    estimator=Pipeline([("prep", preprocess), ("lr", Ridge())]),
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("Best estimator:", type(grid.best_estimator_.named_steps["lr"]).__name__)
print("Best params:", grid.best_params_)
print("Best CV MAE:", -grid.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Best estimator: ElasticNet
Best params: {'lr': ElasticNet(max_iter=5000), 'lr__alpha': 0.1, 'lr__l1_ratio': 0.2}
Best CV MAE: 13397.732286103615


### Оценка улучшенной модели

In [42]:
best_linreg = grid.best_estimator_
pred_best = best_linreg.predict(X_test)

mae_best = mean_absolute_error(y_test, pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, pred_best))

print(f"Linear (improved) MAE:  {mae_best:.4f}")
print(f"Linear (improved) RMSE: {rmse_best:.4f}")

Linear (improved) MAE:  11087.1567
Linear (improved) RMSE: 16845.6116


### Сводная таблица

In [43]:
summary_lr2_reg = pd.DataFrame({
    "Model": ["LinearRegression (baseline)", "Linear model (improved)"],
    "MAE": [mae, mae_best],
    "RMSE": [rmse, rmse_best],
})

summary_lr2_reg

Unnamed: 0,Model,MAE,RMSE
0,LinearRegression (baseline),16160.56122,59989.630109
1,Linear model (improved),11087.156655,16845.611578


Бейзлайновая линейная регрессия показала низкое качество, что обусловлено сложной и нелинейной зависимостью между признаками и стоимостью автомобиля. Использование регуляризации позволило существенно улучшить результаты и снизить влияние мультиколлинеарности и выбросов. Тем не менее, по качеству линейные модели уступают алгоритму KNN, что подтверждает преимущество нелинейных методов для данной задачи