In [40]:
import pandas as pd
import numpy as np
import holidays

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate

In [41]:
# Read the training and test tables from the local data directory.
train, test = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

In [62]:
# Keep only the rows whose target value (`num_sold`) is present.
train = train[train['num_sold'].notna()]

In [63]:
# 1. Separate the features from the target

# `num_sold` is the value to predict; it and the `id` column are removed
# from the feature frame.
y = train['num_sold']
X = train.drop(columns=['id', 'num_sold'])

In [64]:
# 2. Date handling

# Parse the raw `date` column once, then swap it for numeric calendar parts
# (year, month, day of month, day of week) appended after the other columns.
parsed_dates = pd.to_datetime(X['date'])
X = X.drop(columns=['date']).assign(
    year=parsed_dates.dt.year,
    month=parsed_dates.dt.month,
    day=parsed_dates.dt.day,
    dayofweek=parsed_dates.dt.dayofweek,
)

In [65]:
# 3. Encode the categorical features (country, store, product)

# Give the test frame the same calendar features as the training frame and
# drop the columns that are not model inputs.
test_dates = pd.to_datetime(test['date'])
test = test.drop(columns=['id', 'date']).assign(
    year=test_dates.dt.year,
    month=test_dates.dt.month,
    day=test_dates.dt.day,
    dayofweek=test_dates.dt.dayofweek,
)

# Pin every categorical column to the categories observed in the training
# data. Encoding train and test independently lets each frame derive its own
# category set, so `drop_first=True` could drop a different baseline level in
# each one and silently shift the meaning of the dummy columns. With a shared
# categorical dtype both frames get identical dummy columns in identical
# order; a test value never seen in training becomes NaN and is encoded as
# all zeros.
# The dtype list mirrors the dtypes that `pd.get_dummies` encodes by default.
categorical_cols = X.select_dtypes(include=['object', 'string', 'category']).columns
for col in categorical_cols:
    train_categories = pd.Categorical(X[col]).categories
    X[col] = pd.Categorical(X[col], categories=train_categories)
    if col in test.columns:
        test[col] = pd.Categorical(test[col], categories=train_categories)

X = pd.get_dummies(X, drop_first=True, dtype=int)
test = pd.get_dummies(test, drop_first=True, dtype=int)

In [66]:
# Align the train/test columns.
# The left join keeps exactly the columns of X: any of them missing from
# `test` is added there filled with 0, and columns that exist only in `test`
# are discarded.
X, test = X.align(
    test,
    join="left",
    axis=1,
    fill_value=0,
)

In [67]:
# 4. Train / validation split
# ---------------------------
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split reproducible.
# NOTE(review): the rows are split at random although the features are derived
# from a date, so held-out rows are interleaved in time with training rows.
# Consider a chronological hold-out if the goal is forecasting — confirm.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [68]:
# 5. Models

# Linear regression: fit on the training part, score on the held-out part.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression RMSE:", rmse_lr)
print("Linear Regression R2:", r2_lr)

Linear Regression RMSE: 312.074527088727
Linear Regression R2: 0.796235961004736


## Использую Ridge-регрессию и градиентный бустинг

In [72]:
# Ridge regression (L2-regularised linear model) evaluated on the same
# held-out part as the other models.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)
print("Ridge RMSE:", rmse_ridge)
print("Ridge R2:", r2_ridge)

Ridge RMSE: 312.07456147507696
Ridge R2: 0.7962359161007062


In [74]:
# Gradient boosting with default hyper-parameters; the seed keeps the fit
# reproducible.
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))
r2_gb = r2_score(y_test, y_pred_gb)
print("GBR RMSE:", rmse_gb)
print("GBR R2:", r2_gb)

GBR RMSE: 167.17151632329544
GBR R2: 0.941529681178229


## Добавление submission.csv

In [78]:
# Final model (Gradient Boosting here), refit on every labelled row.
final_model = GradientBoostingRegressor(random_state=42).fit(X, y)

# Predict the target for the prepared test features.
test_pred = final_model.predict(test)

# `id` was dropped from `test` during preprocessing, so re-read the original
# file to recover it; the predictions are in the same row order.
test_original = pd.read_csv("data/test.csv")

# Build the submission table: one row per test id with its prediction.
submission = test_original[["id"]].assign(num_sold=test_pred)

# Write it without the index column.
submission.to_csv("submission.csv", index=False)

print("submission.csv сохранён!")


submission.csv сохранён!
