In [None]:
# Импорт библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
import xgboost as xgb


In [None]:
# Load the competition training set from the Kaggle input directory
train_path = '/kaggle/input/playground-series-s5e10/train.csv'
train_df = pd.read_csv(train_path)

# Report the (rows, columns) shape, then announce the preview below
print(f"Размер датасета: {train_df.shape}", "\nПервые строки:", sep="\n")
# Bare last expression: the notebook renders the first rows as the cell output
train_df.head()

In [None]:
# Target analysis: histogram of `accident_risk` followed by min / max / mean
target = train_df['accident_risk']

# A single panel is requested, so `axes` is one Axes object (not an array)
fig, axes = plt.subplots(figsize=(14, 5))
axes.hist(target, bins=50, edgecolor='black')
axes.set(
    title='Распределение вероятности аварии',
    xlabel='Вероятность аварии',
    ylabel='Частота',
)

fig.tight_layout()
plt.show()

# Summary statistics of the target; only the mean is rounded (4 decimals)
target_min = target.min()
target_max = target.max()
target_mean = target.mean()
print(f"Минимум: {target_min}")
print(f"Максимум: {target_max}")
print(f"Среднее: {target_mean:.4f}")

In [None]:
# Separate the target from the predictors; `id` is dropped from the features
y = train_df['accident_risk']
X = train_df.drop(columns=['id', 'accident_risk'])

# Partition the feature columns by dtype:
#   object / bool   -> categorical
#   int64 / float64 -> numerical
categorical_features = list(X.select_dtypes(include=['object', 'bool']).columns)
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

print("Типы признаков:")
print(X.dtypes)
print("\nКатегориальные признаки:")
print(categorical_features)
print("\nЧисловые признаки:")
print(numerical_features)

In [None]:
# Train/validation split and per-model feature preparation
# 80% train / 20% validation, with a fixed seed so the split is reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# CatBoost: categorical columns are cast to str (missing values become 'nan')
cat_str_dtypes = {col: str for col in categorical_features}
X_train_cb = X_train.astype(cat_str_dtypes)
X_val_cb = X_val.astype(cat_str_dtypes)

# XGBoost: label-encode the categorical columns into integer codes
X_train_xgb = X_train.copy()
X_val_xgb = X_val.copy()
label_encoders = {}
for col in categorical_features:
    # Fit on the full column (train + validation rows) rather than on the train
    # split only: LabelEncoder.transform raises ValueError on labels it has not
    # seen, so a category that landed only in the validation split would crash
    # the cell. Only the category vocabulary is used here, never the target.
    # When every category is present in the train split the codes are identical
    # to those produced by fitting on the train split alone.
    le = LabelEncoder().fit(X[col].astype(str))
    X_train_xgb[col] = le.transform(X_train_xgb[col].astype(str))
    X_val_xgb[col] = le.transform(X_val_xgb[col].astype(str))
    # Keep the fitted encoder so the test set can be encoded the same way
    label_encoders[col] = le

print("Train/Val размеры:", X_train.shape, X_val.shape)


In [None]:
# Train CatBoost on the string-cast categorical features
# NOTE(review): CatBoost documents `bagging_temperature` for the Bayesian
# bootstrap and `subsample` for the Poisson/Bernoulli/MVS bootstraps; no
# `bootstrap_type` is set here - confirm both options are accepted together.
cat_params = dict(
    iterations=2000,
    learning_rate=0.1,
    depth=7,
    grow_policy='Depthwise',
    l2_leaf_reg=3,
    subsample=0.9,
    bagging_temperature=0.5,
    early_stopping_rounds=100,
    cat_features=categorical_features,
    random_seed=42,
    verbose=100,  # log every 100th iteration
)
cat_model = CatBoostRegressor(**cat_params)

print("[CatBoost] Обучение...")
# The validation split is passed as eval_set so early stopping can monitor it
cat_model.fit(X_train_cb, y_train, eval_set=(X_val_cb, y_val))

# Validation predictions, kept for the metrics / ensemble step
cat_val_pred = cat_model.predict(X_val_cb)
print("[CatBoost] Готово.")


In [None]:
# Train XGBoost on the label-encoded features
xgb_params = dict(
    n_estimators=1200,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=1,
    gamma=0.01,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.01,
    reg_lambda=0.1,
    tree_method='hist',
    early_stopping_rounds=50,
    random_state=42,
    n_jobs=-1,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

print("[XGBoost] Обучение...")
# Both splits are evaluated during training. The validation pair is listed
# last; per the XGBoost docs the last eval_set entry drives early stopping.
xgb_eval_sets = [(X_train_xgb, y_train), (X_val_xgb, y_val)]
xgb_model.fit(X_train_xgb, y_train, eval_set=xgb_eval_sets, verbose=100)

# Validation predictions, kept for the metrics / ensemble step
xgb_val_pred = xgb_model.predict(X_val_xgb)
print("[XGBoost] Готово.")


In [None]:
# Validation metrics for each model and for their simple average
def _validation_metrics(y_true, y_pred):
    """Return the (RMSE, MAE, R2) triple for one set of predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2


print("МЕТРИКИ НА VALIDATION")

# CatBoost
cat_rmse, cat_mae, cat_r2 = _validation_metrics(y_val, cat_val_pred)
print(f"CatBoost -> RMSE: {cat_rmse:.6f} | MAE: {cat_mae:.6f} | R2: {cat_r2:.6f}")

# XGBoost
xgb_rmse, xgb_mae, xgb_r2 = _validation_metrics(y_val, xgb_val_pred)
print(f"XGBoost  -> RMSE: {xgb_rmse:.6f} | MAE: {xgb_mae:.6f} | R2: {xgb_r2:.6f}")

# Ensemble: unweighted mean of the two models' validation predictions
ens_val_pred = (cat_val_pred + xgb_val_pred) / 2
ens_rmse, ens_mae, ens_r2 = _validation_metrics(y_val, ens_val_pred)
print(f"Ensemble -> RMSE: {ens_rmse:.6f} | MAE: {ens_mae:.6f} | R2: {ens_r2:.6f}")

# The ensemble counts as an improvement only if it is strictly better than
# the better of the two single models
best_single_rmse = min(cat_rmse, xgb_rmse)
if ens_rmse >= best_single_rmse:
    print("\nАнсамбль не улучшил RMSE относительно лучшей одиночной модели.")
else:
    print("\nАнсамбль улучшает качество относительно обеих моделей.")


In [None]:
# Submission: CatBoost + XGBoost ensemble on the test set
# Only the file read is guarded, so a FileNotFoundError raised by a later step
# is not misreported as a missing test.csv.
try:
    test_df = pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv')
except FileNotFoundError:
    test_df = None
    print("Файл test.csv не найден. Пропускаем создание submission.")

if test_df is not None:
    print(f"Тестовый набор загружен: {test_df.shape}")

    # Keep the ids for the submission file
    test_ids = test_df['id']

    # One feature frame per model, both without the `id` column
    X_test_cb = test_df.drop(['id'], axis=1).copy()
    X_test_xgb = X_test_cb.copy()

    # CatBoost: cast categorical columns to str, as was done for training.
    # Columns absent from the test frame are skipped, as before.
    for col in categorical_features:
        if col in X_test_cb.columns:
            X_test_cb[col] = X_test_cb[col].astype(str)

    # XGBoost: reuse the encoders fitted during training
    for col in categorical_features:
        if col not in X_test_xgb.columns:
            continue
        encoder = label_encoders[col]
        raw_values = X_test_xgb[col].astype(str)
        is_known = raw_values.isin(encoder.classes_)
        if is_known.all():
            X_test_xgb[col] = encoder.transform(raw_values)
        else:
            # LabelEncoder.transform raises ValueError on labels it was not
            # fitted on. Encode such values with the sentinel -1 (no fitted
            # class maps to it) instead of aborting the whole submission.
            n_unseen = int((~is_known).sum())
            print(
                f"Внимание: в признаке '{col}' {n_unseen} значений с категориями, "
                f"отсутствующими при обучении; они закодированы как -1."
            )
            codes = pd.Series(-1, index=raw_values.index, dtype='int64')
            codes.loc[is_known] = encoder.transform(raw_values[is_known])
            X_test_xgb[col] = codes

    # Per-model predictions
    cat_test_pred = cat_model.predict(X_test_cb)
    xgb_test_pred = xgb_model.predict(X_test_xgb)

    # Ensemble: unweighted mean of the two models
    ensemble_test_pred = (cat_test_pred + xgb_test_pred) / 2

    submission = pd.DataFrame({
        'id': test_ids,
        'accident_risk': ensemble_test_pred
    })

    submission.to_csv('submission.csv', index=False)
    print("Ансамбль CatBoost+XGBoost сохранён в submission.csv")

    print("\nПервые строки submission:")
    print(submission.head(10))
    print("\nСтатистика предсказаний:")
    print(submission['accident_risk'].describe())
