In [None]:
# Импорт библиотек
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Display settings: show every DataFrame column and use seaborn's grid style
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')

# Seed NumPy and PyTorch so weight initialisation, dropout masks and batch
# shuffling repeat between runs. The value matches the random_state used for
# the data splits. (Some GPU kernels may still be non-deterministic.)
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Используемое устройство: {device}")

In [None]:
# Load the training data
train_df = pd.read_csv('/kaggle/input/playground-series-s5e10/train.csv')

print(f"Размер датасета: {train_df.shape}")
print(f"\nПервые строки:")
print(train_df.head())
print(f"\nИнформация о данных:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line after the report.
train_df.info()
print(f"\nПропущенные значения:")
print(train_df.isnull().sum())


In [None]:
# Prepare the data for the neural network

# Feature matrix: everything except the row id and the target column.
# Target vector: the 'accident_risk' column as a NumPy array.
X = train_df.drop(columns=['id', 'accident_risk'])
y = train_df['accident_risk'].to_numpy()

print("Типы признаков:")
print(X.dtypes)

# Text and boolean columns are treated as categorical; 64-bit integer and
# float columns are treated as numerical.
categorical_features = list(X.select_dtypes(include=['object', 'bool']).columns)
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

print(f"\nКатегориальные признаки: {categorical_features}")
print(f"Числовые признаки: {numerical_features}")

# Label-encode every categorical column on a copy of X. Values are cast to
# str first, and one fitted encoder per column is kept in `label_encoders`
# so the same mapping can be re-applied to other data later.
label_encoders = {col: LabelEncoder() for col in categorical_features}
X_encoded = X.copy()
for col, encoder in label_encoders.items():
    X_encoded[col] = encoder.fit_transform(X_encoded[col].astype(str))

print(f"\nРазмер после кодирования: {X_encoded.shape}")
print(f"Количество признаков: {X_encoded.shape[1]}")

In [None]:
# Standardise the numerical columns; the label-encoded categorical columns
# are carried over unchanged.
# NOTE(review): the scaler is fitted on every row of X_encoded, i.e. before
# the train/validation split, so validation rows contribute to the scaling
# statistics. Confirm this is acceptable.
scaler = StandardScaler().fit(X_encoded[numerical_features])
X_scaled = X_encoded.copy()
X_scaled[numerical_features] = scaler.transform(X_encoded[numerical_features])

print("Данные после нормализации:")
print(X_scaled.head())
print(f"\nСтатистика числовых признаков после нормализации:")
print(X_scaled[numerical_features].describe())

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print(f"Train размер: {X_train.shape}")
print(f"Validation размер: {X_val.shape}")

# float32 NumPy copies for PyTorch: features stay 2-D, targets become
# column vectors of shape (n_rows, 1).
X_train_np, X_val_np = (
    frame.to_numpy().astype(np.float32) for frame in (X_train, X_val)
)
y_train_np, y_val_np = (
    target.astype(np.float32).reshape(-1, 1) for target in (y_train, y_val)
)


In [None]:
# Neural network architecture
class AccidentRiskPredictor(nn.Module):
    """Fully connected regressor producing one sigmoid-squashed value per row.

    The network is a stack of ``Linear -> BatchNorm1d -> ReLU`` blocks with
    widths 512, 256, 128, 64, 32 and 16. The first four blocks end with a
    Dropout layer (p = 0.1, 0.1, 0.1, 0.05); the last two have none. A final
    ``Linear(16, 1)`` followed by ``Sigmoid`` produces the output.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        # (block width, dropout probability); None means "no Dropout layer"
        hidden_blocks = (
            (512, 0.1),
            (256, 0.1),
            (128, 0.1),
            (64, 0.05),
            (32, None),
            (16, None),
        )

        layers = []
        in_features = input_dim
        for width, dropout_p in hidden_blocks:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.BatchNorm1d(width))
            layers.append(nn.ReLU())
            if dropout_p is not None:
                layers.append(nn.Dropout(dropout_p))
            in_features = width

        # Output head: a single unit squashed by a sigmoid
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.network(x)

# Build the model with one input unit per feature column, then move it to
# the selected device.
input_dim = X_train_np.shape[1]
model = AccidentRiskPredictor(input_dim)
model = model.to(device)

total_params = sum(param.numel() for param in model.parameters())
print(f"\nКоличество параметров: {total_params}")


In [None]:
# Training setup: MSE loss, Adam with a small weight decay, and a scheduler
# that halves the learning rate after 15 epochs without a lower validation loss.
learning_rate = 0.001
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
# `verbose` is intentionally not passed: it is deprecated for LR schedulers
# (PyTorch 2.2+) and may be rejected by newer releases. The current rate can
# be read from optimizer.param_groups instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=15, factor=0.5)

# Training parameters
batch_size = 512
num_epochs = 200
best_val_loss = float('inf')
patience = 50          # epochs without improvement before early stopping
patience_counter = 0

# DataLoader for mini-batch training (reshuffled every epoch)
train_dataset = TensorDataset(
    torch.FloatTensor(X_train_np),
    torch.FloatTensor(y_train_np)
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# The validation set is kept on the device as two whole tensors
X_val_tensor = torch.FloatTensor(X_val_np).to(device)
y_val_tensor = torch.FloatTensor(y_val_np).to(device)

print(f"Параметры обучения:")
print(f"Batch size: {batch_size}")
print(f"Количество эпох: {num_epochs}")
# Printed from the variable so the report cannot drift from the optimizer setting
print(f"Learning rate: {learning_rate}")
print(f"Оптимизатор: Adam")
print(f"Функция потерь: MSE")
print(f"Device: {device}")


In [None]:
# NEURAL NETWORK TRAINING
train_losses = []
val_losses = []

# Reset the early-stopping state here so that re-running this cell does not
# compare against the best loss left over from a previous run.
best_val_loss = float('inf')
patience_counter = 0
best_model_state = None

print("ОБУЧЕНИЕ НЕЙРОСЕТИ")
print("Начало обучения...\n")

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0

    for batch_X, batch_y in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass
        loss.backward()
        optimizer.step()

        # The criterion returns a per-batch mean; weight it by the batch size
        # so the epoch average stays exact when the last batch is smaller.
        train_loss += loss.item() * batch_X.size(0)

    train_loss = train_loss / len(train_loader.dataset)
    train_losses.append(train_loss)

    # Validation phase: whole validation set in one forward pass, no gradients
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        # Keep the loss as a plain float so comparisons, formatting and the
        # scheduler never hold on to a device tensor.
        val_loss = criterion(val_outputs, y_val_tensor).item()
    val_losses.append(val_loss)

    # Learning-rate scheduling driven by the validation loss
    scheduler.step(val_loss)

    # Progress report every 10 epochs
    if (epoch + 1) % 10 == 0:
        train_rmse = np.sqrt(train_loss)
        val_rmse = np.sqrt(val_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'  Train Loss: {train_loss:.6f} (RMSE: {train_rmse:.6f})')
        print(f'  Val Loss: {val_loss:.6f} (RMSE: {val_rmse:.6f})')
        print(f'  LR: {optimizer.param_groups[0]["lr"]:.6f}\n')

    # Early stopping bookkeeping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # state_dict() hands out tensors that share storage with the live
        # parameters, and dict.copy() is shallow. Each tensor must be cloned,
        # otherwise the "best" snapshot keeps changing as training continues.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f'Early stopping на эпохе {epoch+1}')
        break

# Restore the best weights whether training stopped early or ran through all
# epochs. The snapshot is None only if no epoch produced a finite improvement.
if best_model_state is not None:
    model.load_state_dict(best_model_state)

print("Обучение нейросети завершено!")
print(f"Лучший Val Loss: {best_val_loss:.6f} (RMSE: {np.sqrt(best_val_loss):.6f})")

In [None]:
# XGBOOST TRAINING
import xgboost as xgb

print("ОБУЧЕНИЕ XGBOOST МОДЕЛИ")

# XGBoost is trained on the label-encoded but unscaled features, split with
# the same test_size and random_state as the neural-network split.
X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Hyper-parameters collected in one place
xgb_params = dict(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.05,
    # Moderate regularisation
    reg_alpha=0.01,
    reg_lambda=0.1,
    gamma=0.01,
    # Row / column subsampling
    subsample=0.9,
    colsample_bytree=0.9,
    # Simple settings
    min_child_weight=1,
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50,
)
xgb_model = xgb.XGBRegressor(**xgb_params)

print("\nНачало обучения XGBoost...")
# Both splits are evaluated during training and logged every 50 rounds.
# NOTE(review): early stopping presumably monitors the last eval_set entry
# (validation) - confirm against the xgboost documentation.
evaluation_sets = [(X_train_xgb, y_train_xgb), (X_val_xgb, y_val_xgb)]
xgb_model.fit(X_train_xgb, y_train_xgb, eval_set=evaluation_sets, verbose=50)

# XGBoost predictions for both splits
y_train_pred_xgb = xgb_model.predict(X_train_xgb)
y_val_pred_xgb = xgb_model.predict(X_val_xgb)

# XGBoost metrics
xgb_train_rmse = np.sqrt(mean_squared_error(y_train_xgb, y_train_pred_xgb))
xgb_val_rmse = np.sqrt(mean_squared_error(y_val_xgb, y_val_pred_xgb))
print("МЕТРИКИ XGBOOST:")
print(f"Train RMSE: {xgb_train_rmse:.6f}")
print(f"Val RMSE: {xgb_val_rmse:.6f}")


In [None]:
# NEURAL NETWORK PREDICTIONS ON THE XGBOOST SPLIT

# The XGBoost frames hold unscaled features, so apply the already-fitted
# scaler to the numerical columns before feeding them to the network.
X_train_xgb_scaled = X_train_xgb.copy()
X_train_xgb_scaled[numerical_features] = scaler.transform(X_train_xgb[numerical_features])

X_val_xgb_scaled = X_val_xgb.copy()
X_val_xgb_scaled[numerical_features] = scaler.transform(X_val_xgb[numerical_features])

# float32 arrays, then tensors on the model's device
X_train_xgb_np = X_train_xgb_scaled.to_numpy().astype(np.float32)
X_val_xgb_np = X_val_xgb_scaled.to_numpy().astype(np.float32)

X_train_xgb_tensor = torch.FloatTensor(X_train_xgb_np).to(device)
X_val_xgb_tensor = torch.FloatTensor(X_val_xgb_np).to(device)

# Inference only: eval mode and no gradient tracking. The (n, 1) outputs are
# flattened to 1-D so they line up with the XGBoost predictions.
model.eval()
with torch.no_grad():
    y_train_pred_nn = model(X_train_xgb_tensor).cpu().numpy().flatten()
    y_val_pred_nn = model(X_val_xgb_tensor).cpu().numpy().flatten()

nn_train_rmse = np.sqrt(mean_squared_error(y_train_xgb, y_train_pred_nn))
nn_val_rmse = np.sqrt(mean_squared_error(y_val_xgb, y_val_pred_nn))
print("МЕТРИКИ НЕЙРОСЕТИ:")
print(f"Train RMSE: {nn_train_rmse:.6f}")
print(f"Val RMSE: {nn_val_rmse:.6f}")

In [None]:
# BUILDING THE ENSEMBLE

# Candidate (XGBoost weight, NN weight) pairs to evaluate on validation
weights_to_test = [
    (0.5, 0.5),   # equal weights
    (0.6, 0.4),   # more XGBoost
    (0.7, 0.3),   # even more XGBoost
    (0.4, 0.6),   # more NN
    (0.55, 0.45), # slightly more XGBoost
    (0.65, 0.35),
    (0.45, 0.55),
]

print("ПОИСК ОПТИМАЛЬНЫХ ВЕСОВ АНСАМБЛЯ")

# Score every candidate blend on the validation split
scored_candidates = []
for w_xgb, w_nn in weights_to_test:
    blended_val_pred = w_xgb * y_val_pred_xgb + w_nn * y_val_pred_nn
    candidate_rmse = np.sqrt(mean_squared_error(y_val_xgb, blended_val_pred))

    print(f"Веса: XGB={w_xgb:.2f}, NN={w_nn:.2f} → RMSE: {candidate_rmse:.6f}")
    scored_candidates.append((candidate_rmse, (w_xgb, w_nn)))

# Lowest validation RMSE wins; on ties the earliest candidate is kept
best_rmse, best_weights = min(scored_candidates, key=lambda scored: scored[0])

print(f"ЛУЧШИЕ ВЕСА: XGB={best_weights[0]:.2f}, NN={best_weights[1]:.2f}")
print(f"ЛУЧШИЙ ENSEMBLE VAL RMSE: {best_rmse:.6f}")

# Final ensemble predictions with the selected weights
w_xgb_final, w_nn_final = best_weights
y_train_pred_ensemble = w_xgb_final * y_train_pred_xgb + w_nn_final * y_train_pred_nn
y_val_pred_ensemble = w_xgb_final * y_val_pred_xgb + w_nn_final * y_val_pred_nn

final_train_rmse = np.sqrt(mean_squared_error(y_train_xgb, y_train_pred_ensemble))
final_val_rmse = np.sqrt(mean_squared_error(y_val_xgb, y_val_pred_ensemble))
final_val_r2 = r2_score(y_val_xgb, y_val_pred_ensemble)

print("ФИНАЛЬНЫЕ МЕТРИКИ АНСАМБЛЯ")
print(f"Train RMSE: {final_train_rmse:.6f}")
print(f"Val RMSE:   {final_val_rmse:.6f}")
print(f"Val R2:     {final_val_r2:.6f}")

In [None]:
# MODEL COMPARISON TABLE

# (train predictions, validation predictions) per model, in display order
predictions_by_model = {
    'XGBoost': (y_train_pred_xgb, y_val_pred_xgb),
    'Нейросеть': (y_train_pred_nn, y_val_pred_nn),
    'Ансамбль': (y_train_pred_ensemble, y_val_pred_ensemble),
}

comparison = pd.DataFrame({
    'Модель': list(predictions_by_model),
    'Train RMSE': [
        np.sqrt(mean_squared_error(y_train_xgb, train_pred))
        for train_pred, _ in predictions_by_model.values()
    ],
    'Val RMSE': [
        np.sqrt(mean_squared_error(y_val_xgb, val_pred))
        for _, val_pred in predictions_by_model.values()
    ],
})

print("СРАВНЕНИЕ МОДЕЛЕЙ")
print(comparison.to_string(index=False))

# Relative validation-RMSE gain of the ensemble (row 2) over XGBoost (row 0),
# expressed as a percentage of the XGBoost RMSE
xgb_rmse = comparison.at[0, 'Val RMSE']
ensemble_rmse = comparison.at[2, 'Val RMSE']
improvement = ((xgb_rmse - ensemble_rmse) / xgb_rmse) * 100

print(f"\nУлучшение ансамбля относительно XGBoost: {improvement:.3f}%")
print(f"Веса: XGBoost={w_xgb_final:.1%}, Нейросеть={w_nn_final:.1%}")


In [None]:
# TEST-SET PREDICTION (ENSEMBLE)
# Only the file read is guarded. A FileNotFoundError raised anywhere else in
# this cell is a different problem and must not be reported as a missing
# test.csv.
try:
    test_df = pd.read_csv('/kaggle/input/playground-series-s5e10/test.csv')
except FileNotFoundError:
    print("Файл test.csv не найден.")
else:
    print(f"Тестовый набор загружен: {test_df.shape}")

    # Keep the ids for the submission file
    test_ids = test_df['id']

    # Drop the non-feature columns, then select the training feature columns
    # in training order. The network consumes a bare array, so a different
    # column order would silently feed features into the wrong inputs. A
    # missing column raises a KeyError here instead of being skipped.
    X_test = test_df.drop(['id', 'accident_risk'], axis=1, errors='ignore')
    X_test = X_test[list(X_encoded.columns)]

    # XGBOOST PREDICTION
    # Re-apply the encoders fitted on the training data.
    # NOTE: LabelEncoder.transform raises ValueError for a category that was
    # not present when the encoder was fitted.
    X_test_encoded = X_test.copy()
    for col in categorical_features:
        X_test_encoded[col] = label_encoders[col].transform(X_test_encoded[col].astype(str))

    test_pred_xgb = xgb_model.predict(X_test_encoded)

    # NEURAL NETWORK PREDICTION
    # Same scaling as for training, using the already-fitted scaler
    X_test_scaled = X_test_encoded.copy()
    X_test_scaled[numerical_features] = scaler.transform(X_test_encoded[numerical_features])

    X_test_np = X_test_scaled.values.astype(np.float32)
    X_test_tensor = torch.FloatTensor(X_test_np).to(device)

    model.eval()
    with torch.no_grad():
        test_pred_nn = model(X_test_tensor).cpu().numpy().flatten()

    # ENSEMBLE: weighted blend using the weights selected on validation
    test_pred_ensemble = w_xgb_final * test_pred_xgb + w_nn_final * test_pred_nn

    # Build and write the submission file
    submission = pd.DataFrame({
        'id': test_ids,
        'accident_risk': test_pred_ensemble
    })

    submission.to_csv('submission.csv', index=False)

    print("АНСАМБЛЬ: submission.csv создан!")
    print(f"\nРазмер: {submission.shape}")
    print(f"Веса: XGBoost={w_xgb_final:.1%}, Нейросеть={w_nn_final:.1%}")
    print(f"\nСтатистика предсказаний:")
    print(submission['accident_risk'].describe())
