In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_percentage_error
import re
import warnings
warnings.filterwarnings('ignore')

In [None]:
# ============================================================================
# 1. ЗАГРУЗКА ДАННЫХ
# ============================================================================

print("Загрузка данных...")
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(f"Размер train: {train_df.shape}")
print(f"Размер test: {test_df.shape}")

In [None]:
# ============================================================================
# 2. ФУНКЦИИ ДЛЯ ИЗВЛЕЧЕНИЯ ПРИЗНАКОВ
# ============================================================================

def extract_hp(engine_str):
    """Извлечение лошадиных сил из строки engine"""
    if pd.isna(engine_str) or engine_str == '':
        return np.nan
    match = re.search(r'(\d+\.?\d*)\s*HP', str(engine_str))
    if match:
        return float(match.group(1))
    return np.nan

def extract_engine_volume(engine_str):
    """Извлечение объема двигателя из строки engine"""
    if pd.isna(engine_str) or engine_str == '':
        return np.nan
    match = re.search(r'(\d+\.?\d*)\s*[L|l]\s', str(engine_str))
    if match:
        return float(match.group(1))
    return np.nan

def extract_cylinders(engine_str):
    """Извлечение количества цилиндров из строки engine"""
    if pd.isna(engine_str) or engine_str == '':
        return np.nan
    match = re.search(r'(\d+)\s*(?:Cylinder|V\d|Straight)', str(engine_str))
    if match:
        cyl_num = match.group(1)
        if cyl_num.isdigit():
            return int(cyl_num)
    return np.nan

def clean_mileage(mileage_str):
    """Очистка значения пробега - удаление запятых и 'mi.'"""
    if pd.isna(mileage_str) or mileage_str == '':
        return np.nan
    mileage_str = str(mileage_str).replace(',', '').replace(' mi.', '')
    try:
        return float(mileage_str)
    except:
        return np.nan


In [None]:
# ============================================================================
# 3. ОБРАБОТКА И ПОДГОТОВКА ДАННЫХ
# ============================================================================

# Извлечение признаков из engine строки
for df in [train_df, test_df]:
    df['hp'] = df['engine'].apply(extract_hp)
    df['engine_volume'] = df['engine'].apply(extract_engine_volume)
    df['cylinders'] = df['engine'].apply(extract_cylinders)
    df['mileage'] = df['milage'].apply(clean_mileage)

# Заполнение пропущенных значений средними
for df in [train_df, test_df]:
    df['hp'].fillna(df['hp'].mean(), inplace=True)
    df['engine_volume'].fillna(df['engine_volume'].mean(), inplace=True)
    df['cylinders'].fillna(df['cylinders'].mean(), inplace=True)

# Заполнение категориальных признаков
train_df['fuel_type'].fillna('Unknown', inplace=True)
test_df['fuel_type'].fillna('Unknown', inplace=True)

train_df['accident'].fillna('Unknown', inplace=True)
test_df['accident'].fillna('Unknown', inplace=True)

train_df['clean_title'].fillna('Unknown', inplace=True)
test_df['clean_title'].fillna('Unknown', inplace=True)

# Обработка цветов
for df in [train_df, test_df]:
    df['ext_col'] = df['ext_col'].str.replace('–', 'Unknown', regex=False)
    df['int_col'] = df['int_col'].str.replace('–', 'Unknown', regex=False)
    df['ext_col'].fillna('Unknown', inplace=True)
    df['int_col'].fillna('Unknown', inplace=True)


In [None]:
# ============================================================================
# 4. КОДИРОВАНИЕ КАТЕГОРИАЛЬНЫХ ПЕРЕМЕННЫХ
# ============================================================================

categorical_cols = ['brand', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

combined_df = pd.concat([train_df[categorical_cols], test_df[categorical_cols]], ignore_index=True)

le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(combined_df[col])
    le_dict[col] = le

for col in categorical_cols:
    train_df[col + '_encoded'] = le_dict[col].transform(train_df[col])
    test_df[col + '_encoded'] = le_dict[col].transform(test_df[col])

In [None]:
# ============================================================================
# 5. ПОДГОТОВКА ПРИЗНАКОВ
# ============================================================================

feature_cols = ['model_year', 'mileage', 'hp', 'engine_volume', 'cylinders', 
                'brand_encoded', 'fuel_type_encoded', 'transmission_encoded', 
                'ext_col_encoded', 'int_col_encoded', 'accident_encoded', 'clean_title_encoded']

X_train = train_df[feature_cols].copy()
y_train = train_df['price'].copy()
X_test = test_df[feature_cols].copy()

# Масштабирование признаков
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# ============================================================================
# 6. РАЗДЕЛЕНИЕ НА TRAIN/VALIDATION ДЛЯ ПРОВЕРКИ
# ============================================================================

#X_tr, X_val, y_tr, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

In [None]:
# ============================================================================
# 7. ОБУЧЕНИЕ МОДЕЛИ
# ============================================================================

# Обучаем на тренировочном наборе для проверки
#gb_check = GradientBoostingRegressor(
#    n_estimators=300,
#    learning_rate=0.02,
#    max_depth=8,
#    subsample=0.80,
#    random_state=42,
#    min_samples_split=3,
#    min_samples_leaf=1
#)

#gb_check.fit(X_tr, y_tr)

# Проверяем MAPE на валидации
#y_val_pred = gb_check.predict(X_val)
#val_mape = mean_absolute_percentage_error(y_val, y_val_pred)
#print(f"MAPE на валидации: {val_mape:.4f}")

In [None]:
# ============================================================================
# 8. ОБУЧЕНИЕ НА ПОЛНОМ ДАТАСЕТЕ И ПРЕДСКАЗАНИЯ
# ============================================================================

gb_final = GradientBoostingRegressor(
    n_estimators=600,              # ← 300 → 400
    learning_rate=0.02,            # ← 0.03 → 0.02
    max_depth=8,
    subsample=1.0,                 # ← 0.85 → 1.0 (ГЛАВНОЕ!)
    init='zero',                   # ← НОВОЕ! (ГЛАВНОЕ!)
    random_state=42,
    min_samples_split=3
)


gb_final.fit(X_train_scaled, y_train)

# Делаем предсказания на test
y_test_pred = gb_final.predict(X_test_scaled)

print(f"\nСтатистика предсказаний:")
print(f"Min: ${y_test_pred.min():.2f}")
print(f"Max: ${y_test_pred.max():.2f}")
print(f"Mean: ${y_test_pred.mean():.2f}")
print(f"Median: ${np.median(y_test_pred):.2f}")

In [None]:
# ============================================================================
# 9. СОХРАНЕНИЕ РЕЗУЛЬТАТОВ
# ============================================================================

submission = pd.DataFrame({
    'id': test_df['id'].values,
    'price': y_test_pred
})

submission.to_csv('submission_final.csv', index=False)

In [5]:
# ============================================================================
# 10. АНАЛИЗ ВАЖНОСТИ ПРИЗНАКОВ
# ============================================================================

print("\nИмпортантность признаков:")
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': gb_final.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance)

print("\n" + "="*60)
print("="*60)


Загрузка данных...
Размер train: (3009, 13)
Размер test: (1000, 12)

Обработка данных...
Кодирование категориальных переменных...
Подготовка матриц признаков...

Обучение модели Gradient Boosting...
MAPE на валидации: 0.2889

Обучение финальной модели на полных тренировочных данных...

Статистика предсказаний:
Min: $4136.69
Max: $446306.40
Mean: $42889.65
Median: $31882.86

Сохранение результатов...
Файл submission_final.csv успешно сохранен!

Импортантность признаков:
                 feature  importance
1                mileage    0.332198
3          engine_volume    0.226322
2                     hp    0.113916
5          brand_encoded    0.083822
9        int_col_encoded    0.079727
0             model_year    0.048256
7   transmission_encoded    0.033697
4              cylinders    0.030865
8        ext_col_encoded    0.030015
6      fuel_type_encoded    0.012148
11   clean_title_encoded    0.004655
10      accident_encoded    0.004378

✓ Программа завершена успешно!
