# Загрузка данных

In [46]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Kaggle input locations of the competition's train and test tables.
train_path = '/kaggle/input/price-prediction-hw7/Train.csv'
test_path = '/kaggle/input/price-prediction-hw7/Test.csv'

# Read both CSV files (train first, then test) with pandas' default options.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Подготовка данных

In [47]:
def prepare_features(df, is_train=True, fill_values=None):
    """Build the model-ready feature frame from a raw frame.

    The input is never modified; all work happens on a copy. Steps, each
    skipped when the relevant column is absent:

    1. drop the ``kw1`` .. ``kw13`` columns;
    2. fill missing values in the numeric columns;
    3. expand ``date`` into integer ``year`` and ``month`` columns and drop
       ``date`` itself;
    4. cast the categorical columns to ``str``.

    Args:
        df: Raw DataFrame to transform.
        is_train: Accepted for call compatibility only. It does not change
            the result: the ``id`` filter it used to guard could never match,
            so ``id`` (when present) is kept for both splits.
        fill_values: Optional mapping ``column -> value`` (dict or Series)
            used to fill missing numeric values. Columns missing from the
            mapping, or all columns when ``None``, fall back to the median of
            ``df`` itself. Pass medians computed on the training frame when
            preparing the test frame so both splits are imputed identically.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    if fill_values is None:
        fill_values = {}

    # Boolean-mask selection + explicit copy: drops the keyword columns and
    # guarantees that later assignments never touch the caller's frame.
    kw_names = {f'kw{i}' for i in range(1, 14)}
    out = df.loc[:, ~df.columns.isin(kw_names)].copy()

    # Fill missing values in numeric columns.
    num_cols = ['metro_dist', 'area', 'rooms', 'floor', 'balcon', 'n_photos']
    for col in num_cols:
        if col in out.columns:
            fill = fill_values[col] if col in fill_values else out[col].median()
            out[col] = out[col].fillna(fill)

    if 'date' in out.columns:
        parsed_date = pd.to_datetime(out['date'])
        out['year'] = parsed_date.dt.year
        out['month'] = parsed_date.dt.month

    # Categorical features are passed on as strings.
    cat_cols = ['street_id', 'build_tech', 'g_lift']
    for col in cat_cols:
        if col in out.columns:
            out[col] = out[col].astype(str)

    # The raw date is redundant once year/month have been extracted.
    if 'date' in out.columns:
        out = out.drop(columns=['date'])

    return out

In [48]:
# Run the shared preprocessing on both splits, train first and test second.
train_proc, test_proc = (
    prepare_features(frame, is_train=train_flag)
    for frame, train_flag in ((train, True), (test, False))
)

In [None]:
# Rebuild the modelling inputs from the processed frames so that this cell runs
# on a fresh kernel: `features_order`, `X` and `y` were used by the notebook but
# never defined in it.
# NOTE(review): the target is assumed to be 'price' (the column written to the
# submission file) and every remaining non-id column is used as a feature —
# confirm against the original feature list.
target_col = 'price'
features_order = [col for col in train_proc.columns if col not in ('id', target_col)]

X = train_proc[features_order]
y = train_proc[target_col]

# Keep the ids aside for the submission; the model sees only `features_order`,
# in the same column order for train and test.
test_ids = test_proc['id']
test_features = test_proc[features_order]

# Positional indices of the categorical columns within `features_order`.
cat_features = ['street_id', 'build_tech', 'g_lift']
cat_features_idx = [i for i, col in enumerate(features_order) if col in cat_features]

# Кросс-валидация

In [None]:
# K-fold cross-validation: one CatBoost model per fold, with the test
# prediction taken as the average over the fold models.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
mae_scores = []
test_preds = np.zeros(len(test_features))

params = {
    'iterations': 1500,
    'learning_rate': 0.03,
    'depth': 7,
    'l2_leaf_reg': 3,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 42,
    # Overfitting detector: stop once the eval metric has not improved
    # for `od_wait` consecutive iterations.
    'od_type': 'Iter',
    'od_wait': 50,
    'verbose': 100
}

# The test features are the same in every fold, so wrap them in a Pool once
# instead of rebuilding it on each iteration.
test_pool = Pool(test_features, cat_features=cat_features_idx)

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}/{n_folds}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
    val_pool = Pool(X_val, y_val, cat_features=cat_features_idx)

    model = CatBoostRegressor(**params)
    # NOTE: the validation fold both selects the best iteration and is scored
    # below, so the reported MAE is slightly optimistic.
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    val_pred = model.predict(val_pool)
    mae = mean_absolute_error(y_val, val_pred)
    mae_scores.append(mae)
    print(f"Fold {fold + 1} MAE: {mae:.2f}")

    # Each fold contributes an equal share to the averaged test prediction.
    test_preds += model.predict(test_pool) / n_folds

print(f"\nMean MAE across folds: {np.mean(mae_scores):.2f} ± {np.std(mae_scores):.2f}")


Fold 1/5
0:	learn: 2520962.6617251	test: 2508371.1060029	best: 2508371.1060029 (0)	total: 75ms	remaining: 1m 52s
100:	learn: 1142145.9940669	test: 1058402.4930366	best: 1058402.4930366 (100)	total: 4.52s	remaining: 1m 2s
200:	learn: 1006821.5964148	test: 919025.6451120	best: 919025.6451120 (200)	total: 9.64s	remaining: 1m 2s
300:	learn: 944306.6192872	test: 857089.9971352	best: 857089.9971352 (300)	total: 14.2s	remaining: 56.7s
400:	learn: 902664.3935565	test: 815104.0007024	best: 815104.0007024 (400)	total: 19.1s	remaining: 52.4s
500:	learn: 872924.7510265	test: 785823.5596042	best: 785823.5596042 (500)	total: 24s	remaining: 47.8s
600:	learn: 848840.2447848	test: 762276.8773409	best: 762276.8773409 (600)	total: 28.9s	remaining: 43.2s
700:	learn: 830701.8162375	test: 746381.1434721	best: 746381.1434721 (700)	total: 34.2s	remaining: 39s
800:	learn: 814899.6018578	test: 733458.2389958	best: 733458.2389958 (800)	total: 38.7s	remaining: 33.8s
900:	learn: 802784.5252764	test: 723308.930811

# Сабмит

In [53]:
# Pair each test id with its fold-averaged prediction and write the result
# to submission.csv without the index column.
submission = pd.DataFrame({'id': test_ids}).assign(price=test_preds)
submission.to_csv('submission.csv', index=False)