In [1]:
import numpy as np
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

## Считывание данных

In [2]:
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')
submission = pd.read_csv('sample_submission.csv', index_col='id')

In [3]:
target = train.pop('target')
X_train, X_test, y_train, y_test = train_test_split(train, target, train_size=0.60)

## Построение моделей

По сравнению с другими классическими методами Random Forest Regressor показал наилучший результат. Дополнительно настроив параметры, получилось добиться ошибки=0.70395.  

In [7]:
#Random Forest Regressor
model_rand_forest = RandomForestRegressor(n_estimators=140, max_features=4, n_jobs=-1, min_samples_leaf=15, random_state = 15)
model_rand_forest.fit(X_train, y_train)
y_rand_forest = model_rand_forest.predict(X_test)
score_rand_forest = mean_squared_error(y_test, y_rand_forest, squared=False)
print(f'{score_rand_forest:0.5f}')

0.70395


Улучшаем результат, используя градиентный бустинг на деревьях.
Взяв за основу параметры, полученные из модели Random Forest Regressor, анализируем остальние. Получаем ошибку 0.69991

In [8]:
#xgboost
model = xgb.XGBRegressor(colsample_bytree=0.7, learning_rate=0.1,max_depth=8, min_child_weight=310, n_estimators=160,n_jobs=-1, random_state=15, verbosity=0)
model.fit(X_train, y_train)
y = model.predict(X_test)
score = mean_squared_error(y_test, y, squared=False)
print(f'{score:0.5f}')    

0.69991


## Обучение на кросс-валидации

Чтобы точнее настроить модель, используем кросс-валидацию.

Для начала подготовим данные:

In [9]:
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')
target = train['target']

features = [f'cont{x}'for x in range(1,15)]

train_data = train.drop(columns=['target'])
test_data = test

#Нормализация
train_data = train_data / train_data.aggregate("max")
test_data = test_data / test_data.aggregate("max")

Param - подобранные на предыдущем этапе наилучше параметры.

Получаем среднюю ошибку 0.6992320437452879

In [10]:
Param = {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 310,'n_estimators': 160, 'n_jobs': -1, 'random_state': 15, 'verbosity': 0}
train0 = train_data
test0 = test_data

preds = np.zeros(test0.shape[0])
mse=[]
k=0
kf = KFold(n_splits=5,random_state=15,shuffle=True)

for train_idx, test_idx in kf.split(train0[features],target):
    X_tr,X_val=train0[features].iloc[train_idx],train0[features].iloc[test_idx]
    y_tr,y_val=target.iloc[train_idx],target.iloc[test_idx]
    
    model = xgb.XGBRegressor(**Param)
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
    
    preds+=model.predict(test0[features])/kf.n_splits
    mse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
    print(k+1,mse[k])
    k+=1
print(f"mean MSE for all the folds is {np.mean(mse)}")

1 0.6989296642150992
2 0.698089382035287
3 0.6981242224152149
4 0.7024195387631149
5 0.6985974112977235
mean MSE for all the folds is 0.6992320437452879


## Результат

In [11]:
submission['target'] = model.predict(test)

In [12]:
submission['target']

id
0         7.879364
2         7.862628
6         7.881279
7         8.064296
10        8.203677
            ...   
499984    8.166514
499985    8.191584
499987    8.098135
499988    8.107488
499990    7.946080
Name: target, Length: 200000, dtype: float32

In [13]:
submission.to_csv('res.csv')