In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, make_scorer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import minmax_scale
from sklearn.feature_selection import RFE
from math import pi, floor

%matplotlib inline

### Загружаем данные

In [None]:
# Read the labelled training set and drop every row whose price lies strictly
# between exp(13.81) and exp(13.82) (roughly 994.5k .. 1004.5k).
# NOTE(review): presumably this removes a spike of placeholder prices around
# 1,000,000 - confirm against the data.
# The two surviving groups are concatenated cheap-first, so the row order of
# `train` differs from the file order and the original index labels are kept.
lower_cut = math.exp(13.81)
upper_cut = math.exp(13.82)
train = pd.read_csv('train.csv')
cheap_rows = train[train['price'] <= lower_cut]
expensive_rows = train[train['price'] >= upper_cut]
train = pd.concat([cheap_rows, expensive_rows])

test = pd.read_csv('test.csv')

# Target as a one-column frame with a fresh 0..n-1 index.
train_price = train[['price']].reset_index(drop=True)
# Test ids are kept aside for the submission file.
idt = test[['id']]

# Stack train features (every column except the last one, assumed to be
# `price` - TODO confirm) on top of the test rows so that all feature
# engineering below is applied to both at once.
train_features = train.iloc[:, :-1]
df = pd.concat([train_features, test], ignore_index = True)
df = df.drop(columns = 'id')

### Разбиваем дату

In [None]:
# Replace the raw date string with its second '-'-separated field, cast to int.
# NOTE(review): assumes dates look like YYYY-MM-... so that field is the month.
month_tokens = [stamp.split('-')[1] for stamp in df['date']]
month = pd.DataFrame({'month': month_tokens}).astype(int)
# `month` becomes the first column; `date` is removed.
df = pd.concat([month, df.drop(columns = 'date')], axis = 1)

In [None]:
# Apply the natural logarithm to `area` twice.
# NOTE(review): values <= 1 yield NaN or -inf after the second log - confirm
# that `area` is always greater than 1.
log_area = np.log(df['area'])
df['area'] = np.log(log_area)

### Заменяем NaN

In [None]:
# Impute missing values: median for build_tech and g_lift, mean for metro_dist.
# The statistics are computed on the combined train + test frame, before any
# column is filled.
fill_values = {
    'build_tech': df['build_tech'].median(),
    'metro_dist': df['metro_dist'].mean(),
    'g_lift': df['g_lift'].median(),
}
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

### One hot encoding и масштабирование

In [None]:
# Min-max scale the numeric features to [0, 1] (fitted on train + test together).
names = ['area','rooms','balcon','metro_dist']
df[names] = minmax_scale(df[names].astype(float))

# Bucket months 1..12 into four groups: Dec-Feb -> 0, Mar-May -> 1,
# Jun-Aug -> 2, Sep-Nov -> 3.
# The result is assigned back to the column instead of calling
# `df.month.replace(..., inplace=True)`: an in-place call on the temporary
# Series returned by attribute access raises a chained-assignment warning in
# recent pandas and does not modify `df` at all under Copy-on-Write.
df['month'] = df['month'].replace(list(range(1, 13)), [0,0,1,1,1,2,2,2,3,3,3,0])

# Bucket photo counts: 0-3 -> 0, 4-7 -> 1, 8-11 -> 2.
# Counts outside 0..11 are left unchanged and get their own dummy column.
df['n_photos'] = df['n_photos'].replace(list(range(0, 12)), [0,0,0,0,1,1,1,1,2,2,2,2])

# One-hot encode the categorical columns. Each iteration prepends the new
# indicator columns and drops the source column, so the final column order is
# n_photos_*, month_*, build_tech_*, floor_*, street_id_*, g_lift_*, followed
# by the remaining features.
for column in ['g_lift', 'street_id', 'floor', 'build_tech', 'month', 'n_photos']:
    dummies = pd.get_dummies(df[column], prefix = column)
    df = pd.concat([dummies, df.drop(columns = column)], axis = 1)

### Формируем train и test

In [None]:
# Split the combined feature frame back into labelled rows and submission rows.
# The number of labelled rows is taken from `train_price` rather than a
# hard-coded literal, so the slices stay aligned with the target if the input
# files or the price filter ever change.
n_train = len(train_price)
train = pd.concat([df.iloc[:n_train, :], train_price], axis = 1)
test = df.iloc[n_train:, :]

# Hold out a fraction (1 - k) of the labelled rows for a local MAE check.
# The rows are shuffled with a fixed seed: the loading step concatenates the
# low-price group before the high-price group, so a plain positional tail
# slice would not be a representative sample. With a float `train_size`,
# train_test_split keeps floor(k * n_train) rows for training, the same size
# as a positional split at floor(k * n_train).
# y_train / y_test stay one-column DataFrames (the last column, `price`).
k=0.8
X_train, X_test, y_train, y_test = train_test_split(
    train.iloc[:, :-1],
    train.iloc[:, -1:],
    train_size = k,
    shuffle = True,
    random_state = 7,
)

In [None]:
# Fit a gradient-boosting model on the training split and report the mean
# absolute error on the held-out rows.
validation_params = dict(
    loss = 'huber',
    subsample = 0.9,
    learning_rate = 0.01,
    max_depth = 3,
    n_estimators = 2000,
    verbose = 1,
    random_state = 7,
)
gbr = GradientBoostingRegressor(**validation_params)
gbr.fit(X_train, np.ravel(y_train))

# MAE is symmetric in its two arguments; they are passed as (y_true, y_pred).
holdout_pred = gbr.predict(X_test)
holdout_mae = mae(y_test, holdout_pred)
print('Gradient Boosting Regressor MAE = ' + str(holdout_mae))

### Финальное предсказание

In [None]:
# Refit on every labelled row and write the submission file.
# NOTE(review): this model uses max_depth = 6 and n_estimators = 5000, while
# the validation cell fits max_depth = 3 and n_estimators = 2000, so the MAE
# printed there was not measured for this configuration.
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
final_params = dict(
    loss = 'huber',
    subsample = 0.9,
    learning_rate = 0.01,
    max_depth = 6,
    n_estimators = 5000,
    verbose = 1,
    random_state = 7,
)
gbr = GradientBoostingRegressor(**final_params)
gbr.fit(X_train, np.ravel(y_train))

# Predictions get a fresh 0..m-1 index and are joined column-wise to the ids
# saved when the test file was read.
pred = pd.DataFrame({'price': gbr.predict(test)})
subm = pd.concat([idt, pred], axis = 1)
subm.to_csv('submit.csv', index = False)