### Загрузка библиотек и настроек

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from mpl_toolkits.mplot3d.axes3d import Axes3D
import pandas_summary as ps

%matplotlib inline
import matplotlib
import matplotlib.image as img
import matplotlib.pyplot as plt

import seaborn as sns
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
rcParams['figure.figsize']=12,6

In [2]:
matplotlib.rcParams.update({'font.size': 14})
pd.set_option('precision', 3)
pd.set_option('max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('max_colwidth', 300)

In [3]:
def evaluate_preds(train_true_values, train_pred_values, val_true_values, val_pred_values):
    """
    Функция для оценки работы модели
    Parameters:
    train_true_values - целевая переменная из тренировочной части датасета
    train_pred_values - предсказания модели по тренировочной части
    val_true_values - целевая переменная из валидационной части датасета
    val_pred_values - предсказания модели по валидационной части
    Returns:
    R2 на тренировочной и валидационной части, 
    графики зависимости истинных значений от предсказаний
    """
    print("Train R2:\t" + str(round(r2(train_true_values, train_pred_values), 3)))
    print("Valid R2:\t" + str(round(r2(val_true_values, val_pred_values), 3)))
    
    plt.figure(figsize=(18,10))
    
    plt.subplot(121)
    sns.scatterplot(x=train_pred_values, y=train_true_values)
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('Train sample prediction')
    
    plt.subplot(122)
    sns.scatterplot(x=val_pred_values, y=val_true_values)
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('Test sample prediction')
    plt.show()

In [4]:
train_link = './data/train.csv'
test_link = './data/test.csv'

### Загрузка и обзор данных

**Описание датасета**

* **Id** - идентификационный номер квартиры
* **DistrictId** - идентификационный номер района
* **Rooms** - количество комнат
* **Square** - площадь
* **LifeSquare** - жилая площадь
* **KitchenSquare** - площадь кухни
* **Floor** - этаж
* **HouseFloor** - количество этажей в доме
* **HouseYear** - год постройки дома
* **Ecology_1, Ecology_2, Ecology_3** - экологические показатели местности
* **Social_1, Social_2, Social_3** - социальные показатели местности
* **Healthcare_1, Helthcare_2** - показатели местности, связанные с охраной здоровья
* **Shops_1, Shops_2** - показатели, связанные с наличием магазинов, торговых центров
* **Price** - цена квартиры

In [5]:
train_df = pd.read_csv(train_link, dtype ={
    'Id':'int32', 'DistrictId':'str','Rooms':'int16',
    'Square':'float32','LifeSquare':'float32','KitchenSquare':'float32',
    'Floor':'int8','HouseFloor':'int8','HouseYear':'int32',
    'Ecology_1':'float32','Ecology_2':'str','Ecology_3':'str',
    'Social_1':'int32','Social_2':'int64','Social_3':'int32',
    'Healthcare_1':'float64','Helthcare_2':'int32',
    'Shops_1':'int32','Shops_2':'str',
    'Price':'float32'}).set_index('Id')
test_df = pd.read_csv(test_link, dtype ={
    'Id':'int32', 'DistrictId':'str','Rooms':'int16',
    'Square':'float32','LifeSquare':'float32','KitchenSquare':'float32',
    'Floor':'int8','HouseFloor':'int8','HouseYear':'int32',
    'Ecology_1':'float32','Ecology_2':'str','Ecology_3':'str',
    'Social_1':'int32','Social_2':'int64','Social_3':'int32',
    'Healthcare_1':'float64','Helthcare_2':'int32',
    'Shops_1':'int32','Shops_2':'str'}).set_index('Id')

In [6]:
train_dfs = ps.DataFrameSummary(train_df)
train_dfs.columns_stats.T

Unnamed: 0,counts,uniques,missing,missing_perc,types
DistrictId,10000,205,0,0%,categorical
Rooms,10000,9,0,0%,numeric
Square,10000,9995,0,0%,numeric
LifeSquare,7887,7886,2113,21.13%,numeric
KitchenSquare,10000,58,0,0%,numeric
Floor,10000,33,0,0%,numeric
HouseFloor,10000,44,0,0%,numeric
HouseYear,10000,97,0,0%,numeric
Ecology_1,10000,129,0,0%,numeric
Ecology_2,10000,2,0,0%,bool


In [7]:
test_dfs = ps.DataFrameSummary(test_df)
test_dfs.columns_stats.T

Unnamed: 0,counts,uniques,missing,missing_perc,types
DistrictId,5000,201,0,0%,categorical
Rooms,5000,8,0,0%,numeric
Square,5000,4998,0,0%,numeric
LifeSquare,3959,3958,1041,20.82%,numeric
KitchenSquare,5000,38,0,0%,numeric
Floor,5000,35,0,0%,numeric
HouseFloor,5000,41,0,0%,numeric
HouseYear,5000,97,0,0%,numeric
Ecology_1,5000,130,0,0%,numeric
Ecology_2,5000,2,0,0%,bool


In [8]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,10000.0,1.8905,0.83951,0.0,1.0,2.0,2.0,19.0
Square,10000.0,56.31578,21.05873,1.13686,41.77488,52.51331,65.90063,641.06519
LifeSquare,7887.0,37.19965,86.24121,0.37062,22.76983,32.78126,45.1288,7480.59229
KitchenSquare,10000.0,6.2733,28.56092,0.0,1.0,6.0,9.0,2014.0
Floor,10000.0,8.5267,5.24115,1.0,4.0,7.0,12.0,42.0
HouseFloor,10000.0,12.6094,6.77597,0.0,9.0,13.0,17.0,117.0
HouseYear,10000.0,3990.1663,200500.26143,1910.0,1974.0,1977.0,2001.0,20052011.0
Ecology_1,10000.0,0.11886,0.11903,0.0,0.01765,0.07542,0.19578,0.52187
Social_1,10000.0,24.687,17.53261,0.0,6.0,25.0,36.0,74.0
Social_2,10000.0,5352.1574,4006.7998,168.0,1564.0,5285.0,7227.0,19083.0


In [9]:
test_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,5000.0,1.91,0.83859,0.0,1.0,2.0,2.0,17.0
Square,5000.0,56.4495,19.09279,1.37854,41.90623,52.92134,66.28513,223.45369
LifeSquare,3959.0,36.15881,17.82529,0.33349,23.09203,32.92509,45.17409,303.07111
KitchenSquare,5000.0,5.9768,9.95002,0.0,1.0,6.0,9.0,620.0
Floor,5000.0,8.632,5.48323,1.0,4.0,7.0,12.0,78.0
HouseFloor,5000.0,12.601,6.78921,0.0,9.0,12.0,17.0,99.0
HouseYear,5000.0,1984.3926,18.57315,1908.0,1973.0,1977.0,2000.0,2020.0
Ecology_1,5000.0,0.11987,0.12007,0.0,0.01951,0.07216,0.19578,0.52187
Social_1,5000.0,24.9338,17.5322,0.0,6.0,25.0,36.0,74.0
Social_2,5000.0,5406.9,4026.61477,168.0,1564.0,5285.0,7287.0,19083.0


In [10]:
train_df.describe(include=['O']).T

Unnamed: 0,count,unique,top,freq
DistrictId,10000,205,27,851
Ecology_2,10000,2,B,9903
Ecology_3,10000,2,B,9725
Shops_2,10000,2,B,9175


In [11]:
test_df.describe(include=['O']).T

Unnamed: 0,count,unique,top,freq
DistrictId,5000,201,27,391
Ecology_2,5000,2,B,4952
Ecology_3,5000,2,B,4851
Shops_2,5000,2,B,4588


### Очистка и обработка данных

In [12]:
train_df['DistrictId'] = train_df['DistrictId'].astype(str)
test_df['DistrictId'] = test_df['DistrictId'].astype(str)

In [13]:
del train_df['Healthcare_1']
del test_df['Healthcare_1']

In [14]:
train_df.loc[(train_df['Rooms']==0), 'Rooms']=1
train_df.loc[(train_df['Rooms']>7), 'Rooms']=7
test_df.loc[(test_df['Rooms']==0), 'Rooms']=1
test_df.loc[(test_df['Rooms']>7), 'Rooms']=7

In [15]:
train_df.loc[(train_df['HouseYear']==4968), 'HouseYear']=1968
train_df.loc[(train_df['HouseYear']==20052011), 'HouseYear']=2011
train_df.loc[(train_df['HouseYear']<1800), 'HouseYear']=train_df['HouseYear'].median()
test_df.loc[(test_df['HouseYear']==4968), 'HouseYear']=1968
test_df.loc[(test_df['HouseYear']==20052011), 'HouseYear']=2011
test_df.loc[(test_df['HouseYear']<1800), 'HouseYear']=test_df['HouseYear'].median()

In [16]:
train_df['HouseAge'] = 2020 - train_df['HouseYear']
test_df['HouseAge'] = 2020 - test_df['HouseYear']

In [17]:
train_df.loc[(train_df['Floor']>150),'Floor']=train_df['Floor'].median()
test_df.loc[(test_df['Floor']>150),'Floor']=test_df['Floor'].median()

In [18]:
train_df.loc[(train_df['HouseFloor']<1)]=1
#train_df.loc[(train_df['HouseFloor']<train_df['Floor']), 'HouseFloor']=train_df.loc[(train_df['HouseFloor']<train_df['Floor']), 'Floor']
train_df['HouseFloor'] = np.where(train_df['HouseFloor'] < train_df['Floor'], train_df['Floor'], train_df['HouseFloor'])
test_df.loc[(test_df['HouseFloor']<1)]=1
#test_df.loc[(test_df['HouseFloor']<test_df['Floor']), 'HouseFloor']=test_df.loc[(test_df['HouseFloor']<test_df['Floor']), 'Floor']
test_df['HouseFloor'] = np.where(test_df['HouseFloor'] < test_df['Floor'], test_df['Floor'], test_df['HouseFloor'])

In [19]:
train_df.loc[train_df['Square']>train_df['Square'].quantile(0.999),'Square'] = train_df['Square'].quantile(0.999)
train_df.loc[train_df['Square']<train_df['Square'].quantile(0.01),'Square'] = train_df['Square'].quantile(0.01)
test_df.loc[test_df['Square']>test_df['Square'].quantile(0.999),'Square'] = test_df['Square'].quantile(0.999)
test_df.loc[test_df['Square']<test_df['Square'].quantile(0.01),'Square'] = test_df['Square'].quantile(0.01)

In [20]:
train_df.loc[train_df['KitchenSquare']>train_df['KitchenSquare'].quantile(0.999),'KitchenSquare'] = train_df['KitchenSquare'].quantile(0.999)
train_df.loc[train_df['KitchenSquare']<train_df['KitchenSquare'].quantile(0.01),'KitchenSquare'] = train_df['KitchenSquare'].quantile(0.01)
test_df.loc[test_df['KitchenSquare']>test_df['KitchenSquare'].quantile(0.999),'KitchenSquare'] = test_df['KitchenSquare'].quantile(0.999)
test_df.loc[test_df['KitchenSquare']<test_df['KitchenSquare'].quantile(0.01),'KitchenSquare'] = test_df['KitchenSquare'].quantile(0.01)

In [21]:
train_df['KitchenSquare'] = np.where(train_df['Square'] < train_df['KitchenSquare'], train_df['Square']*0.5, train_df['KitchenSquare'])
test_df['KitchenSquare'] = np.where(test_df['Square'] < test_df['KitchenSquare'], test_df['Square']*0.5, test_df['KitchenSquare'])

In [22]:
train_df.loc[train_df['LifeSquare']>train_df['LifeSquare'].quantile(0.999),'LifeSquare'] = train_df['LifeSquare'].quantile(0.999)
train_df.loc[train_df['LifeSquare']<train_df['LifeSquare'].quantile(0.01),'LifeSquare'] = train_df['LifeSquare'].quantile(0.01)
test_df.loc[test_df['LifeSquare']>test_df['LifeSquare'].quantile(0.999),'LifeSquare'] = test_df['LifeSquare'].quantile(0.999)
test_df.loc[test_df['LifeSquare']<test_df['LifeSquare'].quantile(0.01),'LifeSquare'] = test_df['LifeSquare'].quantile(0.01)

In [23]:
train_df['LifeSquare'] = np.where(train_df['Square'] < train_df['LifeSquare'], train_df['Square']*0.9, train_df['LifeSquare'])
test_df['LifeSquare'] = np.where(test_df['Square'] < test_df['LifeSquare'], test_df['Square']*0.9, test_df['LifeSquare'])

In [24]:
train_df['Life_share'] = train_df['LifeSquare'] / train_df['Square']
test_df['Life_share'] = test_df['LifeSquare'] / test_df['Square']
train_df.loc[(train_df['Life_share'].isnull()),'Life_share'] = train_df['Life_share'].median()
test_df.loc[(test_df['Life_share'].isnull()),'Life_share'] = test_df['Life_share'].median()

In [25]:
avg_price = train_df.groupby(['DistrictId'], as_index=False).agg({'Square':'sum','Price':'sum'})
avg_price['avg_price_meter'] = avg_price['Price'] / avg_price['Square']
avg_price = avg_price[['DistrictId','avg_price_meter']]

In [26]:
train_df = train_df.reset_index().merge(avg_price, on='DistrictId', how='left').set_index('Id')
train_df['avg_price_f'] = train_df['Square'] * train_df['avg_price_meter']
test_df = test_df.reset_index().merge(avg_price, on='DistrictId', how='left',left_index=True).set_index('Id')
test_df['avg_price_f'] = test_df['Square'] * test_df['avg_price_meter']
test_df.loc[(test_df['avg_price_f'].isnull())]=test_df['avg_price_f'].mean()

In [27]:
train_df['Ecology_2'] = train_df['Ecology_2'].replace({'A':0, 'B':1})
train_df['Ecology_3'] = train_df['Ecology_3'].replace({'A':0, 'B':1})
train_df['Shops_2'] = train_df['Shops_2'].replace({'A':0, 'B':1})
test_df['Ecology_2'] = test_df['Ecology_2'].replace({'A':0, 'B':1})
test_df['Ecology_3'] = test_df['Ecology_3'].replace({'A':0, 'B':1})
test_df['Shops_2'] = test_df['Shops_2'].replace({'A':0, 'B':1})

In [28]:
train_add = train_df[['Ecology_1','Ecology_2','Ecology_3','Social_1','Social_2','Social_3','Helthcare_2','Shops_1','Shops_2']]
test_add = test_df[['Ecology_1','Ecology_2','Ecology_3','Social_1','Social_2','Social_3','Helthcare_2','Shops_1','Shops_2']]
pca = PCA(n_components=3)
X_train_add = pd.DataFrame(pca.fit_transform(train_add), columns=['add1','add2','add3'],index=train_add.index)
X_test_add = pd.DataFrame(pca.transform(test_add), columns=['add1','add2','add3'],index=test_add.index)

In [29]:
#train_df['Rooms'].value_counts()
#test_df['Rooms'].value_counts()
#train_df['HouseYear'].unique()
#train_df['HouseYear'].value_counts()
#test_df['HouseYear'].unique()
#test_df['HouseYear'].value_counts()
#train_df['Floor'].unique()
#train_df['Floor'].value_counts()
#test_df['Floor'].unique()
#test_df['Floor'].value_counts()
#np.sort(train_df['HouseFloor'].unique())
#train_df['HouseFloor'].value_counts().sort_index()
#np.sort(test_df['HouseFloor'].unique())
#test_df['HouseFloor'].value_counts().sort_index()
#train_df['DistrictId'].value_counts()
#test_df['DistrictId'].value_counts()

In [30]:
y = train_df['Price']
X_train = train_df[['Rooms','Square','avg_price_f','Life_share','KitchenSquare','HouseYear','Floor','HouseFloor']]
X_test = test_df[['Rooms','Square','avg_price_f','Life_share','KitchenSquare','HouseYear','Floor','HouseFloor']]

In [31]:
X_train2 = X_train.join(X_train_add)
X_test2 = X_test.join(X_test_add)

### Моделирование

In [56]:
X_train21, X_val21, y_train21, y_val21 = train_test_split(X_train2, y, test_size=0.3, shuffle=True, random_state=42)

In [57]:
rf_model = RandomForestRegressor(random_state = 21)
rf_model.fit(X_train21, y_train21)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=21, verbose=0, warm_start=False)

In [58]:
y_pred_tr_rf21 = rf_model.predict(X_train21)
r2(y_train21, y_pred_tr_rf21)

0.968159709764196

In [59]:
y_pred_val_rf21 = rf_model.predict(X_val21)
r2(y_val21, y_pred_val_rf21)

0.7636711668290047

In [60]:
y_test_preds_rf21 = rf_model.predict(X_test2)

In [61]:
test_data_rf21 = pd.DataFrame(y_test_preds_rf21, columns=['Price'],index=X_test2.index)
test_data_rf21.head()

Unnamed: 0_level_0,Price
Id,Unnamed: 1_level_1
725,162838.23227
15856,234814.615
5480,132872.85539
15664,298892.03219
14275,134704.78164


In [62]:
test_data_rf21.to_csv('prediction_rf21.csv', index=True)

In [67]:
ln_model21 = LinearRegression()

In [69]:
ln_model21.fit(X_train21, y_train21)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [71]:
y_pred_tr_ln21 = ln_model21.predict(X_train21)
r2(y_train21, y_pred_tr_ln21)

0.751918656994333

In [72]:
y_test_preds_ln21 = ln_model21.predict(X_test2)

In [73]:
test_data_ln21 = pd.DataFrame(y_test_preds_ln21, columns=['Price'],index=X_test2.index)
test_data_ln21.to_csv('prediction_ln21.csv', index=True)

In [74]:
model_gb21 = GradientBoostingRegressor(n_estimators=109,max_depth=5,random_state=2,max_features=6)

In [75]:
model_gb21.fit(X_train21, y_train21)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=5,
                          max_features=6, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=109,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=2, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [76]:
print(model_gb21.score(X_train21,y_train21))
print(model_gb21.score(X_val21,y_val21))

0.8882728084876189
0.7729110554182479


In [78]:
predict_result_gb21 = pd.DataFrame(model_gb21.predict(X_test2), index=X_test2.index,columns=['Price'])

In [80]:
predict_result_gb21.to_csv('prediction_gb21.csv', index=True)

In [42]:
test_df.head()

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,HouseAge,Life_share,avg_price_meter,avg_price_f
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
725,58,2.0,49.88264,33.43278,6.0,6.0,14.0,1972.0,0.3102,1.0,1.0,11.0,2748.0,1.0,0.0,0.0,1.0,48.0,0.67023,2971.35156,148218.875
15856,74,2.0,69.26318,,1.0,6.0,6.0,1977.0,0.07578,1.0,1.0,6.0,1437.0,3.0,0.0,2.0,1.0,43.0,0.62352,3276.81494,226962.64062
5480,190,1.0,13.59782,12.23804,12.0,2.0,5.0,1909.0,0.0,1.0,1.0,30.0,7538.0,87.0,5.0,5.0,1.0,111.0,0.9,7735.22412,105182.17188
15664,47,2.0,73.04661,51.94084,9.0,22.0,22.0,2007.0,0.10187,1.0,1.0,23.0,4583.0,3.0,3.0,3.0,1.0,13.0,0.71106,4063.43896,296820.4375
14275,27,1.0,47.52711,43.38757,1.0,17.0,17.0,2017.0,0.07216,1.0,1.0,2.0,629.0,1.0,0.0,0.0,0.0,3.0,0.9129,2713.92017,128984.78125


In [32]:
X_train2_dfs = ps.DataFrameSummary(X_train2)
X_train2_dfs.columns_stats.T

Unnamed: 0,counts,uniques,missing,missing_perc,types
Rooms,10000,7,0,0%,numeric
Square,10000,9718,0,0%,numeric
avg_price_f,10000,9727,0,0%,numeric
Life_share,10000,7368,0,0%,numeric
KitchenSquare,10000,54,0,0%,numeric
HouseYear,10000,96,0,0%,numeric
Floor,10000,33,0,0%,numeric
HouseFloor,10000,43,0,0%,numeric
add1,10000,144,0,0%,numeric
add2,10000,145,0,0%,numeric


In [33]:
X_test2_dfs = ps.DataFrameSummary(X_test2)
X_test2_dfs.columns_stats.T

Unnamed: 0,counts,uniques,missing,missing_perc,types
Rooms,5000,8,0,0%,numeric
Square,5000,4858,0,0%,numeric
avg_price_f,5000,4861,0,0%,numeric
Life_share,5000,3697,0,0%,numeric
KitchenSquare,5000,37,0,0%,numeric
HouseYear,5000,98,0,0%,numeric
Floor,5000,36,0,0%,numeric
HouseFloor,5000,42,0,0%,numeric
add1,5000,145,0,0%,numeric
add2,5000,145,0,0%,numeric
