### Загрузка библиотек и настроек

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from mpl_toolkits.mplot3d.axes3d import Axes3D
import pandas_summary as ps

%matplotlib inline
import matplotlib
import matplotlib.image as img
import matplotlib.pyplot as plt

import seaborn as sns
%config InlineBackend.figure_format = 'svg'

from pylab import rcParams
# Default figure size (width, height) for every matplotlib figure in the notebook.
rcParams['figure.figsize'] = (12, 6)

In [2]:
# Base font size for all matplotlib text elements.
matplotlib.rcParams['font.size'] = 14
# pandas display options.
# Fully qualified "display.*" keys are used on purpose: the short forms
# ('precision', 'max_columns') also match "styler.*" options in recent pandas
# releases, which makes set_option raise OptionError on a fresh kernel.
pd.set_option('display.precision', 3)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# The earlier max_columns=100 assignment was immediately overridden by 500,
# so only the effective value is kept.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 300)

In [3]:
def evaluate_preds(train_true_values, train_pred_values, val_true_values, val_pred_values):
    """
    Report model quality on the training and validation parts of the dataset.

    Prints the R2 score (rounded to 3 decimals) for each part and shows one
    figure with two side-by-side scatter plots of true values against
    predicted values.

    Parameters:
    train_true_values - target variable from the training part of the dataset
    train_pred_values - model predictions for the training part
    val_true_values - target variable from the validation part of the dataset
    val_pred_values - model predictions for the validation part
    Returns:
    None - the scores are printed and the plots are displayed.
    """
    train_score = round(r2(train_true_values, train_pred_values), 3)
    print("Train R2:\t" + str(train_score))
    valid_score = round(r2(val_true_values, val_pred_values), 3)
    print("Valid R2:\t" + str(valid_score))

    plt.figure(figsize=(18,10))

    # One entry per panel: subplot position, x data, y data, panel title.
    # NOTE(review): the right-hand title reads "Test" although this panel
    # plots the validation values passed to the function.
    panels = (
        (121, train_pred_values, train_true_values, 'Train sample prediction'),
        (122, val_pred_values, val_true_values, 'Test sample prediction'),
    )
    for position, predicted, actual, heading in panels:
        plt.subplot(position)
        sns.scatterplot(x=predicted, y=actual)
        plt.xlabel('Predicted values')
        plt.ylabel('True values')
        plt.title(heading)
    plt.show()

In [4]:
# Relative locations of the training and test CSV files.
train_link, test_link = './data/train.csv', './data/test.csv'

### Загрузка и обзор данных

**Описание датасета**

* **Id** - идентификационный номер квартиры
* **DistrictId** - идентификационный номер района
* **Rooms** - количество комнат
* **Square** - площадь
* **LifeSquare** - жилая площадь
* **KitchenSquare** - площадь кухни
* **Floor** - этаж
* **HouseFloor** - количество этажей в доме
* **HouseYear** - год постройки дома
* **Ecology_1, Ecology_2, Ecology_3** - экологические показатели местности
* **Social_1, Social_2, Social_3** - социальные показатели местности
* **Healthcare_1, Helthcare_2** - показатели местности, связанные с охраной здоровья
* **Shops_1, Shops_2** - показатели, связанные с наличием магазинов, торговых центров
* **Price** - цена квартиры

In [16]:
# Read both datasets, using the 'Id' column as the row index.
train_df = pd.read_csv(train_link, index_col='Id')
test_df = pd.read_csv(test_link, index_col='Id')

In [48]:
# Per-column summary of the training frame (counts, uniques, missing values, types).
train_dfs = ps.DataFrameSummary(train_df)
train_dfs.columns_stats.transpose()

Unnamed: 0,counts,uniques,missing,missing_perc,types
DistrictId,9989,205,0,0%,categorical
Rooms,9989,6,0,0%,numeric
Square,9989,9989,0,0%,numeric
LifeSquare,7879,7879,2110,21.12%,numeric
KitchenSquare,9989,58,0,0%,numeric
Floor,9989,33,0,0%,numeric
HouseFloor,9989,44,0,0%,numeric
HouseYear,9989,97,0,0%,numeric
Ecology_1,9989,129,0,0%,numeric
Ecology_2,9989,2,0,0%,bool


In [49]:
# Per-column summary of the test frame (counts, uniques, missing values, types).
test_dfs = ps.DataFrameSummary(test_df)
test_dfs.columns_stats.transpose()

Unnamed: 0,counts,uniques,missing,missing_perc,types
DistrictId,5000,201,0,0%,categorical
Rooms,5000,8,0,0%,numeric
Square,5000,5000,0,0%,numeric
LifeSquare,3959,3959,1041,20.82%,numeric
KitchenSquare,5000,38,0,0%,numeric
Floor,5000,35,0,0%,numeric
HouseFloor,5000,41,0,0%,numeric
HouseYear,5000,97,0,0%,numeric
Ecology_1,5000,130,0,0%,numeric
Ecology_2,5000,2,0,0%,bool


In [50]:
# Descriptive statistics of the training frame, one row per column.
train_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,9989.0,1.88868,0.81254,1.0,1.0,2.0,2.0,6.0
Square,9989.0,56.29621,20.98317,1.13686,41.77318,52.51277,65.89258,641.06519
LifeSquare,7879.0,37.16771,86.25241,0.37062,22.77496,32.77243,45.12502,7480.59213
KitchenSquare,9989.0,6.2766,28.57609,0.0,1.0,6.0,9.0,2014.0
Floor,9989.0,8.52648,5.23914,1.0,4.0,7.0,12.0,42.0
HouseFloor,9989.0,12.61548,6.77189,0.0,9.0,13.0,17.0,117.0
HouseYear,9989.0,3992.364,200610.62773,1910.0,1974.0,1977.0,2001.0,20052011.0
Ecology_1,9989.0,0.11884,0.11906,0.0,0.01765,0.07542,0.19578,0.52187
Social_1,9989.0,24.69236,17.52746,0.0,6.0,25.0,36.0,74.0
Social_2,9989.0,5353.002,4005.2088,168.0,1564.0,5285.0,7227.0,19083.0


In [51]:
# Descriptive statistics of the test frame, one row per column.
test_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,5000.0,1.91,0.83859,0.0,1.0,2.0,2.0,17.0
Square,5000.0,56.4495,19.09279,1.37854,41.90623,52.92134,66.28513,223.45369
LifeSquare,3959.0,36.15881,17.82529,0.33349,23.09203,32.92509,45.17409,303.07109
KitchenSquare,5000.0,5.9768,9.95002,0.0,1.0,6.0,9.0,620.0
Floor,5000.0,8.632,5.48323,1.0,4.0,7.0,12.0,78.0
HouseFloor,5000.0,12.601,6.78921,0.0,9.0,12.0,17.0,99.0
HouseYear,5000.0,1984.3926,18.57315,1908.0,1973.0,1977.0,2000.0,2020.0
Ecology_1,5000.0,0.11987,0.12007,0.0,0.01951,0.07216,0.19578,0.52187
Social_1,5000.0,24.9338,17.5322,0.0,6.0,25.0,36.0,74.0
Social_2,5000.0,5406.9,4026.61477,168.0,1564.0,5285.0,7287.0,19083.0


In [52]:
# Summary of the object-dtype columns of the training frame (count, unique, top, freq).
train_df.describe(include=['O']).transpose()

Unnamed: 0,count,unique,top,freq
DistrictId,9989,205,27,846
Ecology_2,9989,2,B,9892
Ecology_3,9989,2,B,9714
Shops_2,9989,2,B,9164


In [53]:
# Summary of the object-dtype columns of the test frame (count, unique, top, freq).
test_df.describe(include=['O']).transpose()

Unnamed: 0,count,unique,top,freq
DistrictId,5000,201,27,391
Ecology_2,5000,2,B,4952
Ecology_3,5000,2,B,4851
Shops_2,5000,2,B,4588


### Очистка и обработка данных

In [21]:
# Store DistrictId as a string in both frames.
for frame in (train_df, test_df):
    frame['DistrictId'] = frame['DistrictId'].astype(str)

In [39]:
# Remove the Healthcare_1 column from both frames in place.
train_df.drop(columns='Healthcare_1', inplace=True)
test_df.drop(columns='Healthcare_1', inplace=True)

In [47]:
# Drop training rows whose Rooms value is one of 0, 10 or 19.
bad_rooms_mask = train_df['Rooms'].isin([0, 10, 19])
train_df = train_df.drop(index=train_df.index[bad_rooms_mask])

In [54]:
# Frequency of each Rooms value in the training frame.
train_df.Rooms.value_counts()

2.00000    3880
1.00000    3705
3.00000    2235
4.00000     150
5.00000      18
6.00000       1
Name: Rooms, dtype: int64

In [55]:
# Frequency of each Rooms value in the test frame.
test_df.Rooms.value_counts()

2.00000     2030
1.00000     1769
3.00000     1099
4.00000       90
5.00000        7
0.00000        2
6.00000        2
17.00000       1
Name: Rooms, dtype: int64