# Moduls

In [82]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import r2_score as r2
import warnings

warnings.filterwarnings('ignore')
pd.options.display.max_columns = 100

In [83]:
data = pd.read_csv('D:\Kurs_project_task/train.csv')

In [84]:
# data.shape

In [85]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


### train_test_split

In [86]:
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [87]:
# train.shape, valid.shape

# Prepare train data

In [88]:
distr_info1 = train['DistrictId'].value_counts(normalize=True).reset_index().rename(columns={'index':'DistrictId', 
                                                                               'DistrictId':'flat_qty_distr'})

In [89]:
distr_stat_dr = train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']].mean().\
    rename(columns={'Price':'mean_price_dr'})  
distr_stat_r = train.groupby(['Rooms'], as_index=False)[['Price']].mean().rename(columns={'Price':'mean_price_r'})

mean_price = train['Price'].mean()

In [90]:
#Заполняем пустую жилую площадь
train['LifeSquare'] = train['LifeSquare'].fillna(train['Square'])
#Заполняем площадь если общая меньше жилой
train.loc[(train['Square'] < train['LifeSquare']), 'LifeSquare'] = train['LifeSquare']
train['Diff_square'] = train['Square'] - train['LifeSquare']
mean_square = train.loc[(train['Diff_square'] > 4)].groupby('Rooms', as_index=False)[['Square','LifeSquare', 'Diff_square']].mean().rename(columns={'Square': 'Mean_square','LifeSquare': 'Mean_LifeSquare','Diff_square': 'Mean_Diff_square'})
mean_sqr = mean_square.loc[(mean_square['Rooms'] != 0) & (mean_square['Rooms'] < 6)].sort_values(by='Rooms')


In [91]:
def add_district_info1(df, distr_info1):
    df = pd.merge(df, distr_info1, on='DistrictId', how='left')
    df['flat_qty_distr'] = df['flat_qty_distr'].fillna(0.000143)
    return df

In [92]:
def add_stats(df, distr_stat_dr, distr_stat_r, mean_price):
    df = pd.merge(df, distr_stat_dr, on=['DistrictId', 'Rooms'], how='left')
    df = pd.merge(df, distr_stat_r, on='Rooms', how='left')
    
    df['mean_price_r'] = df['mean_price_r'].fillna(mean_price)
    df['mean_price_dr'] = df['mean_price_dr'].fillna(df['mean_price_r'])
    return df

In [93]:
def add_cat_fts(df, cat_fts=('Ecology_2', 'Ecology_3', 'Shops_2')):
    for col in cat_fts:
        df[col] = (df[col] == 'B').astype(int)
    return df

In [94]:
# def fillna_healthcare_1(df):
#     df['Healthcare_1'] = df['Healthcare_1'].fillna(0)
#     return df

In [95]:
def drop_columns(df, col):
    return df.drop(col, axis=1)

In [96]:
def house_period_type(df):
#     df['HouseYearType'] = 0
    df.loc[(df['HouseYear'] < 1990), 'HouseYearType'] = 0    
    df.loc[(df['HouseYear'] >= 1990) & (df['HouseYear'] < 2000), 'HouseYearType'] = 1
    df.loc[(df['HouseYear'] >= 2000) & (df['HouseYear'] < 2018), 'HouseYearType'] = 2
    df.loc[(df['HouseYear'] >= 2018), 'HouseYearType'] = 3  
    return df

In [97]:
def flat_floor_type(df):
#     df['FlatFloorType'] = 0
    df.loc[(df['Floor'] <= 2), 'FlatFloorType'] = 0    
    df.loc[(df['Floor'] > 2) & (df['Floor'] <= 9), 'FlatFloorType'] = 1
    df.loc[(df['Floor'] > 9), 'FlatFloorType'] = 2  
    return df

In [98]:
def bilding_floor_type(df):
#     df['FloorType'] = 0
    df.loc[df['HouseFloor'] <= 5, 'FloorType'] = 0    
    df.loc[(df['HouseFloor'] > 5) & (df['HouseFloor'] <= 9), 'FloorType'] = 1
    df.loc[(df['HouseFloor'] > 9) & (df['HouseFloor'] <= 17), 'FloorType'] = 2
    df.loc[df['HouseFloor'] > 17, 'FloorType'] = 3
    return df

In [99]:
def fill_life_square(df):
    #Заполняем пустую жилую площадь
    df['Square'] = df['Square'].fillna(df['Mean_square'])
    df['LifeSquare'] = df['LifeSquare'].fillna(df['Square'])
    #Заполняем площадь если общая меньше жилой
    df.loc[(df['Square'] < df['LifeSquare']), 'LifeSquare'] = df['LifeSquare']
    

In [100]:
def fillrooms_bysize(df, m_square):
    for idx, row in df.loc[(df['Rooms'] > 5) | (df['Rooms'] < 1)].iterrows():
        min_m = m_square.loc[m_square['Mean_square'] < row['Square'], 'Rooms']
        if len(min_m.index) == 0:
            df.loc[df['Id'] == row['Id'], 'Rooms'] = m_square.iloc[0,0]
        else:
            df.loc[data['Id'] == row['Id'], 'Rooms'] = min_m.index[-1]

    df = pd.merge(df, m_square, on='Rooms', how='left')
    return df

In [101]:
def prepare_data(df, distr_info1, distr_stat_dr, distr_stat_r, mean_price, mean_square):
# def prepare_data(df, distr_info1, distr_stat_dr, distr_stat_r, mean_price):
    df = fillrooms_bysize(df, mean_square)
    df = house_period_type(df)
    df = flat_floor_type(df)
    df = bilding_floor_type(df)
    df = drop_columns(df, ['Healthcare_1'])
#     df = fill_life_square(df)
    
    df = add_district_info1(df, distr_info1)
    df = add_stats(df, distr_stat_dr, distr_stat_r, mean_price)
    df = add_cat_fts(df)

    return df

In [102]:
train.info
train = prepare_data(train, distr_info1, distr_stat_dr, distr_stat_r, mean_price, mean_sqr)
# train = prepare_data(train, distr_info1, distr_stat_dr, distr_stat_r, mean_price)

In [103]:
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price,Diff_square,Mean_square,Mean_LifeSquare,Mean_Diff_square,HouseYearType,FlatFloorType,FloorType,flat_qty_distr,mean_price_dr,mean_price_r
0,14604,23,1.0,41.68138,22.796166,8.0,14,17.0,2015,0.075779,1,1,6,1437,3,0,2,1,88504.384965,18.885214,39.768384,21.272841,18.495543,2.0,2.0,2.0,0.056286,102427.030975,160134.810901
1,5621,23,3.0,163.495333,161.504222,12.0,5,3.0,1977,0.014073,1,1,2,475,0,0,0,1,207007.956663,1.991111,74.017092,46.37392,27.643172,0.0,1.0,0.0,0.056286,165911.1297,290867.452543
2,235,87,1.0,39.710131,19.538663,8.0,4,17.0,1986,0.100456,1,1,43,7227,0,1,6,0,182126.280899,20.171468,39.768384,21.272841,18.495543,0.0,1.0,2.0,0.003,169596.630515,160134.810901
3,16258,48,3.0,96.056784,98.152802,1.0,15,1.0,2017,0.041125,1,1,46,9515,5,1,10,1,524365.550705,-2.096018,74.017092,46.37392,27.643172,2.0,2.0,0.0,0.008857,382424.639356,290867.452543
4,10773,77,3.0,79.195825,44.421062,10.0,16,17.0,1984,0.298205,1,1,16,4048,3,1,3,1,322048.43399,34.774763,74.017092,46.37392,27.643172,0.0,2.0,2.0,0.004,251751.766701,290867.452543


In [104]:
# train.info()

In [105]:
valid = prepare_data(valid, distr_info1, distr_stat_dr, distr_stat_r, mean_price, mean_sqr)
# valid = prepare_data(valid, distr_info1, distr_stat_dr, distr_stat_r, mean_price)

### Model

In [106]:
# train.columns

In [107]:
# feats = ['Rooms', 'Square', 'flat_qty_distr', 'mean_price_dr', 'Helthcare_2', 'Healthcare_1', 'HouseYear', 'HouseFloor']
# feats = ['HouseYear', 'Rooms', 'Square',  'HouseFloor', 'Floor', 'flat_qty_distr','Shops_1', 'Shops_2', 'Helthcare_2']
# feats = ['Shops_2', 'Price']
feats = ['FlatFloorType', 'HouseYearType','FloorType', 'Rooms','Square','flat_qty_distr']#, 'LifeSquare']

In [108]:
# model = RF(n_estimators=300, max_depth=16, random_state=42, max_features=4, min_samples_leaf=2) 70.7
model = RF(n_estimators=300, max_depth=13, random_state=42, max_features=2, min_samples_leaf=1)

In [109]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=13,
           max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [110]:
pred_train = model.predict(train.loc[:, feats])

In [111]:
# pred_train.shape

In [112]:
# pred_train

In [113]:
pred_valid = model.predict(valid.loc[:, feats])

In [114]:
# pred_valid.shape

In [115]:
# pred_valid

## Find trates

### Evaluate model

In [116]:
r2(train['Price'], pred_train)

0.8792479562430546

In [117]:
r2(valid['Price'], pred_valid)

0.6629315874029599

### Test

In [118]:
# ?prepare_data

In [119]:
test = pd.read_csv('D:\Kurs_project_task/test.csv')

In [120]:
# test = prepare_data(test, distr_info1, distr_stat_dr, distr_stat_r, mean_price)
test = prepare_data(test, distr_info1, distr_stat_dr, distr_stat_r, mean_price, mean_sqr)

In [121]:
test['Price'] = model.predict(test.loc[:, feats])

In [122]:
test.loc[:, ['Id', 'Price']].to_csv('PSubbotin_predictions.csv', index=None)