In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the three competition CSV files (comma is already the default delimiter of read_csv).
df = pd.read_csv('/kaggle/input/real-estate-price-prediction-moscow/sample_submission.csv')
dftr = pd.read_csv('/kaggle/input/real-estate-price-prediction-moscow/train.csv')
dfts = pd.read_csv('/kaggle/input/real-estate-price-prediction-moscow/test.csv')

# Only the last expression of a cell is rendered, so just the test-set preview is shown.
df.tail()
dftr.tail()
dfts.head()

In [3]:
# Print the dimensions and the column dtypes of the train frame, then of the test frame.
for frame in (dftr, dfts):
    print(frame.shape)
    print(frame.dtypes)

In [4]:
# Store the identifier columns as strings in both the train and the test frame.
for frame in (dftr, dfts):
    for id_col in ('Id', 'DistrictId'):
        frame[id_col] = frame[id_col].astype(str)

In [5]:
# Histogram of the target (counts, 30 bins).
price_ax = dftr['Price'].hist(bins=30)
price_ax.set_xlabel('prices')
price_ax.set_ylabel('count')
plt.show()

# The price distribution shows that the dataset is dominated by flats priced
# around 200,000 (roughly 1,300 of them); the number of more expensive flats
# falls off exponentially, the number of cheaper ones almost linearly.

In [6]:
# Summary statistics of the numeric training columns.
train_summary = dftr.describe()
print(train_summary)

# The summary points to missing values in LifeSquare, HouseYear and Healthcare_1,
# and to outliers in HouseYear, Ecology_1, Social_3, Healthcare_1, Helthcare_2
# and Shops_1.

In [7]:
# Histogram of construction years on a log count axis.
dftr['HouseYear'].hist(density=False, bins=10)
plt.yscale(value='log')
plt.xlabel('HouseYear')  # the x axis shows the plotted feature, not prices
plt.ylabel('count')
plt.show()

dftr['HouseYear'].value_counts()
# The values 4968 and 20052011 are obvious outliers and have to be dealt with

In [8]:
# Histogram of Ecology_1.
dftr['Ecology_1'].hist(density=False, bins=10)
plt.xlabel('Ecology_1')  # the x axis shows the plotted feature, not prices
plt.ylabel('count')
plt.show()

dftr['Ecology_1'].value_counts()
# Nothing unusual here

In [9]:
# Histogram of Social_3 on a log count axis.
dftr['Social_3'].hist(density=False, bins=40)
plt.yscale(value='log')
plt.xlabel('Social_3')  # the x axis shows the plotted feature, not prices
plt.ylabel('count')
plt.show()

dftr['Social_3'].value_counts()
# Here, the farther a value lies from the mean, the more common it is, so the
# histogram looks like an inverted (concave) bell

In [10]:
# Histogram of Healthcare_1.
dftr['Healthcare_1'].hist(density=False, bins=40)
plt.xlabel('Healthcare_1')  # the x axis shows the plotted feature, not prices
plt.ylabel('count')
plt.show()

dftr['Healthcare_1'].value_counts()
# Nothing unusual

In [11]:
# Histogram of Helthcare_2 (the column name is spelled this way in the data).
dftr['Helthcare_2'].hist(density=False, bins=7)
plt.xlabel('Helthcare_2')  # the x axis shows the plotted feature, not prices
plt.ylabel('count')
plt.show()

dftr['Helthcare_2'].value_counts()

In [12]:
# Histogram of Shops_1.
dftr['Shops_1'].hist(density=False, bins=19)
plt.xlabel('Shops_1')  # the x axis shows the plotted feature, not prices
plt.ylabel('count')
plt.show()

dftr['Shops_1'].value_counts()



In [13]:
# Replace outlying construction years with 2020, the latest year considered possible
built_too_late = dftr['HouseYear'] > 2020
dftr.loc[built_too_late, 'HouseYear'] = 2020
dftr['HouseYear'].value_counts()

In [14]:
# No implausible construction years are left
dftr['HouseYear'].hist(density=False, bins=100)
plt.yscale(value='log')
plt.xlabel('HouseYear')  # the x axis shows the plotted feature, not prices
plt.ylabel('count')
plt.xlim(1800, 2020)
plt.show()

In [15]:
# Missing values per column; LifeSquare and Healthcare_1 have a large number of gaps
dftr.isnull().sum()

In [16]:
# Fill every missing LifeSquare with the total area minus the kitchen area
living_area_estimate = dftr['Square'] - dftr['KitchenSquare']
dftr['LifeSquare'] = dftr['LifeSquare'].fillna(living_area_estimate)
dftr.isna().sum()
# All the gaps have been replaced

In [17]:
# Fill the gaps in Healthcare_1 with the column median
healthcare_median = dftr['Healthcare_1'].median()
dftr['Healthcare_1'] = dftr['Healthcare_1'].fillna(healthcare_median)
dftr.isna().sum()

In [18]:
# Frequency of each flat floor value
dftr.loc[:, 'Floor'].value_counts()

In [19]:
# Frequency of each house height value
dftr.loc[:, 'HouseFloor'].value_counts()
# The values 117 and 99 are clearly outliers

In [20]:
# Binary flag for rows whose house height is unusable: the flat's floor exceeds
# the number of floors in the house, or the house is recorded with 0 floors.
# Both conditions are boolean masks; indexing .loc with the raw HouseFloor values
# would treat them as row labels instead of a condition.
floor_above_house = dftr['Floor'] > dftr['HouseFloor']
zero_house_floors = dftr['HouseFloor'] == 0
dftr['HouseFloor_outlier'] = (floor_above_house | zero_house_floors).astype(int)
dftr['HouseFloor_outlier'].value_counts()

In [21]:
# Replace every zero house height with the column median
house_floor_median = dftr['HouseFloor'].median()
dftr.loc[dftr['HouseFloor'].eq(0), 'HouseFloor'] = house_floor_median
dftr['HouseFloor'].value_counts()

In [22]:
# Repair the rows where the flat's floor is above the recorded house height
# (the inconsistent case; a house taller than the flat's floor is valid and is
# left alone). The height becomes the column median, raised to the flat's own
# floor when the median would still be too low.
floor_above_house = dftr['Floor'] > dftr['HouseFloor']
house_floor_median = dftr['HouseFloor'].median()
dftr.loc[floor_above_house, 'HouseFloor'] = dftr.loc[floor_above_house, 'Floor'].clip(lower=house_floor_median)
dftr['HouseFloor'].value_counts()
# After this no row has a flat floor above the house height

In [23]:
# Replace excessively large kitchen areas (above the 97.5th percentile) with the median
kitchen_cap = dftr['KitchenSquare'].quantile(.975)
kitchen_median = dftr['KitchenSquare'].median()
dftr.loc[dftr['KitchenSquare'] > kitchen_cap, 'KitchenSquare'] = kitchen_median
dftr['KitchenSquare'].value_counts()

In [24]:
# Repeat the LifeSquare fill for the test data, using the test rows' own areas
# (taking them from the train frame would pair unrelated flats by row index)
dfts.loc[dfts['LifeSquare'].isna(), 'LifeSquare'] = dfts['Square'] - dfts['KitchenSquare']
dfts.isna().sum()

In [25]:
# Same for Healthcare_1 in the test data, filled with the median of the training data
train_healthcare_median = dftr['Healthcare_1'].median()
dfts['Healthcare_1'] = dfts['Healthcare_1'].fillna(train_healthcare_median)
dfts.isna().sum()

In [26]:
class DataPreprocessing:
    """Clean flat records with statistics learned from a reference frame.

    ``fit`` stores the per-column medians and the 97.5th percentile of
    ``KitchenSquare``; ``transform`` applies the cleaning steps to a copy of the
    frame it receives, so the same statistics can be reused on several frames.
    """

    def __init__(self, max_house_year=2020):
        """
        :param max_house_year: latest accepted construction year; larger
            ``HouseYear`` values are replaced by it.
        """
        self.max_house_year = max_house_year
        self.medians = None
        self.kitchen_square = None

    def fit(self, x):
        """Learn the medians and the kitchen-area cap from ``x``.

        :param x: DataFrame that contains at least ``KitchenSquare``.
        :return: ``self``, so that calls can be chained.
        """
        # numeric_only: the frame may hold string columns, for which pandas >= 2.0
        # raises instead of silently skipping them.
        self.medians = x.median(numeric_only=True)
        self.kitchen_square = x['KitchenSquare'].quantile(.975)
        return self

    def transform(self, x):
        """Return a cleaned copy of ``x``; the input frame is left untouched.

        :param x: DataFrame with ``HouseYear``, ``Square``, ``LifeSquare``,
            ``KitchenSquare``, ``Healthcare_1``, ``Floor`` and ``HouseFloor``.
        :return: new DataFrame with the same index and a ``HouseFloor_outlier``
            column.
        :raises RuntimeError: if ``fit`` has not been called.
        """
        if self.medians is None:
            raise RuntimeError('DataPreprocessing.fit must be called before transform')

        x = x.copy()

        x.loc[x['HouseYear'] > self.max_house_year, 'HouseYear'] = self.max_house_year

        # Flag unusable house heights before they are repaired below. An existing
        # flag is kept, because on an already cleaned frame it could not be
        # recomputed.
        if 'HouseFloor_outlier' not in x.columns:
            unusable_height = (x['Floor'] > x['HouseFloor']) | (x['HouseFloor'] == 0)
            x['HouseFloor_outlier'] = unusable_height.astype(int)

        x.loc[x['LifeSquare'].isna(), 'LifeSquare'] = x['Square'] - x['KitchenSquare']

        x.loc[x['Healthcare_1'].isna(), 'Healthcare_1'] = self.medians['Healthcare_1']

        x.loc[x['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']

        # A flat cannot be above the top floor: use the learned median height,
        # raised to the flat's own floor when the median would still be too low.
        floor_above_house = x['Floor'] > x['HouseFloor']
        x.loc[floor_above_house, 'HouseFloor'] = (
            x.loc[floor_above_house, 'Floor'].clip(lower=self.medians['HouseFloor'])
        )

        x.loc[x['KitchenSquare'] > self.kitchen_square, 'KitchenSquare'] = self.medians['KitchenSquare']

        # Any remaining gap in a numeric column gets that column's learned median.
        x = x.fillna(self.medians)

        return x

In [27]:
# Encode the two-valued letter features A and B as 1 and 0
binary_to_numbers = {'A': 1, 'B': 0}

for letter_col in ('Ecology_2', 'Ecology_3', 'Shops_2'):
    dftr[letter_col] = dftr[letter_col].replace(binary_to_numbers)
dftr.head()

In [28]:
def floor_to_cat(x):
    """Add an integer ``floor_cat`` column that bins ``Floor``.

    Categories: 0 for floors up to 3, 1 for 4-5, 2 for 6-9, 3 for 10-15,
    4 for everything above 15, and -1 where ``Floor`` is missing.

    :param x: DataFrame with a ``Floor`` column; it is modified in place.
    :return: the same DataFrame, for convenient reassignment.
    """
    # Open-ended outer edges: with the data minimum as the first edge, that very
    # floor falls outside the first right-closed bin, and a frame whose maximum
    # is 15 or less yields non-increasing edges that pd.cut rejects.
    edges = [-np.inf, 3, 5, 9, 15, np.inf]
    x['floor_cat'] = pd.cut(x['Floor'], bins=edges, labels=False).fillna(-1).astype(int)
    return x

In [29]:
def year_to_cat(x):
    """Add an integer ``year_cat`` column that bins ``HouseYear``.

    Categories: 0 up to 1941, 1 for 1942-1945, 2 for 1946-1980, 3 for
    1981-2000, 4 for 2001-2010, 5 for later years, and -1 where ``HouseYear``
    is missing.

    :param x: DataFrame with a ``HouseYear`` column; it is modified in place.
    :return: the same DataFrame, for convenient reassignment.
    """
    # Open-ended outer edges: with the data minimum as the first edge, the oldest
    # house falls outside the first right-closed bin, and a narrow year range
    # yields non-increasing edges that pd.cut rejects.
    edges = [-np.inf, 1941, 1945, 1980, 2000, 2010, np.inf]
    x['year_cat'] = pd.cut(x['HouseYear'], bins=edges, labels=False).fillna(-1).astype(int)
    return x

In [30]:
# Add the construction-period category first, then the floor category
dftr = floor_to_cat(year_to_cat(dftr))
dftr.head()

In [31]:
# Median price for every (construction period, floor category) combination
med_price_by_floor_year = (
    dftr.groupby(['year_cat', 'floor_cat'], as_index=False)['Price']
    .median()
    .rename(columns={'Price': 'MedPriceByFloorYear'})
)
med_price_by_floor_year.head()

In [32]:
# Attach the group median price to every training row
dftr = pd.merge(dftr, med_price_by_floor_year, how='left', on=['year_cat', 'floor_cat'])
dftr.head()

In [33]:
class FeatureGeneration():
    """Generate new features: numeric A/B flags, floor and construction-period
    categories and, when prices were supplied to ``fit``, the median price of
    each (period, floor category) group.
    """

    # Inner bin edges; the outer edges are open-ended so that every value,
    # including ones unseen during fit, falls into a bin.
    FLOOR_EDGES = (3, 5, 9, 15)
    YEAR_EDGES = (1941, 1945, 1980, 2000, 2010)

    def __init__(self):
        self.DistrictId = None
        self.Rooms = None
        self.binary_to_numbers = {'A': 1, 'B': 0}
        self.house_year_max = None
        self.house_year_min = None
        self.floor_max = None
        self.floor_min = None
        self.med_price_by_floor_year = None
        self.med_price_by_floor_year_median = None
        self.year_cat = None
        self.floor_cat = None

    def fit(self, x, y=None):
        """Learn the value ranges and, if ``y`` is given, the group median prices.

        :param x: DataFrame with ``Floor`` and ``HouseYear`` columns (not modified).
        :param y: optional prices, positionally aligned with the rows of ``x``.
        :return: ``self``, so that calls can be chained.
        """
        x = x.copy()

        self.binary_to_numbers = {'A': 1, 'B': 0}

        # Observed ranges, recorded for inspection.
        self.floor_min, self.floor_max = x['Floor'].min(), x['Floor'].max()
        self.house_year_min, self.house_year_max = x['HouseYear'].min(), x['HouseYear'].max()

        if y is not None:
            x['Price'] = np.asarray(y)
            x = self.year_to_cat(x)
            x = self.floor_to_cat(x)
            self.med_price_by_floor_year = (
                x.groupby(['year_cat', 'floor_cat'], as_index=False)
                .agg({'Price': 'median'})
                .rename(columns={'Price': 'MedPriceByFloorYear'})
            )
            # Fallback for category combinations that were not seen during fit.
            self.med_price_by_floor_year_median = self.med_price_by_floor_year['MedPriceByFloorYear'].median()

        return self

    def transform(self, x):
        """Return a copy of ``x`` with the generated features added.

        :param x: DataFrame with ``Ecology_2``, ``Ecology_3``, ``Shops_2``,
            ``Floor`` and ``HouseYear`` columns (not modified).
        :return: new DataFrame with the same index as ``x``.
        """
        x = x.copy()

        for col in ('Ecology_2', 'Ecology_3', 'Shops_2'):
            # Values that are already numeric pass through unchanged; anything
            # that is neither a known letter nor a number becomes NaN.
            encoded = x[col].map(lambda value: self.binary_to_numbers.get(value, value))
            x[col] = pd.to_numeric(encoded, errors='coerce')

        x = self.floor_to_cat(x)
        x = self.year_to_cat(x)

        if self.med_price_by_floor_year is not None:
            # Drop a column left by an earlier pass so the merge cannot produce
            # _x/_y duplicates, and restore the index that merge resets.
            original_index = x.index
            x = x.drop(columns='MedPriceByFloorYear', errors='ignore')
            x = x.merge(self.med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
            x.index = original_index
            x['MedPriceByFloorYear'] = x['MedPriceByFloorYear'].fillna(self.med_price_by_floor_year_median)

        return x

    @staticmethod
    def _to_codes(values, inner_edges):
        """Bin ``values`` into integer codes; missing values get -1."""
        edges = [-np.inf, *inner_edges, np.inf]
        return pd.cut(values, bins=edges, labels=False).fillna(-1).astype(int)

    def floor_to_cat(self, x):
        """Add the integer ``floor_cat`` column to ``x`` (in place) and return ``x``."""
        x['floor_cat'] = self._to_codes(x['Floor'], self.FLOOR_EDGES)
        return x

    def year_to_cat(self, x):
        """Add the integer ``year_cat`` column to ``x`` (in place) and return ``x``."""
        x['year_cat'] = self._to_codes(x['HouseYear'], self.YEAR_EDGES)
        return x

In [34]:
# All column names currently present in the training frame
list(dftr.columns)

In [35]:
# Name of the column to predict.
target = 'Price'

# Features engineered in this notebook.
new_feature_names = [
    'HouseFloor_outlier',
    'MedPriceByFloorYear',
]

# Features taken directly from the data set.
feature_names = [
    'Rooms',
    'Square', 'LifeSquare', 'KitchenSquare',
    'Floor', 'HouseFloor', 'HouseYear',
    'Ecology_1', 'Ecology_2', 'Ecology_3',
    'Social_1', 'Social_2', 'Social_3',
    'Helthcare_2',
    'Shops_1', 'Shops_2',
]

In [40]:
# Working aliases for the train and test frames (the same objects, not copies)
train_df, test_df = dftr, dfts

# Separate the target column from the remaining columns
y = train_df[target]
x = train_df.drop(target, axis=1)

In [37]:
# Hold out a third of the rows for validation; the seed keeps the split reproducible
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=21,
)

In [43]:
# Learn the cleaning statistics from the training split only, so that nothing
# from the validation rows leaks into them, then clean each split with the same
# fitted preprocessor. x_train must stay the training split (not the full
# train_df) so that it keeps the same rows as y_train.
preprocessor = DataPreprocessing()
preprocessor.fit(x_train)

x_train = preprocessor.transform(x_train)
x_valid = preprocessor.transform(x_valid)
test_df = preprocessor.transform(test_df)

x_train.shape, x_valid.shape, test_df.shape

In [45]:
# Fit the feature generator on the training split and its own prices, then apply
# it to every split. Passing the full train_df here would pair it with the
# shorter y_train and replace x_train with rows that include the validation set.
features_gen = FeatureGeneration()
features_gen.fit(x_train, y_train)

x_train = features_gen.transform(x_train)
x_valid = features_gen.transform(x_valid)
test_df = features_gen.transform(test_df)

x_train.shape, x_valid.shape, test_df.shape

In [46]:
# Train on the declared feature columns only: identifier and helper columns in
# x_train are not model inputs. Columns missing from x_train are skipped.
model_features = [col for col in feature_names + new_feature_names if col in x_train.columns]

# The default split criterion is mean squared error; the explicit 'mse' alias is
# no longer accepted by recent scikit-learn releases, so it is not passed.
rf_model = RandomForestRegressor(random_state=21)
rf_model.fit(x_train[model_features], y_train)

In [None]:
# r2_score compares true targets with predictions, so the model has to predict first.
# Use the columns the model was fitted on (recorded by scikit-learn when it is
# fitted on a DataFrame); fall back to all columns of x_train otherwise.
model_columns = list(getattr(rf_model, 'feature_names_in_', x_train.columns))

R2 = r2(y_train, rf_model.predict(x_train[model_columns]))
R2_valid = r2(y_valid, rf_model.predict(x_valid[model_columns]))
print(f'R2 train: {R2:.3f}, R2 valid: {R2_valid:.3f}')