# Model

## Import Section

In [1]:
import numpy as np
import pandas as pd

import pickle

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import r2_score
# from sklearn.metrics import accuracy_score

## Path Section

In [2]:
PATH_DATA_TRAIN = 'data/train.csv'
PATH_DATA_TEST = 'data/test.csv'

PATH_DATA_TRAIN_PREPROCESSED = 'data/train_preprocessed.pkl'
PATH_DATA_TEST_PREPROCESSED = 'data/test_preprocessed.pkl'

PATH_VALUE_SQUARE_QV = 'preprocessing/square_qv.pkl'
PATH_VALUE_SQUARE_MEAN = 'preprocessing/square_mean.pkl'

PATH_VALUE_LIFESQUARE_QV_MIN = 'preprocessing/lifesquare_qv_min.pkl'
PATH_VALUE_LIFESQUARE_QV_MAX = 'preprocessing/lifesquare_qv_max.pkl'
PATH_SCALER_LIFESQUARE = 'preprocessing/scaler_lifesquare.pkl'
PATH_MODEL_LIFESQUARE_KNR = 'preprocessing/model_lifesquare_knr.pkl'
PATH_MODEL_LIFESQUARE_GS = 'preprocessing/model_lifesquare_gs.pkl'

PATH_VALUE_KITCHENSQUARE_QV_MIN = 'preprocessing/kitchensquare_qv_min.pkl'
PATH_VALUE_KITCHENSQUARE_QV_MAX = 'preprocessing/kitchensquare_qv_max.pkl'
PATH_SCALER_KITCHENSQUARE = 'preprocessing/scaler_kitchensquare.pkl'
PATH_MODEL_KITCHENSQUARE_KNR = 'preprocessing/model_kitchensquare_knr.pkl'
PATH_MODEL_KITCHENSQUARE_GS = 'preprocessing/model_kitchensquare_gs.pkl'

PATH_MODEL_GBR = 'models/model_gbr.pkl'
PATH_MODEL_GS = 'models/model_gs.pkl'

PATH_DATA_RESULT = 'data/result.csv'

## Function "Reduce Memory Usage"

In [3]:
def reduce_memory_usage(df):
    
    # СДЕЛАТЬ ПРОВЕРКУ type(df) == pd.DataFrame
    
    initial_memory_usage = df.memory_usage().sum() / 1024 / 1024
    print(f'Initial memory usage of dataframe:\t{initial_memory_usage:.3} Mb')
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            
            # Можно было бы сделать оптимизацию вплоть до типа "float16", однако данный тип данных,
            # как сообщается от сообщества аналитиков данных, плохо поддерживается некоторыми библиотеками
            
            if str(col_type)[:5] == 'float':
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
                    df[col] = df[col].astype(np.float64)
        
        else:
            df[col] = df[col].astype('category')
    
    final_memory_usage = df.memory_usage().sum() / 1024 / 1024
    print(f'Final memory usage of dataframe:\t{final_memory_usage:.3} Mb')
    
    comparison = np.round(100 * (initial_memory_usage - final_memory_usage) / initial_memory_usage, 3)
    print(f'Memory usage has been decreased by:\t{comparison} %')
    
    return df

## Data Preprocessing Class

In [4]:
class data_preprocessing():
    def __init__(self):
        self.q_min = 0.2
        self.q_max = 0.8
    
    
    def feature_Id(self, df):
        # Предобработка признака "Id": 
        # СДЕЛАТЬ ПРОВЕРКУ type(df) == pd.DataFrame
        pass
    
    
    def feature_DistrictId(self, df):
        # Предобработка признака "DistrictId": замена каждого значения на кол-во этого значения в датасете
        # СДЕЛАТЬ ПРОВЕРКУ type(df) == pd.DataFrame
        DistrictId_value_counts = pd.DataFrame(data = df['DistrictId'].value_counts())
        DistrictId_value_counts.reset_index(inplace = True)
        DistrictId_value_counts.rename(columns = {'DistrictId': 'DistrictId_new', 'index': 'DistrictId'}, inplace = True)
        df['DistrictId'] = df.merge(DistrictId_value_counts, how = 'left', on = 'DistrictId')['DistrictId_new']
        return df
    
    
    def feature_Rooms(self, df):
        # Предобработка признака "Rooms": исключение записей с выбросами из датасета
        # СДЕЛАТЬ ПРОВЕРКУ type(df) == pd.DataFrame
        df = df.drop(df[(df['Rooms'] < 1) | (df['Rooms'] > 5)].index)
        df = df.drop(df[(df['Rooms'] == 5) & (df['LifeSquare'] / 5 < 6)].index)
        return df
    
    
    def feature_Square(self, df):
        # Предобработка признака "Square": заполнение выбросов медианой малых значений
        # СДЕЛАТЬ ПРОВЕРКУ type(df) == pd.DataFrame
        # df = df.drop(df[df['Square'] < 20].index)
        
        qv = df['Square'].quantile(q = 0.005)
        mean = np.round(df['Square'][df['Square'] < qv].median(), 2)
        df['Square'][df['Square'] < qv] = mean
        
        with open(PATH_VALUE_SQUARE_QV, 'wb') as file:
            pickle.dump(qv, file)
        
        with open(PATH_VALUE_SQUARE_MEAN, 'wb') as file:
            pickle.dump(mean, file)
        
        return df
    
    
    def feature_LifeSquare(self, df):
        # Предобработка признака "LifeSquare": обработка выбросов при помощи модели
        # СДЕЛАТЬ ПРОВЕРКУ type(df) == pd.DataFrame
        features = ['Rooms', 'Square']
        
        df['Ratio'] = np.round(df['LifeSquare'] / df['Square'], 3)
        
        qv_min = df['Ratio'].quantile(q = self.q_min)
        qv_max = df['Ratio'].quantile(q = self.q_max)
        
        x_train = df[features][(df['Ratio'] >= qv_min) & (df['Ratio'] <= qv_max)].copy()
        y_train = df['LifeSquare'][(df['Ratio'] >= qv_min) & (df['Ratio'] <= qv_max)].copy()
        x_test = df[features][(df['LifeSquare'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)].copy()
              
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)
        
        knr = KNeighborsRegressor()
        params = {'n_neighbors': [i for i in range(3, 51)]}
        gs = GridSearchCV(estimator = knr,
                  param_grid = params,
                  scoring = 'r2',
                  cv = KFold(n_splits = 5, shuffle = True, random_state = 100),
                  n_jobs = -1)
        
        gs.fit(x_train_scaled, y_train)
        y_pred = gs.predict(x_test_scaled)
        
        df['LifeSquare'][(df['LifeSquare'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)] = y_pred.copy()
        df = df.drop(labels = 'Ratio', axis = 1)
        df = df.drop(df[df['LifeSquare'] / df['Square'] < 0.4].index)
        
        with open(PATH_VALUE_LIFESQUARE_QV_MIN, 'wb') as file:
            pickle.dump(qv_min, file)
            
        with open(PATH_VALUE_LIFESQUARE_QV_MAX, 'wb') as file:
            pickle.dump(qv_min, file)
                
        with open(PATH_SCALER_LIFESQUARE, 'wb') as file:
            pickle.dump(scaler, file)
        
        with open(PATH_MODEL_LIFESQUARE_KNR, 'wb') as file:
            pickle.dump(knr, file)
        
        with open(PATH_MODEL_LIFESQUARE_GS, 'wb') as file:
            pickle.dump(gs, file)
        
        return df
    
    
    def feature_KitchenSquare(self, df):
        # Предобработка признака "KitchenSquare": обработка выбросов при помощи модели
        # СДЕЛАТЬ ПРОВЕРКУ type(df) == pd.DataFrame
        features = ['Square']
        
        df['Ratio'] = np.round(df['KitchenSquare'] / df['Square'], 3)
        
        qv_min = df['Ratio'].quantile(q = self.q_min)
        qv_max = df['Ratio'].quantile(q = self.q_max)
        
        x_train = df[features][(df['Ratio'] >= qv_min) & (df['Ratio'] <= qv_max)].copy()
        y_train = df['KitchenSquare'][(df['Ratio'] >= qv_min) & (df['Ratio'] <= qv_max)].copy()
        x_test = df[features][(df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)].copy()
              
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)
        
        knr = KNeighborsRegressor()
        params = {'n_neighbors': [i for i in range(3, 51)]}
        gs = GridSearchCV(estimator = knr,
                  param_grid = params,
                  scoring = 'r2',
                  cv = KFold(n_splits = 5, shuffle = True, random_state = 100),
                  n_jobs = -1)
        
        gs.fit(x_train_scaled, y_train)
        # print(gs.best_params_)
        y_pred = gs.predict(x_test_scaled)
        
        df['KitchenSquare'][(df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)] = y_pred.copy()
        df = df.drop(labels = 'Ratio', axis = 1)
        
        with open(PATH_VALUE_KITCHENSQUARE_QV_MIN, 'wb') as file:
            pickle.dump(qv_min, file)
            
        with open(PATH_VALUE_KITCHENSQUARE_QV_MAX, 'wb') as file:
            pickle.dump(qv_min, file)

        with open(PATH_SCALER_KITCHENSQUARE, 'wb') as file:
            pickle.dump(scaler, file)
        
        with open(PATH_MODEL_KITCHENSQUARE_KNR, 'wb') as file:
            pickle.dump(knr, file)
        
        with open(PATH_MODEL_KITCHENSQUARE_GS, 'wb') as file:
            pickle.dump(gs, file)
        
        return df
    
    
    def feature_Floor(self, df):
        # Предобработка признака "Floor": 
        pass
    
    
    def feature_HouseFloor(self, df):
        # Предобработка признака "HouseFloor":
        df = df.drop(labels = 'HouseFloor', axis = 1)
        return df
    
    
    def feature_HouseYear(self, df):
        # Предобработка признака "HouseFloor": 
        df = df.drop(df[df['HouseYear'] > 2020].index)
        return df
    
    
    def feature_Others(self, df):
        # Предобработка остальных признаков
        df = df.drop(labels = 'Healthcare_1', axis = 1)
        df['Helthcare_2'] = df['Helthcare_2'].astype('category')
        df = pd.get_dummies(df)
        df['Rooms'] = df['Rooms'].astype('int32')
        
        df = reduce_memory_usage(df)
        return df
    
    
    def feature_test_Square(self, df):
        with open(PATH_VALUE_SQUARE_QV, 'rb') as file:
            qv = pickle.load(file)
        
        with open(PATH_VALUE_SQUARE_MEAN, 'rb') as file:
            mean = pickle.load(file)
        
        df['Square'][df['Square'] < qv] = mean
        
        return df
    
    
    def feature_test_LifeSquare(self, df):
        features = ['Rooms', 'Square']
        
        with open(PATH_SCALER_LIFESQUARE, 'rb') as file:
            scaler = pickle.load(file)
        
        with open(PATH_MODEL_LIFESQUARE_KNR, 'rb') as file:
            knr = pickle.load(file)
        
        with open(PATH_MODEL_LIFESQUARE_GS, 'rb') as file:
            gs = pickle.load(file)
        
        with open(PATH_VALUE_LIFESQUARE_QV_MIN, 'rb') as file:
            qv_min = pickle.load(file)
            
        with open(PATH_VALUE_LIFESQUARE_QV_MAX, 'rb') as file:
            qv_max = pickle.load(file)
        
        df['Ratio'] = df['LifeSquare'] / df['Square']
        
        x_test = df[features][(df['LifeSquare'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)].copy()
        
        x_test_scaled = scaler.transform(x_test)
        
        y_pred = gs.predict(x_test_scaled)
        
        df['LifeSquare'][(df['Ratio'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)] = y_pred.copy()
        
        df = df.drop(labels = 'Ratio', axis = 1)
        return df
    
    
    def feature_test_KitchenSquare(self, df):
        features = ['Square']
        
        with open(PATH_SCALER_KITCHENSQUARE, 'rb') as file:
            scaler = pickle.load(file)
        
        with open(PATH_MODEL_KITCHENSQUARE_KNR, 'rb') as file:
            knr = pickle.load(file)
        
        with open(PATH_MODEL_KITCHENSQUARE_GS, 'rb') as file:
            gs = pickle.load(file)
        
        with open(PATH_VALUE_KITCHENSQUARE_QV_MIN, 'rb') as file:
            qv_min = pickle.load(file)
            
        with open(PATH_VALUE_KITCHENSQUARE_QV_MAX, 'rb') as file:
            qv_max = pickle.load(file)
        
        df['Ratio'] = df['KitchenSquare'] / df['Square']
        
        x_test = df[features][(df['KitchenSquare'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)].copy()
        
        x_test_scaled = scaler.transform(x_test)
        
        y_pred = gs.predict(x_test_scaled)
        
        df['KitchenSquare'][(df['Ratio'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)] = y_pred.copy()
        
        df = df.drop(labels = 'Ratio', axis = 1)
        return df
    

## Model Building Class

In [5]:
class model_building():
    def __init__(self):
        pass
    
    def fit(self, df):
        # Wall time: 3min 50s
        
        gbr = GradientBoostingRegressor()
        
        params = {'n_estimators': [50, 100, 200, 300, 400],
                   'max_depth': [3, 5, 7, 10],
                  #'min_samples_leaf': [5, 10, 15, 20, 30]}
                   'min_samples_leaf': [1, 2, 3, 4, 5]}
        
        gs = GridSearchCV(estimator = gbr,
                  param_grid = params,
                  scoring = 'r2',
                  cv = KFold(n_splits = 5, shuffle = True, random_state = 100),
                  n_jobs = -1)
        
        y_train = df['Price']
        x_train = df.drop(labels = ['Id', 'Price'], axis = 1)
        
        gs.fit(x_train, y_train)
        
        with open(PATH_MODEL_GBR, 'wb') as file:
            pickle.dump(gbr, file)
        
        with open(PATH_MODEL_GS, 'wb') as file:
            pickle.dump(gs, file)
        
        return None
    
    
    def predict(self, df):
        with open(PATH_MODEL_GBR, 'rb') as file:
            gbr = pickle.load(file)
        
        with open(PATH_MODEL_GS, 'rb') as file:
            gs = pickle.load(file)
        
        x_test = df.drop(labels = 'Id', axis = 1)
        
        y_pred = gs.predict(x_test)
        
        result = pd.DataFrame(zip(df['Id'], y_pred), columns = ['Id', 'Price'])
        
        return result
        

## Launch

#### Applying function "Reduce Memory Usage" on x_train

In [6]:
x_train = pd.read_csv(PATH_DATA_TRAIN)

In [7]:
x_train = reduce_memory_usage(x_train)

Initial memory usage of dataframe:	1.53 Mb
Final memory usage of dataframe:	0.487 Mb
Memory usage has been decreased by:	68.096 %


In [8]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Id             10000 non-null  int16   
 1   DistrictId     10000 non-null  int16   
 2   Rooms          10000 non-null  float32 
 3   Square         10000 non-null  float32 
 4   LifeSquare     7887 non-null   float32 
 5   KitchenSquare  10000 non-null  float32 
 6   Floor          10000 non-null  int8    
 7   HouseFloor     10000 non-null  float32 
 8   HouseYear      10000 non-null  int32   
 9   Ecology_1      10000 non-null  float32 
 10  Ecology_2      10000 non-null  category
 11  Ecology_3      10000 non-null  category
 12  Social_1       10000 non-null  int8    
 13  Social_2       10000 non-null  int16   
 14  Social_3       10000 non-null  int16   
 15  Healthcare_1   5202 non-null   float32 
 16  Helthcare_2    10000 non-null  int8    
 17  Shops_1        10000 non-null  i

In [9]:
x_train.to_pickle(PATH_DATA_TRAIN_PREPROCESSED)

#### Applying function "Reduce Memory Usage" on x_test

In [10]:
x_test = pd.read_csv(PATH_DATA_TEST)

In [11]:
x_test = reduce_memory_usage(x_test)

Initial memory usage of dataframe:	0.725 Mb
Final memory usage of dataframe:	0.215 Mb
Memory usage has been decreased by:	70.334 %


In [12]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Id             5000 non-null   int16   
 1   DistrictId     5000 non-null   int16   
 2   Rooms          5000 non-null   float32 
 3   Square         5000 non-null   float32 
 4   LifeSquare     3959 non-null   float32 
 5   KitchenSquare  5000 non-null   float32 
 6   Floor          5000 non-null   int8    
 7   HouseFloor     5000 non-null   float32 
 8   HouseYear      5000 non-null   int16   
 9   Ecology_1      5000 non-null   float32 
 10  Ecology_2      5000 non-null   category
 11  Ecology_3      5000 non-null   category
 12  Social_1       5000 non-null   int8    
 13  Social_2       5000 non-null   int16   
 14  Social_3       5000 non-null   int16   
 15  Healthcare_1   2623 non-null   float32 
 16  Helthcare_2    5000 non-null   int8    
 17  Shops_1        5000 non-null   in

In [13]:
x_test.to_pickle(PATH_DATA_TEST_PREPROCESSED)

#### Data Preprocessing x_train

In [14]:
dp = data_preprocessing()

In [15]:
x_train = pd.read_pickle(PATH_DATA_TRAIN_PREPROCESSED)

In [16]:
x_train.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8905,56.315777,37.199646,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.904419,1.3195,4.2313,214138.859375
std,4859.01902,43.587592,0.839512,21.058731,86.241211,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517273,1.493601,4.806341,92872.289062
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.777344
25%,4169.5,20.0,1.0,41.774879,22.769833,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.628906
50%,8394.5,36.0,2.0,52.513309,32.781261,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.648438
75%,12592.5,75.0,2.0,65.900627,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.460938
max,16798.0,209.0,19.0,641.065186,7480.592285,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.4375


In [17]:
x_train = dp.feature_DistrictId(x_train)
x_train = dp.feature_Rooms(x_train)
x_train = dp.feature_Square(x_train)
x_train = dp.feature_LifeSquare(x_train)
x_train = dp.feature_KitchenSquare(x_train)
x_train = dp.feature_HouseFloor(x_train)
x_train = dp.feature_HouseYear(x_train)
x_train = dp.feature_Others(x_train)

# Реализовать через метод класса

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Square'][df['Square'] < qv] = mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LifeSquare'][(df['LifeSquare'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)] = y_pred.copy()


Initial memory usage of dataframe:	0.628 Mb
Final memory usage of dataframe:	0.524 Mb
Memory usage has been decreased by:	16.667 %


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['KitchenSquare'][(df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)] = y_pred.copy()


In [18]:
x_train.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseYear,Ecology_1,Social_1,...,Ecology_3_B,Helthcare_2_0,Helthcare_2_1,Helthcare_2_2,Helthcare_2_3,Helthcare_2_4,Helthcare_2_5,Helthcare_2_6,Shops_2_A,Shops_2_B
count,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,...,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0,9982.0
mean,8385.489982,242.075135,1.888099,56.146019,34.053085,6.204538,8.525145,1984.848026,0.118819,24.687137,...,0.972551,0.421459,0.218593,0.115808,0.1667,0.028852,0.03867,0.009918,0.082649,0.917351
std,4859.107343,268.790034,0.81108,18.898878,11.487391,3.111516,5.238498,18.407118,0.11907,17.528513,...,0.163397,0.493817,0.413313,0.320011,0.372727,0.167399,0.192816,0.099098,0.275364,0.275364
min,0.0,1.0,1.0,22.379999,12.765132,1.0,1.0,1910.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4172.0,55.0,1.0,41.772802,23.819561,3.9,4.0,1974.0,0.017647,6.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,8395.5,104.0,2.0,52.502558,32.428911,6.0,7.0,1977.0,0.075424,25.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,12594.75,511.0,2.0,65.889467,41.068253,8.26,12.0,2001.0,0.195781,36.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,16798.0,851.0,5.0,200.334534,123.139862,32.0,42.0,2020.0,0.521867,74.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
x_train.shape

(9982, 27)

#### Data Preprocessing x_test

In [20]:
x_test = pd.read_pickle(PATH_DATA_TEST_PREPROCESSED)

In [21]:
x_test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,5000.0,5000.0,5000.0,5000.0,3959.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,2623.0,5000.0,5000.0
mean,8412.5954,51.2792,1.91,56.449501,36.158806,5.9768,8.632,12.601,1984.3926,0.119874,24.9338,5406.9,8.2626,1146.657227,1.3194,4.2428
std,4832.674037,44.179466,0.838594,19.092787,17.825287,9.950018,5.483228,6.789213,18.573149,0.12007,17.532202,4026.614773,23.863762,1044.744263,1.47994,4.777365
min,1.0,0.0,0.0,1.378543,0.33349,0.0,1.0,0.0,1908.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0
25%,4221.75,21.0,1.0,41.90623,23.092026,1.0,4.0,9.0,1973.0,0.019509,6.0,1564.0,0.0,325.0,0.0,1.0
50%,8320.5,37.0,2.0,52.921339,32.925087,6.0,7.0,12.0,1977.0,0.072158,25.0,5285.0,2.0,900.0,1.0,3.0
75%,12598.25,77.0,2.0,66.28513,45.174091,9.0,12.0,17.0,2000.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,16795.0,212.0,17.0,223.45369,303.071106,620.0,78.0,99.0,2020.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [22]:
x_test = dp.feature_DistrictId(x_test)
x_test = dp.feature_test_Square(x_test)
x_test = dp.feature_test_LifeSquare(x_test)
x_test = dp.feature_test_KitchenSquare(x_test)
x_test = dp.feature_HouseFloor(x_test)
x_test = dp.feature_Others(x_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Square'][df['Square'] < qv] = mean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LifeSquare'][(df['Ratio'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)] = y_pred.copy()


Initial memory usage of dataframe:	0.248 Mb
Final memory usage of dataframe:	0.205 Mb
Memory usage has been decreased by:	17.299 %


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['KitchenSquare'][(df['Ratio'].isna() == True) | (df['Ratio'] < qv_min) | (df['Ratio'] > qv_max)] = y_pred.copy()


In [23]:
x_test.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseYear,Ecology_1,Social_1,...,Ecology_3_B,Helthcare_2_0,Helthcare_2_1,Helthcare_2_2,Helthcare_2_3,Helthcare_2_4,Helthcare_2_5,Helthcare_2_6,Shops_2_A,Shops_2_B
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,8412.5954,115.5372,1.91,56.484264,34.255886,6.255756,8.632,1984.3926,0.119874,24.9338,...,0.9702,0.4192,0.2192,0.114,0.171,0.0316,0.0372,0.0078,0.0824,0.9176
std,4832.674037,127.381882,0.838594,19.013458,11.371582,2.564138,5.483228,18.573149,0.12007,17.532202,...,0.170052,0.493478,0.413746,0.317843,0.376547,0.17495,0.189271,0.087981,0.275001,0.275001
min,1.0,1.0,0.0,22.379999,13.86779,2.16,1.0,1908.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4221.75,27.0,1.0,41.90623,23.822372,4.02,4.0,1973.0,0.019509,6.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,8320.5,49.0,2.0,52.921339,32.580275,5.66,7.0,1977.0,0.072158,25.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,12598.25,257.0,2.0,66.28513,41.587746,8.08,12.0,2000.0,0.195781,36.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,16795.0,391.0,17.0,223.45369,98.511337,14.6,78.0,2020.0,0.521867,74.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
x_test.shape

(5000, 26)

#### Model building

In [25]:
mb = model_building()

In [26]:
mb.fit(x_train)

In [27]:
result = mb.predict(x_test)

In [28]:
result.describe()

Unnamed: 0,Id,Price
count,5000.0,5000.0
mean,8412.5954,224043.564425
std,4832.674037,82280.044427
min,1.0,65835.074832
25%,4221.75,169275.173383
50%,8320.5,202570.019583
75%,12598.25,262012.894818
max,16795.0,567723.513677


In [29]:
result.to_csv(PATH_DATA_RESULT, index = False)