# Task 

Материалы к проекту (файлы): train.csv test.csv

Задание: Используя данные из train.csv, построить модель для предсказания цен на недвижимость (квартиры). С помощью полученной модели предсказать цены для квартир из файла test.csv.

Целевая переменная: Price

Основная метрика: R2 - коэффициент детерминации (sklearn.metrics.r2_score)

Вспомогательная метрика: MSE - средняя квадратичная ошибка (sklearn.metrics.mean_squared_error)

# Get Dataset

In [37]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('train.csv', delimiter=',')
test = pd.read_csv('test.csv', delimiter=',')

In [4]:
train = pd.read_csv('train.csv', delimiter=',')
test = pd.read_csv('test.csv', delimiter=',')

In [5]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    print("Train R2:\t" + str(round(r2(train_true_values, train_pred_values), 3)))
    print("Test R2:\t" + str(round(r2(test_true_values, test_pred_values), 3)))
    
    plt.figure(figsize=(18,10))
    
    plt.subplot(121)
    sns.scatterplot(x=train_pred_values, y=train_true_values)
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('Train sample prediction')
    
    plt.subplot(122)
    sns.scatterplot(x=test_pred_values, y=test_true_values)
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('Test sample prediction')

    plt.show()

In [6]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     7887 non-null   float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  int64  
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  object 
 11  Ecology_3      10000 non-null  object 
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   5202 non-null   float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

In [7]:
test.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             5000 non-null   int64  
 1   DistrictId     5000 non-null   int64  
 2   Rooms          5000 non-null   float64
 3   Square         5000 non-null   float64
 4   LifeSquare     3959 non-null   float64
 5   KitchenSquare  5000 non-null   float64
 6   Floor          5000 non-null   int64  
 7   HouseFloor     5000 non-null   float64
 8   HouseYear      5000 non-null   int64  
 9   Ecology_1      5000 non-null   float64
 10  Ecology_2      5000 non-null   object 
 11  Ecology_3      5000 non-null   object 
 12  Social_1       5000 non-null   int64  
 13  Social_2       5000 non-null   int64  
 14  Social_3       5000 non-null   int64  
 15  Healthcare_1   2623 non-null   float64
 16  Helthcare_2    5000 non-null   int64  
 17  Shops_1        5000 non-null   int64  
 18  Shops_2 

# Missing data

In [8]:
for col in train.columns:
    pct_missing = np.mean(train[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))

Id - 0%
DistrictId - 0%
Rooms - 0%
Square - 0%
LifeSquare - 21%
KitchenSquare - 0%
Floor - 0%
HouseFloor - 0%
HouseYear - 0%
Ecology_1 - 0%
Ecology_2 - 0%
Ecology_3 - 0%
Social_1 - 0%
Social_2 - 0%
Social_3 - 0%
Healthcare_1 - 48%
Helthcare_2 - 0%
Shops_1 - 0%
Shops_2 - 0%
Price - 0%


In [9]:
med_lifesquare = train['LifeSquare'].median()
med_healthcare_1 = train['Healthcare_1'].median()
print(med_lifesquare)
print(med_healthcare_1) 
train['LifeSquare'] = train['LifeSquare'].fillna(med_lifesquare )
train['Healthcare_1'] = train['Healthcare_1'].fillna(med_healthcare_1)

32.781260192155735
900.0


In [10]:
for col in train.columns:
    pct_missing = np.mean(train[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))

Id - 0%
DistrictId - 0%
Rooms - 0%
Square - 0%
LifeSquare - 0%
KitchenSquare - 0%
Floor - 0%
HouseFloor - 0%
HouseYear - 0%
Ecology_1 - 0%
Ecology_2 - 0%
Ecology_3 - 0%
Social_1 - 0%
Social_2 - 0%
Social_3 - 0%
Healthcare_1 - 0%
Helthcare_2 - 0%
Shops_1 - 0%
Shops_2 - 0%
Price - 0%


In [11]:
train.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8905,56.315775,36.26604,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1026.3589,1.3195,4.2313,214138.857399
std,4859.01902,43.587592,0.839512,21.058732,76.609981,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,746.662828,1.493601,4.806341,92872.293865
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,25.527399,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,830.0,0.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,41.427234,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,990.0,2.0,6.0,249135.462171
max,16798.0,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


# Rooms

In [13]:
train['Rooms'].value_counts()

2.0     3880
1.0     3705
3.0     2235
4.0      150
5.0       18
0.0        8
10.0       2
19.0       1
6.0        1
Name: Rooms, dtype: int64

In [14]:
train['Rooms_outlier'] = 0
train.loc[(train['Rooms'] == 0) | (train['Rooms'] >= 6), 'Rooms_outlier'] = 1
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,Rooms_outlier
0,11809,27,3.0,115.027311,32.78126,10.0,4,10.0,2014,0.075424,...,B,11,3097,0,900.0,0,0,B,305018.871089,0
1,3013,22,1.0,39.832524,23.169223,8.0,7,8.0,1966,0.118537,...,B,30,6207,1,1183.0,1,0,B,177734.553407,0
2,8215,1,3.0,78.342215,47.671972,10.0,2,17.0,1988,0.025609,...,B,33,5261,0,240.0,3,1,B,282078.72085,0
3,2352,1,1.0,40.409907,32.78126,1.0,10,22.0,1977,0.007122,...,B,1,264,0,900.0,0,1,B,168106.00763,0
4,13866,94,2.0,64.285067,38.562517,9.0,16,16.0,1972,0.282798,...,B,33,8667,2,900.0,0,6,B,343995.102962,0


In [15]:
rooms_median = train['Rooms'].median()
train.loc[train['Rooms'] == 0, 'Rooms'] = 1
train.loc[train['Rooms'] >= 6, 'Rooms'] = rooms_median

In [16]:
train['Rooms'].value_counts()

2.0    3884
1.0    3713
3.0    2235
4.0     150
5.0      18
Name: Rooms, dtype: int64

# Square

In [17]:
train['Square'].describe()

count    10000.000000
mean        56.315775
std         21.058732
min          1.136859
25%         41.774881
50%         52.513310
75%         65.900625
max        641.065193
Name: Square, dtype: float64

In [18]:
train['Square'].quantile(0.025)

32.10518291149681

In [19]:
condition = (train['Square'].isna()) | (train['Square'] > train['Square'].quantile(0.975)) | (train['Square'] < train['Square'].quantile(0.025))
        
train.loc[condition, 'Square'] = train['Square'].median()

In [20]:
train['Square'].describe()

count    10000.000000
mean        55.108599
std         15.215053
min         32.105270
25%         42.432730
50%         52.513310
75%         64.591720
max        100.904263
Name: Square, dtype: float64

# KitchenSquare

In [21]:
train['KitchenSquare'].value_counts()

1.0       2460
8.0       1306
5.0       1169
10.0      1075
6.0       1038
9.0        843
0.0        697
7.0        609
12.0       249
11.0       233
13.0        67
14.0        51
4.0         39
15.0        31
3.0         22
16.0        16
20.0        14
17.0        12
19.0        11
18.0         6
2.0          4
22.0         3
30.0         2
43.0         2
41.0         2
112.0        2
25.0         2
51.0         2
37.0         2
58.0         2
32.0         2
21.0         1
73.0         1
75.0         1
36.0         1
27.0         1
63.0         1
1970.0       1
54.0         1
53.0         1
60.0         1
26.0         1
66.0         1
39.0         1
29.0         1
78.0         1
31.0         1
84.0         1
48.0         1
96.0         1
42.0         1
40.0         1
23.0         1
72.0         1
35.0         1
62.0         1
123.0        1
2014.0       1
Name: KitchenSquare, dtype: int64

In [22]:
train['KitchenSquare'].quantile(.95), train['KitchenSquare'].quantile(.05)

(12.0, 0.0)

In [23]:
kitchen_square_median = train['KitchenSquare'].median()
condition = (train['KitchenSquare'].isna()) \
             | (train['KitchenSquare'] > train['KitchenSquare'].quantile(.95))
        
train.loc[condition, 'KitchenSquare'] = kitchen_square_median

train.loc[train['KitchenSquare'] < 5, 'KitchenSquare'] = 5

In [24]:
train['KitchenSquare'].value_counts()

5.0     4391
8.0     1306
6.0     1294
10.0    1075
9.0      843
7.0      609
12.0     249
11.0     233
Name: KitchenSquare, dtype: int64

# LifeSquare


In [25]:
train['LifeSquare'].describe()

count    10000.000000
mean        36.266040
std         76.609981
min          0.370619
25%         25.527399
50%         32.781260
75%         41.427234
max       7480.592129
Name: LifeSquare, dtype: float64

In [26]:
train['LifeSquare'].quantile(0.0255)

15.222056716697058

In [27]:
(train['LifeSquare'] < train['KitchenSquare']).sum()

162

In [28]:
condition = (train['LifeSquare'].isna()) | (train['LifeSquare'] > train['LifeSquare'].quantile(0.975)) | \
(train['LifeSquare'] < train['LifeSquare'].quantile(0.025)) |  (train['LifeSquare'] < train['KitchenSquare'])
        
train.loc[condition, 'LifeSquare'] = train['LifeSquare'].median()

In [29]:
train['LifeSquare'].describe()

count    10000.000000
mean        34.534440
std         11.451088
min         15.146290
25%         27.982505
50%         32.781260
75%         39.424685
max         78.384631
Name: LifeSquare, dtype: float64

# HouseYear

In [30]:
def train_fix_house_year_manual(df):
    train.loc[train['HouseYear'] == 20052011, 'HouseYear'] = int((2005 + 2011) / 2)
    train.loc[train['HouseYear'] == 4968, 'HouseYear'] = 1968
    return df

In [31]:
train

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,Rooms_outlier
0,11809,27,3.0,52.513310,32.781260,10.0,4,10.0,2014,0.075424,...,B,11,3097,0,900.0,0,0,B,305018.871089,0
1,3013,22,1.0,39.832524,23.169223,8.0,7,8.0,1966,0.118537,...,B,30,6207,1,1183.0,1,0,B,177734.553407,0
2,8215,1,3.0,78.342215,47.671972,10.0,2,17.0,1988,0.025609,...,B,33,5261,0,240.0,3,1,B,282078.720850,0
3,2352,1,1.0,40.409907,32.781260,5.0,10,22.0,1977,0.007122,...,B,1,264,0,900.0,0,1,B,168106.007630,0
4,13866,94,2.0,64.285067,38.562517,9.0,16,16.0,1972,0.282798,...,B,33,8667,2,900.0,0,6,B,343995.102962,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1260,61,2.0,49.090728,33.272626,6.0,3,12.0,1981,0.300323,...,B,52,10311,6,900.0,1,9,B,119367.455796,0
9996,16265,27,2.0,64.307684,37.038420,9.0,13,0.0,1977,0.072158,...,B,2,629,1,900.0,0,0,A,199715.148807,0
9997,2795,178,1.0,52.513310,16.555363,5.0,3,5.0,1958,0.460556,...,B,20,4386,14,900.0,1,5,B,165953.912580,0
9998,14561,21,1.0,32.330292,22.326870,5.0,3,9.0,1969,0.194489,...,B,47,8004,3,125.0,3,5,B,171842.411855,0


# Train и Test

In [32]:
print(train.columns)

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price', 'Rooms_outlier'],
      dtype='object')


In [33]:
feat = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare']
x = train[feat]
y = train['Price']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [34]:
lr = LinearRegression()

In [35]:
lr.fit(x_train, y_train)

LinearRegression()

# Prediction

In [38]:
forest = RandomForestRegressor()
forest.fit(x, y)

RandomForestRegressor()

In [39]:
itog_test = test[feat]
itog_test = itog_test.fillna(0)
pred = pd.DataFrame(forest.predict(itog_test), columns=['pred'])
pred

Unnamed: 0,pred
0,156441.075552
1,134960.273678
2,283995.387651
3,200030.472110
4,146964.785080
...,...
4995,206138.707259
4996,439624.032982
4997,158988.232094
4998,167064.801227
