In [9]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
# Read train.csv (path relative to the working directory) into `data`; this is
# the frame that carries the 'Price' column the models are later fitted on.
data = pd.read_csv('train.csv')

In [3]:
# Read test.csv (path relative to the working directory) into `test`; the final
# predictions are made for these rows.
test = pd.read_csv('test.csv')

In [4]:
# Discard the 'Healthcare_1' column; it is not used as a feature anywhere below.
# NOTE(review): presumably dropped because of missing values - confirm.
data = data.drop(columns=['Healthcare_1'])

In [5]:
# Discard the 'Healthcare_1' column from the test frame as well, so both frames
# keep the same set of raw columns.
test = test.drop(columns=['Healthcare_1'])

In [6]:
# Keep only rows whose values fall inside the accepted ranges. All conditions are
# row-wise, so they are combined into a single mask and applied once.
# `between` is inclusive on both ends; rows with NaN in a tested column fail the
# comparison and are dropped.
rows_in_range = (
    data['Rooms'].ge(1) & data['Rooms'].lt(10)      # 1 <= Rooms < 10
    & data['Price'].between(30000, 600000)
    & data['Square'].between(15, 200)
    & data['HouseYear'].between(1900, 2018)
)
data = data.loc[rows_in_range, :]

In [7]:
# One-hot encode the non-numeric columns of `data`. The feature list used later
# references the resulting indicator columns (e.g. 'Ecology_2_A', 'Shops_2_B').
data = pd.get_dummies(data)

In [8]:
# One-hot encode the non-numeric columns of `test`.
# NOTE(review): encoded independently of `data`; the indicator columns only line
# up if both files contain the same category values - confirm.
test = pd.get_dummies(test)

In [10]:
# Fill missing LifeSquare values in `data` with the mean LifeSquare of the rows
# that have the same number of rooms.
data_with_ls = data[data['LifeSquare'].notna()]
rooms_ls_dict = data_with_ls.groupby('Rooms')['LifeSquare'].mean()
# Snapshot of the rows that need filling, taken before the fill is applied.
data_without_ls = data[data['LifeSquare'].isna()]

# Vectorised lookup of every row's per-room mean (replaces a row-by-row
# iterrows/.loc loop).
ls_fill_values = data['Rooms'].map(rooms_ls_dict)
# A room count with no observed LifeSquare has no per-room mean (a direct
# lookup would raise KeyError); fall back to the overall observed mean.
ls_fill_values = ls_fill_values.fillna(data_with_ls['LifeSquare'].mean())
# Only the missing entries are replaced; observed values are left untouched.
data['LifeSquare'] = data['LifeSquare'].fillna(ls_fill_values)

In [11]:
# Fill missing LifeSquare values in `test` with the mean LifeSquare of the test
# rows that have the same number of rooms (statistics come from `test` itself).
test_with_ls = test[test['LifeSquare'].notna()]
rooms_ls_dict_test = test_with_ls.groupby('Rooms')['LifeSquare'].mean()
# Snapshot of the rows that need filling, taken before the fill is applied.
test_without_ls = test[test['LifeSquare'].isna()]

# Vectorised lookup of every row's per-room mean (replaces a row-by-row
# iterrows/.loc loop).
test_ls_fill_values = test['Rooms'].map(rooms_ls_dict_test)
# `test` is not range-filtered, so a room count may have no observed LifeSquare
# at all (a direct lookup would raise KeyError); fall back to the overall
# observed mean.
test_ls_fill_values = test_ls_fill_values.fillna(test_with_ls['LifeSquare'].mean())
# Only the missing entries are replaced; observed values are left untouched.
test['LifeSquare'] = test['LifeSquare'].fillna(test_ls_fill_values)

In [12]:
# Hold out 30% of the rows of `data` as `valid`; the remaining 70% become
# `train`. random_state is fixed so the split is the same on every run.
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [13]:
# Mean Price for every (DistrictId, Rooms) combination, computed from the
# training split only, exposed as a flat table with a 'mean_price' column.
price_by_district_rooms = train.groupby(['DistrictId', 'Rooms'])[['Price']].mean()
grouped = (
    price_by_district_rooms
    .reset_index()
    .rename(columns={'Price': 'mean_price'})
)

In [14]:
# Attach the (DistrictId, Rooms) mean price to every training row.
# The left join keeps all rows of `train`.
train = train.merge(grouped, on=['DistrictId', 'Rooms'], how='left')

In [15]:
# Attach the (DistrictId, Rooms) mean price to every validation row.
# The left join keeps all rows of `valid`; combinations absent from `grouped`
# get NaN in 'mean_price'.
valid = valid.merge(grouped, on=['DistrictId', 'Rooms'], how='left')

In [16]:
# Replace the missing 'mean_price' entries in `valid` with the average of the
# entries that are present in the same column.
valid_mean_price_fallback = valid['mean_price'].mean()
valid['mean_price'] = valid['mean_price'].fillna(valid_mean_price_fallback)

In [17]:
# Attach the (DistrictId, Rooms) mean price to every test row.
# The left join keeps all rows of `test`; combinations absent from `grouped`
# get NaN in 'mean_price'.
test = test.merge(grouped, on=['DistrictId', 'Rooms'], how='left')

In [18]:
# Replace the missing 'mean_price' entries in `test` with the average of the
# entries that are present in the same column.
test_mean_price_fallback = test['mean_price'].mean()
test['mean_price'] = test['mean_price'].fillna(test_mean_price_fallback)

In [26]:
# Names of the columns fed to the models, in the order they are passed in.
ft = [
    'Rooms',
    'Square',
    'LifeSquare',
    'KitchenSquare',
    'Floor',
    'HouseYear',
    'Ecology_1',
    'Social_1',
    'Social_2',
    'HouseFloor',
    'Social_3',
    'Helthcare_2',   # spelled this way in the data's own column header
    'Shops_1',
    'Ecology_2_A',
    'Ecology_2_B',
    'Ecology_3_A',
    'Ecology_3_B',
    'Shops_2_A',
    'Shops_2_B',
    'DistrictId',
    'mean_price',
]

In [27]:
# Baseline model: linear regression with the library's default settings.
lr = LinearRegression()

In [28]:
# Fit the baseline on the training split: the `ft` columns as inputs, 'Price' as
# the target. The fitted estimator is the cell's displayed value.
lr.fit(train.loc[:, ft], train['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [29]:
# Baseline predictions for the held-out validation rows.
pred = lr.predict(valid[ft])

In [30]:
# R^2 of the linear baseline on the validation split (displayed as cell output).
r2_score(y_true=valid['Price'], y_pred=pred)

0.561968551472849

In [31]:
# Random forest of 1000 trees, each limited to depth 13. random_state fixes the
# per-tree seeds so the fitted forest is reproducible.
# n_jobs=-1 builds the trees on all available cores; this only affects run time,
# not which trees are grown.
rfreg = RandomForestRegressor(
    max_depth=13,
    random_state=42,
    n_estimators=1000,
    n_jobs=-1,
)

In [32]:
# Fit the forest on the training split. `fit` returns the estimator itself, so
# `reg2` and `rfreg` refer to the same fitted model.
reg2 = rfreg.fit(train.loc[:, ft], train['Price'])

In [33]:
# Forest predictions for the held-out validation rows.
rf_predict = reg2.predict(valid[ft])

In [34]:
# R^2 of the random forest on the validation split (displayed as cell output).
r2_score(y_true=valid['Price'], y_pred=rf_predict)

0.6623992241501055

In [40]:
# Forest predictions for the test rows, using the same feature columns as in
# training.
pred_test = reg2.predict(test[ft])

In [43]:
# Show the predicted prices; NumPy abbreviates the array in the output.
pred_test

array([164995.07618753, 237551.50404123, 191137.8702041 , ...,
       308092.2389519 , 197392.52397366, 169168.48317627])

In [44]:
# Sanity check: number of predictions produced for the test rows.
pred_test.shape

(5000,)

In [45]:
# Store the forest's predictions on the test frame as the 'Price' column.
test = test.assign(Price=pred_test)

In [46]:
# Preview the first rows of `test`, now including 'mean_price' and the predicted
# 'Price'.
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Helthcare_2,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,mean_price,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,0,1,0,1,0,1,171007.447981,164995.076188
1,15856,74,2.0,69.263183,36.222168,1.0,6,1.0,1977,0.075779,...,0,2,0,1,0,1,0,1,233722.89376,237551.504041
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,...,5,5,0,1,0,1,0,1,213305.876441,191137.870204
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,3,0,1,0,1,0,1,201387.019443,257077.600005
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,0,1,0,1,1,0,124060.636162,140057.855082


In [52]:
# Write the submission file: one row per test record with its Id and predicted
# Price, without the DataFrame index.
# The path is built from its parts rather than a literal containing a backslash:
# '\E' is an invalid escape sequence, and a backslash is a directory separator
# only on Windows.
submission_path = Path('Kurs_project_task') / 'ElenaPopova_predictions.csv'
# Create the output directory if it is missing, so the write does not fail on a
# fresh checkout.
submission_path.parent.mkdir(parents=True, exist_ok=True)
test.loc[:, ['Id', 'Price']].to_csv(submission_path, index=False)