In [167]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
%matplotlib inline

In [168]:
data = pd.read_csv('train.csv')

In [169]:
test = pd.read_csv('test.csv')

In [170]:
data = data.drop('Healthcare_1', axis=1)

In [171]:
test = test.drop('Healthcare_1', axis=1)

In [172]:
data = data.loc[data['Rooms']<10, :]
data = data.loc[data['Price'].between(30000, 600000), :]
data = data.loc[data['Square'].between(15, 200), :]
data = data.loc[data['HouseYear'].between(1900, 2018), :]
data = data.loc[data['Rooms'].between(1, 10), :]

In [173]:
data = pd.get_dummies(data)

In [174]:
test = pd.get_dummies(test)

In [175]:
data_with_ls = data[data['LifeSquare'].notna()]
rooms_ls_dict = data_with_ls.groupby('Rooms')['LifeSquare'].mean()
data_without_ls = data[~data['LifeSquare'].notna()]
for index, row in data.iterrows():
    if np.isnan(row['LifeSquare']):
        data.loc[index, 'LifeSquare'] = rooms_ls_dict[row['Rooms']]

In [176]:
test_with_ls = test[test['LifeSquare'].notna()]
rooms_ls_dict_test = test_with_ls.groupby('Rooms')['LifeSquare'].mean()
test_without_ls = test[~test['LifeSquare'].notna()]
for index, row in test.iterrows():
    if np.isnan(row['LifeSquare']):
        test.loc[index, 'LifeSquare'] = rooms_ls_dict_test[row['Rooms']]

In [177]:
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [178]:
def myround(x, base=20):
    return int(base * round(float(x)/base))

train['Square_Class'] = train['Square'].apply(myround)
valid['Square_Class'] = valid['Square'].apply(myround)
test['Square_Class'] = test['Square'].apply(myround)

np.where(data['Floor'] > data['HouseFloor'], data['Floor'], data['HouseFloor']);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [179]:
grouped = train.groupby(['Square_Class', 'Rooms'])[['Price']].mean().reset_index().rename(columns={'Price': 'mean_price'})

In [180]:
test = pd.merge(test, grouped, on=['Square_Class', 'Rooms'], how='left')

In [181]:
test['mean_price'] = test['mean_price'].fillna(test['mean_price'].mean())

In [182]:
train_extended = pd.merge(train, grouped, on=['Square_Class', 'Rooms'], how='left')
valid_extended = pd.merge(valid, grouped, on=['Square_Class', 'Rooms'], how='left')

In [183]:
valid_extended['mean_price'] = valid_extended['mean_price'].fillna(valid_extended['mean_price'].mean())

In [184]:
ft = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2', 'HouseFloor', 
      'Social_3', 'Helthcare_2', 'Shops_1', 'Ecology_2_A',
       'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A', 'Shops_2_B', 'DistrictId', 'mean_price']

In [185]:
lr = LinearRegression()

In [186]:
lr.fit(train_extended[ft], train['Price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [187]:
pred = lr.predict(valid_extended.loc[:, ft])

In [188]:
r2_score(valid['Price'], pred)

0.5201270942039795

In [160]:
rfreg = RandomForestRegressor(max_depth=13, random_state=42, n_estimators=1000)

In [189]:
reg2 = rfreg.fit(train_extended[ft], train['Price'])

In [190]:
rf_predict = reg2.predict(valid_extended.loc[:, ft])

In [191]:
r2_score(valid['Price'], rf_predict)

0.7113292426659977

In [193]:
pred_test = reg2.predict(test.loc[:, ft])

In [194]:
pred_test

array([172212.40917722, 226819.17747853, 183763.00951855, ...,
       316821.92633781, 205748.59972525, 174274.43594933])

In [197]:
pred_test.shape

(5000,)

In [198]:
test['Price'] = pred_test

In [199]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,Square_Class,mean_price,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,...,0,0,1,0,1,0,1,40,195797.153786,172212.409177
1,15856,74,2.0,69.263183,36.222168,1.0,6,1.0,1977,0.075779,...,2,0,1,0,1,0,1,60,219137.847233,226819.177479
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,...,5,0,1,0,1,0,1,20,118369.153852,183763.009519
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,...,3,0,1,0,1,0,1,80,258654.276184,312277.45432
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,...,0,0,1,0,1,1,0,40,156426.297707,138527.654833


In [200]:
test.loc[:, ['Id', 'Price']].to_csv('Kurs_project_task\ElenaPopova_predictions.csv', index=None)