In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the labelled training set (`data`) and the frame to predict on (`test`).
data, test = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train.csv', 'input/test.csv')
)

In [3]:
# Hold out 30% of the labelled rows for validation with a fixed seed, and keep
# an explicit copy of each part (presumably so the in-place cleaning further
# down does not operate on slices of `data` -- confirm).
train, valid = (
    part.copy()
    for part in train_test_split(data, test_size=0.3, random_state=42)
)

In [4]:
def prepare_HouseYear(df):
    """Overwrite out-of-range construction years with 1977, in place.

    A year counts as out of range when it is later than 2020 or earlier
    than 1900. Missing values are left untouched.

    Returns the same frame object that was passed in.
    """
    year = df['HouseYear']
    out_of_range = (year > 2020) | (year < 1900)
    df.loc[out_of_range, 'HouseYear'] = 1977
    return df

In [5]:
def prepare_Rooms(df):
    """Overwrite room counts of 0 or of 5 and above with 2, in place.

    Missing values are left untouched. Returns the same frame object.
    """
    rooms = df['Rooms']
    implausible = (rooms >= 5) | (rooms == 0)
    df.loc[implausible, 'Rooms'] = 2
    return df

In [6]:
def prepare_Square(df):
    """Overwrite total areas above 250 or below 20 with 45, in place.

    Missing values are left untouched. Returns the same frame object.
    """
    area = df['Square']
    implausible = (area > 250) | (area < 20)
    df.loc[implausible, 'Square'] = 45
    return df

In [7]:
def prepare_LifeSquare(df):
    """Replace unusable living-area values with 92% of the total area, in place.

    A value is replaced when it is above 250, below 22, missing, or larger
    than the row's own ``Square``. Every case writes the same replacement
    (``Square * 0.92``), so a single combined mask is sufficient.

    Returns the same frame object that was passed in.
    """
    total = df['Square']
    living = df['LifeSquare']
    needs_fix = (
        (living > 250.0)
        | (living < 22)
        | living.isnull()
        | (living > total)
    )
    # The right-hand side is a Series; pandas aligns it on the index so each
    # flagged row receives the value computed from its own Square.
    df.loc[needs_fix, 'LifeSquare'] = total * 0.92
    return df

In [8]:
def prepare_KitchenSquare(df):
    """Replace kitchen areas above 20 or at most 3 with ``5 + 2 * Rooms``, in place.

    Missing values are left untouched. Returns the same frame object.
    """
    kitchen = df['KitchenSquare']
    implausible = (kitchen > 20) | (kitchen <= 3)
    # Index-aligned Series: each flagged row gets the estimate for its own Rooms.
    estimate = 5 + 2 * df['Rooms']
    df.loc[implausible, 'KitchenSquare'] = estimate
    return df

In [9]:
def prepare_HouseFloor(df):
    """Raise ``HouseFloor`` to ``Floor`` wherever it is smaller, in place.

    Returns the same frame object that was passed in.
    """
    flat_floor = df['Floor']
    building_too_low = df['HouseFloor'] < flat_floor
    df.loc[building_too_low, 'HouseFloor'] = flat_floor
    return df

In [10]:
def normalisation(df, cols_for_scale=None, scaler=None):
    """Min-max scale selected columns of ``df`` in place.

    Parameters
    ----------
    df : DataFrame
        Frame to scale; it is modified in place and also returned.
    cols_for_scale : list of str, optional
        Columns to scale. Defaults to ``['Rooms', 'Square', 'Price']``.
        Columns that are absent from ``df`` are skipped, so a frame without
        ``Price`` no longer raises ``KeyError``.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler to reuse, so that several frames share one
        scale. It must have been fitted on exactly the columns that end up
        being scaled here. When omitted, a new ``MinMaxScaler`` is fitted on
        ``df`` itself (the original behaviour).

    Returns
    -------
    DataFrame
        The same frame object that was passed in.
    """
    if cols_for_scale is None:
        cols_for_scale = ['Rooms', 'Square', 'Price']
    present = [col for col in cols_for_scale if col in df.columns]
    if not present:
        return df

    if scaler is None:
        # No shared scaler supplied: fit on this frame only.
        df[present] = MinMaxScaler().fit_transform(df[present])
    else:
        # Reuse the caller's fitted scaler instead of re-fitting per frame.
        df[present] = scaler.transform(df[present])
    return df

In [11]:
def FirstFloor(df):
    """Add a ``FirstFloor`` flag derived from ``Floor``, in place.

    Note the encoding: the flag is 0 when ``Floor <= 1`` and 1 when
    ``Floor > 1``, i.e. 1 marks flats *above* the first floor. Rows with a
    missing ``Floor`` receive neither value.

    Returns the same frame object that was passed in.
    """
    floor = df['Floor']
    at_or_below_first, above_first = floor <= 1, floor > 1
    df.loc[at_or_below_first, 'FirstFloor'] = 0
    df.loc[above_first, 'FirstFloor'] = 1
    return df

In [12]:
def Kitchen(df):
    """Add a ``Kitchen`` flag, in place: 1 when ``KitchenSquare > 10``, else 0.

    Rows with a missing ``KitchenSquare`` receive neither value.
    Returns the same frame object that was passed in.
    """
    area = df['KitchenSquare']
    small, large = area <= 10, area > 10
    df.loc[small, 'Kitchen'] = 0
    df.loc[large, 'Kitchen'] = 1
    return df

In [13]:
def Year(df):
    """Add a ``Year`` flag, in place: 1 when ``HouseYear > 2010``, else 0.

    Rows with a missing ``HouseYear`` receive neither value.
    Returns the same frame object that was passed in.
    """
    built = df['HouseYear']
    up_to_2010, after_2010 = built <= 2010, built > 2010
    df.loc[up_to_2010, 'Year'] = 0
    df.loc[after_2010, 'Year'] = 1
    return df

In [14]:
def Novostroy(df):
    """Add a ``Novostroy`` flag, in place: 1 when ``HouseYear > 2016``, else 0.

    Rows with a missing ``HouseYear`` receive neither value.
    Returns the same frame object that was passed in.
    """
    built = df['HouseYear']
    up_to_2016, after_2016 = built <= 2016, built > 2016
    df.loc[up_to_2016, 'Novostroy'] = 0
    df.loc[after_2016, 'Novostroy'] = 1
    return df

In [15]:
def MeanPricePerSquare(df, reference=None):
    """Attach the price per square unit of each (DistrictId, Rooms) group.

    The rate of a group is ``sum(Price) / sum(Square)`` over the rows of
    ``reference`` whose ``DistrictCount`` is greater than 2. Rows of ``df``
    whose (DistrictId, Rooms) pair has no rate get the mean of all group
    rates instead.

    Parameters
    ----------
    df : DataFrame
        Frame to enrich; needs ``DistrictId`` and ``Rooms``.
    reference : DataFrame, optional
        Frame the rates are computed from; needs ``DistrictId``, ``Rooms``,
        ``Price``, ``Square`` and ``DistrictCount``. Defaults to the
        notebook-level ``train`` frame, which is what this function always
        used before the parameter existed.

    Returns
    -------
    DataFrame
        A new frame (the result of a left merge) with a
        ``MeanPricePerSquare`` column. A column of that name already present
        in ``df`` is replaced, so calling the function again is safe.

    Notes
    -----
    When ``df`` and ``reference`` hold the same rows, each row's own ``Price``
    contributes to the rate it receives.
    """
    if reference is None:
        # Backward-compatible default: the module-level training frame.
        reference = train
    keys = ['DistrictId', 'Rooms']
    feature = 'MeanPricePerSquare'

    populated = reference.loc[reference['DistrictCount'] > 2, :]
    totals = populated.groupby(keys).agg({'Price': 'sum', 'Square': 'sum'})
    rates = (totals['Price'] / totals['Square']).rename(feature).reset_index()

    # Without this, a second call would make the merge emit
    # MeanPricePerSquare_x / _y and the lookup below would raise KeyError.
    if feature in df.columns:
        df = df.drop(columns=feature)

    df = pd.merge(df, rates, on=keys, how='left')
    df.loc[df[feature].isnull(), feature] = rates[feature].mean()
    return df

In [16]:
def DistrictCount(df):
    """Attach the number of rows of ``df`` that share each row's ``DistrictId``.

    The count is taken within ``df`` itself, so different frames get
    different counts for the same district.

    Returns
    -------
    DataFrame
        A new frame (the result of a left merge) with a ``DistrictCount``
        column. A column of that name already present in ``df`` is replaced,
        so calling the function again is safe.
    """
    counts = (
        df.groupby('DistrictId')
        .size()
        .rename('DistrictCount')
        .reset_index()
    )
    # Without this, a second call would make the merge emit
    # DistrictCount_x / DistrictCount_y instead of a single column.
    if 'DistrictCount' in df.columns:
        df = df.drop(columns='DistrictCount')
    return pd.merge(df, counts, on='DistrictId', how='left')

In [17]:
def MeanPricePerSquare2(df, reference=None):
    """Attach the price per square unit of each row's district.

    The rate of a district is ``sum(Price) / sum(Square)`` over the rows of
    ``reference`` whose ``DistrictCount`` is greater than 2. Rows of ``df``
    whose district has no rate get the mean of all district rates instead.

    Parameters
    ----------
    df : DataFrame
        Frame to enrich; needs ``DistrictId``.
    reference : DataFrame, optional
        Frame the rates are computed from; needs ``DistrictId``, ``Price``,
        ``Square`` and ``DistrictCount``. Defaults to the notebook-level
        ``train`` frame, which is what this function always used before the
        parameter existed.

    Returns
    -------
    DataFrame
        A new frame (the result of a left merge) with a
        ``MeanPricePerSquare2`` column. A column of that name already present
        in ``df`` is replaced, so calling the function again is safe.

    Notes
    -----
    When ``df`` and ``reference`` hold the same rows, each row's own ``Price``
    contributes to the rate it receives.
    """
    if reference is None:
        # Backward-compatible default: the module-level training frame.
        reference = train
    feature = 'MeanPricePerSquare2'

    populated = reference.loc[reference['DistrictCount'] > 2, :]
    totals = populated.groupby('DistrictId').agg({'Price': 'sum', 'Square': 'sum'})
    rates = (totals['Price'] / totals['Square']).rename(feature).reset_index()

    # Without this, a second call would make the merge emit
    # MeanPricePerSquare2_x / _y and the lookup below would raise KeyError.
    if feature in df.columns:
        df = df.drop(columns=feature)

    df = pd.merge(df, rates, on='DistrictId', how='left')
    df.loc[df[feature].isnull(), feature] = rates[feature].mean()
    return df

In [18]:
def MeanSquareRoom(df):
    """Add ``MeanSquareRoom`` (total area divided by room count), in place.

    Returns the same frame object that was passed in.
    """
    area_per_room = df['Square'].div(df['Rooms'])
    df['MeanSquareRoom'] = area_per_room
    return df

In [19]:
# Turn the three letter-coded columns into 0/1 integers in every split:
# 1 where the value is 'A', 0 for anything else.
for _frame in (train, valid, test):
    for _flag_col in ('Ecology_2', 'Ecology_3', 'Shops_2'):
        _frame[_flag_col] = (_frame[_flag_col] == 'A').astype(int)

In [20]:
# Remove the Healthcare_1 column from all three splits; it is not part of the
# model's feature list.
train, valid, test = (
    frame.drop(columns='Healthcare_1') for frame in (train, valid, test)
)

In [21]:
# Apply the same cleaning and feature-engineering sequence to train, valid and
# test. Statement order matters:
#   * prepare_Square runs before prepare_LifeSquare, whose replacement value
#     is derived from Square.
#   * prepare_Rooms runs before prepare_KitchenSquare and MeanSquareRoom,
#     which both read Rooms.
#   * DistrictCount runs on train before any MeanPricePerSquare2 call, because
#     MeanPricePerSquare2 by default reads the notebook-level `train` frame
#     and filters on its DistrictCount column.
# The helpers that assign through df.loc / df[...] modify the frame they are
# given, so their return value is ignored here. DistrictCount and
# MeanPricePerSquare2 return a new merged frame and must be reassigned.

# --- train ---
prepare_HouseYear(train)
prepare_Rooms(train)
prepare_Square(train)
prepare_LifeSquare(train)
prepare_KitchenSquare(train)
prepare_HouseFloor(train)
# Rebinds the global `train`; MeanPricePerSquare2 below relies on that.
train = DistrictCount(train)
FirstFloor(train)
# Disabled: per-(DistrictId, Rooms) variant of the price-per-square feature.
#train = MeanPricePerSquare(train)
train = MeanPricePerSquare2(train)
MeanSquareRoom(train)
Kitchen(train)
Year(train)
Novostroy(train)
# Disabled: min-max scaling.
#normalisation(train)

# --- valid ---
prepare_HouseYear(valid)
prepare_Rooms(valid)
prepare_Square(valid)
prepare_LifeSquare(valid)
prepare_KitchenSquare(valid)
prepare_HouseFloor(valid)
# Counts districts within `valid` itself, not within train.
valid = DistrictCount(valid)
FirstFloor(valid)
#valid = MeanPricePerSquare(valid)
# Called without a reference frame, so the rates come from the processed `train`.
valid = MeanPricePerSquare2(valid)
MeanSquareRoom(valid)
Kitchen(valid)
Year(valid)
Novostroy(valid)
#normalisation(valid)

# --- test ---
prepare_HouseYear(test)
prepare_Rooms(test)
prepare_Square(test)
prepare_LifeSquare(test)
prepare_KitchenSquare(test)
prepare_HouseFloor(test)
# Counts districts within `test` itself, not within train.
test = DistrictCount(test)
FirstFloor(test)
#test = MeanPricePerSquare(test)
# Called without a reference frame, so the rates come from the processed `train`.
test = MeanPricePerSquare2(test)
MeanSquareRoom(test)
Kitchen(test)
Year(test)
Novostroy(test)
#normalisation(test)

# Display the first rows of the processed training frame.
train.head(5)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Shops_1,Shops_2,Price,DistrictCount,FirstFloor,MeanPricePerSquare2,MeanSquareRoom,Kitchen,Year,Novostroy
0,14604,23,1.0,41.68138,22.796166,8.0,14,17.0,2015,0.075779,...,2,0,88504.384965,394,1.0,2225.14588,41.68138,0.0,1.0,0.0
1,5621,23,3.0,163.495333,161.504222,12.0,5,5.0,1977,0.014073,...,0,0,207007.956663,394,1.0,2225.14588,54.498444,1.0,0.0,0.0
2,235,87,1.0,39.710131,36.53332,8.0,4,17.0,1986,0.100456,...,6,1,182126.280899,21,1.0,4177.354863,39.710131,0.0,0.0,0.0
3,16258,48,3.0,96.056784,88.372241,11.0,15,15.0,2017,0.041125,...,10,0,524365.550705,62,1.0,4997.97294,32.018928,1.0,1.0,1.0
4,10773,77,3.0,79.195825,44.421062,10.0,16,17.0,1984,0.298205,...,3,0,322048.43399,28,1.0,4189.330218,26.398608,0.0,0.0,0.0


### Model

In [22]:
# Model inputs, in design-matrix order. 'MeanPricePerSquare' and
# 'DistrictCount' are currently excluded.
feats = (
    # Cleaned flat / building attributes.
    [
        'Rooms',
        'Square',
        'KitchenSquare',
        'HouseYear',
        'LifeSquare',
        'Floor',
        'HouseFloor',
    ]
    # Engineered features.
    + [
        'FirstFloor',
        'Novostroy',
        'MeanSquareRoom',
        'MeanPricePerSquare2',
        'Kitchen',
        'Year',
    ]
    # Neighbourhood indicators. NOTE(review): 'Helthcare_2' is presumably the
    # dataset's own column spelling -- confirm before "correcting" it.
    + [
        'Ecology_1',
        'Ecology_2',
        'Ecology_3',
        'Social_1',
        'Social_2',
        'Social_3',
        'Helthcare_2',
        'Shops_1',
        'Shops_2',
    ]
)

In [23]:
# Fit a ridge-regression baseline on the training split, then predict both the
# training and the validation rows with it.
base_model = Ridge(alpha=1.0)
base_model.fit(train[feats], train['Price'])
pred_train, pred_valid = (
    base_model.predict(frame[feats]) for frame in (train, valid)
)

In [24]:
# R^2 on the rows the model was fitted on.
r2_score(y_true=train['Price'], y_pred=pred_train)

0.7035938977204665

In [25]:
# R^2 on the held-out validation rows.
r2_score(y_true=valid['Price'], y_pred=pred_valid)

0.6516269645257696

In [26]:
# Store the baseline's predictions for the test frame in a new Price column.
test['Price'] = base_model.predict(test[feats])

In [27]:
# Write the Id / Price pairs without the row index. `index` is a boolean
# option, so pass False explicitly rather than relying on None being falsy.
test[['Id', 'Price']].to_csv('SSerova_predictions.csv', index=False)