In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sberbank-russian-housing-market/data_dictionary.txt
/kaggle/input/sberbank-russian-housing-market/train.csv.zip
/kaggle/input/sberbank-russian-housing-market/macro.csv.zip
/kaggle/input/sberbank-russian-housing-market/test.csv.zip
/kaggle/input/sberbank-russian-housing-market/sample_submission.csv.zip
/kaggle/input/cleaned-csv-files/train_cleaned.csv
/kaggle/input/cleaned-csv-files/test_cleaned.csv


In [3]:
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn import metrics,preprocessing
import lightgbm as lgb

In [5]:
def preprocess(data):
    """Return a copy of ``data`` with engineered features and encoded flags.

    The input frame is left untouched; all changes are made on a copy.

    Steps, in order:
      * missing ``full_sq`` values are replaced with 50;
      * ``relative_floor`` = ``floor / max_floor`` and
        ``room_size`` = ``life_sq / num_room`` are added;
      * ``month``, ``year`` and ``dayOfWeek`` are derived from ``timestamp``;
      * ``bought_minus_built`` = sale year - ``build_year``;
      * ``product_type`` becomes 1 for ``'OwnerOccupier'`` and 0 otherwise;
      * in every non-numeric column the exact strings ``'Yes'`` / ``'No'``
        are replaced by 1 / 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``full_sq``, ``floor``, ``max_floor``, ``life_sq``,
        ``num_room``, ``build_year``, ``product_type`` and a datetime
        ``timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the six extra feature columns appended.

    Notes
    -----
    Not idempotent: once ``product_type`` has been converted to 0/1, calling
    this function again maps every value to 0.
    """
    data = data.copy()  # never modify the caller's frame

    data['full_sq'] = data['full_sq'].fillna(50)

    # A zero denominator yields inf here, exactly as a plain division would.
    data['relative_floor'] = data['floor'] / data['max_floor'].astype(float)
    data['room_size'] = data['life_sq'] / data['num_room'].astype(float)

    sale_date = data['timestamp'].dt
    data['month'] = sale_date.month.astype(int)
    data['year'] = sale_date.year.astype(int)
    data['dayOfWeek'] = sale_date.dayofweek.astype(int)
    data['bought_minus_built'] = data['year'] - data['build_year']

    data['product_type'] = (data['product_type'] == 'OwnerOccupier').astype('int64')

    # Only non-numeric columns can hold the strings 'Yes'/'No', so remap just
    # those instead of visiting every cell of the frame twice with applymap
    # (which is also deprecated in recent pandas releases).
    # NOTE(review): the match is case-sensitive; confirm the flag columns
    # really use 'Yes'/'No' and not 'yes'/'no'.
    text_columns = data.select_dtypes(exclude=['number', 'datetime', 'bool']).columns
    for column in text_columns:
        data[column] = data[column].map(
            lambda v: 1 if v == 'Yes' else (0 if v == 'No' else v)
        )
    return data

In [6]:
# Both cleaned files share the same layout: `id` is the row index and
# `timestamp` is parsed as a datetime column.
read_options = dict(index_col=['id'], parse_dates=['timestamp'])
train = pd.read_csv('../input/cleaned-csv-files/train_cleaned.csv', **read_options)
test = pd.read_csv('../input/cleaned-csv-files/test_cleaned.csv', **read_options)
print(train.shape, test.shape)


(30470, 291) (7662, 290)


In [7]:
# Count the columns of each dtype, first for train and then for test.
for frame in (train, test):
    print(frame.dtypes.value_counts())

int64             155
float64           120
object             15
datetime64[ns]      1
dtype: int64
int64             157
float64           117
object             15
datetime64[ns]      1
dtype: int64


In [8]:
# Add the engineered features to the training frame; `train` is rebound to the
# frame returned by `preprocess`.
# NOTE: do not run this cell twice. `preprocess` maps `product_type` from its
# string labels, so a second pass over the converted frame sets every value to 0.
train=preprocess(train)
train.shape

(30470, 297)

In [9]:
# Apply the same feature engineering to the test frame; `test` is rebound to
# the frame returned by `preprocess`.
# NOTE: do not run this cell twice. `preprocess` maps `product_type` from its
# string labels, so a second pass over the converted frame sets every value to 0.
test=preprocess(test)
test.shape

(7662, 296)

In [10]:
# Dtype counts after preprocessing. A notebook cell only displays its last bare
# expression, so both summaries are printed explicitly; otherwise the train
# counts are computed and silently thrown away.
print(train.dtypes.value_counts())
print(test.dtypes.value_counts())


int64             161
float64           120
object             14
datetime64[ns]      1
dtype: int64

In [11]:
# Class balance of the encoded product type (1 = OwnerOccupier, 0 = everything else).
train['product_type'].value_counts()

0    19447
1    11023
Name: product_type, dtype: int64

In [12]:
# Quarterly price-level multipliers, copied from Andy Harless' script:
#     https://www.kaggle.com/aharless/exercising-the-exorcism
#
# 2015Q2 is the base quarter (multiplier 1). Walking backwards in time, the
# multiplier of each earlier quarter is the multiplier of the quarter that
# follows it divided by the factor listed next to it.
BASE_QUARTER = (2015, 2)
BACKWARD_DIVISORS = [
    ((2015, 1), 0.9932),
    ((2014, 4), 1.0112),
    ((2014, 3), 1.0169),
    ((2014, 2), 1.0086),
    ((2014, 1), 1.0126),
    ((2013, 4), 0.9902),
    ((2013, 3), 1.0041),
    ((2013, 2), 1.0044),
    ((2013, 1), 1.0104),  # This is 1.002 (relative to mult), close to 1:
    ((2012, 4), 0.9832),  #     maybe use 2013q1 as a base quarter and get rid of mult?
    ((2012, 3), 1.0277),
    ((2012, 2), 1.0279),
    ((2012, 1), 1.0279),
    ((2011, 4), 1.076),
    ((2011, 3), 1.0236),
    ((2011, 2), 1),
    ((2011, 1), 1.011),
]


def _quarter_key(year, quarter):
    """Encode (year, quarter) as one integer, e.g. (2014, 3) -> 20143."""
    return year * 10 + quarter


# Build {quarter key: multiplier} with the same chain of divisions as the
# hand-written rate_YYYY_qN assignments this table replaces.
quarter_rates = {_quarter_key(*BASE_QUARTER): 1}
rate = 1
for (rate_year, rate_quarter), divisor in BACKWARD_DIVISORS:
    rate = rate / divisor
    quarter_rates[_quarter_key(rate_year, rate_quarter)] = rate

# Look up the multiplier of each sale's calendar quarter in a single vectorised
# pass. Sales outside 2011Q1-2015Q2 keep a multiplier of 1. The column is float
# from the start, so no float values are written into an integer column.
sale_quarter_key = _quarter_key(train['timestamp'].dt.year, train['timestamp'].dt.quarter)
train['average_q_price'] = sale_quarter_key.map(quarter_rates).fillna(1.0)

# NOTE: this rescales `price_doc` in place, so running the cell a second time
# would apply the multipliers twice.
train['price_doc'] = train['price_doc'] * train['average_q_price']


In [13]:

# Keep the target aside, then remove it together with the columns that are not
# used as model features.
Y = train['price_doc']
train = train.drop(columns=['sub_area', 'ecology', 'price_doc', 'average_q_price', 'timestamp'])

In [14]:
# Remove from test the same non-feature columns that were removed from train.
test = test.drop(columns=['sub_area', 'ecology', 'timestamp'])

In [15]:


# Label-encode the remaining text columns.
#
# A column shared by train and test gets ONE encoder fitted on the union of the
# values of both frames, so a given category is always mapped to the same
# integer. Fitting independent encoders per frame (as before) assigns codes by
# the sorted categories of each frame separately, which silently disagrees as
# soon as the two frames do not contain exactly the same set of categories.
for c in train.columns:
    in_test = c in test.columns
    if train[c].dtype == 'object' or (in_test and test[c].dtype == 'object'):
        # Cast to str so both frames (and any missing values) are comparable.
        train_values = list(train[c].astype(str).values)
        test_values = list(test[c].astype(str).values) if in_test else []
        lbl = preprocessing.LabelEncoder()
        lbl.fit(train_values + test_values)
        train[c] = lbl.transform(train_values)
        if in_test:
            test[c] = lbl.transform(test_values)

# Text columns that exist only in test have no train counterpart to agree with,
# so they are encoded on their own.
for c in test.columns:
    if test[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(test[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [16]:
# Sanity check: with the target removed, train and test should now report the
# same number of columns.
print(train.shape,test.shape)

(30470, 293) (7662, 293)


In [17]:
class two_estimator:
    """Pair of XGBoost regressors selected by ``product_type``.

    * ``est1`` (owner model) is fitted on the rows with ``product_type == 1``
      and learns the target divided by ``full_sq``; ``predict`` multiplies its
      output back by ``full_sq``.
    * ``est2`` (investment model) is fitted on every row passed to ``fit``
      with the unscaled target, and is used to predict the rows with
      ``product_type == 0``.
    """

    # Value substituted for a missing ``full_sq`` at prediction time.
    FULL_SQ_FILL = 50

    def __init__(self, owner_params, invest_params):
        """Create both regressors from their keyword-argument dictionaries."""
        self.est1 = xgb.XGBRegressor(**owner_params)
        self.est2 = xgb.XGBRegressor(**invest_params)

    def preprocess_owner(self, data, mode):
        """Validate owner rows and, when ``mode == 'predict'``, fill ``full_sq``.

        ``data`` is modified in place and returned. Every row must have
        ``product_type == 1``.
        """
        assert (data['product_type'].values == 1).all()
        if mode == 'predict':
            data.loc[data['full_sq'].isnull(), 'full_sq'] = self.FULL_SQ_FILL
        return data

    def preprocess_invest(self, data, mode):
        """Validate investment rows and fill ``full_sq`` when predicting.

        ``data`` is modified in place and returned. Nothing is checked or
        changed in any other mode, because ``fit`` passes every row
        (owner rows included) to the investment model.
        """
        if mode == 'predict':
            assert (data['product_type'].values == 0).all()
            data.loc[data['full_sq'].isnull(), 'full_sq'] = self.FULL_SQ_FILL
        return data

    def fit(self, X, y):
        """Fit both regressors.

        Parameters
        ----------
        X : pandas.DataFrame
            Features; must contain ``product_type`` and ``full_sq``.
        y : pandas.Series
            Target, indexed by the same labels as ``X``.

        Returns
        -------
        two_estimator
            ``self``, so calls can be chained.
        """
        X1 = self.preprocess_owner(X[X['product_type'] == 1], 'train')
        X2 = self.preprocess_invest(X, 'train')
        if len(X1) > 0:
            # The owner model learns the target per unit of full_sq.
            y1 = y.loc[X1.index.values] / X1['full_sq']
            self.est1.fit(X1, y1)
        if len(X2) > 0:
            self.est2.fit(X2, y)
        return self

    def predict(self, X):
        """Predict the target for every row of ``X``.

        Rows are routed positionally, so the result is in the row order of
        ``X`` even when the index contains duplicates, and an empty frame
        yields an empty array.

        Returns
        -------
        numpy.ndarray
            One-dimensional float array with ``len(X)`` entries.

        Raises
        ------
        ValueError
            If a row has a ``product_type`` other than 0 or 1.
        """
        product_type = X['product_type'].to_numpy()
        owner_mask = product_type == 1
        invest_mask = product_type == 0
        if not (owner_mask | invest_mask).all():
            raise ValueError("product_type must be 0 or 1 for every row")

        # Work on copies: the helpers fill `full_sq` in place and must touch
        # neither the caller's frame nor a filtered view of it.
        X1 = self.preprocess_owner(X[owner_mask].copy(), 'predict')
        X2 = self.preprocess_invest(X[invest_mask].copy(), 'predict')

        res = np.full(len(X), np.nan)
        if len(X1) > 0:
            res[owner_mask] = self.est1.predict(X1) * X1['full_sq'].to_numpy()
        if len(X2) > 0:
            res[invest_mask] = self.est2.predict(X2)
        return res

In [28]:
# Train 50 `two_estimator` models that differ only in their random seed.
# The owner and investment regressors currently use the same settings, so the
# common part is written once and the seed is added per iteration.
shared_params = {
    'n_estimators':200,
    'eta':0.05,
    'max_depth':6,
    'min_child_weight':1,
    'subsample':0.8,
    'colsample_bytree':0.9,
    'colsample_bylevel':1,
    'reg_alpha':0,
    'reg_lambda':1,
    'eval_metric':'rmse',
    'objective':'reg:squarederror',
    'nthread':8
}

estimators = []
for i in range(50):
    owner_params = dict(shared_params, seed=i)
    invest_params = dict(shared_params, seed=i)
    est = two_estimator(owner_params, invest_params)
    est.fit(train, Y)
    estimators.append(est)
    # Report progress after every tenth fitted model.
    o = i + 1
    if o % 10 == 0:
        print(o)
        

10
20
30
40
50


In [1]:
# Average the test predictions of all fitted estimators.
# Requires the imports cell and the training cell to have been run first.
preds = np.zeros(len(test))
for cnt, model in enumerate(estimators):
    preds += model.predict(test)
    # Progress marker before every tenth estimator (0, 10, 20, ...).
    if cnt % 10 == 0:
        print(cnt)
preds /= len(estimators)


NameError: name 'np' is not defined

In [22]:
import zipfile
# Unpack the sample submission archive into the current working directory.
sample_zip = '../input/sberbank-russian-housing-market/sample_submission.csv.zip'
with zipfile.ZipFile(sample_zip) as archive:
    archive.extractall('.')

In [27]:
# Multiplier applied to the averaged predictions before they are submitted.
# NOTE(review): presumably a hand-tuned price-level correction; document where
# 0.96 comes from.
PRICE_SCALE = 0.96

# Read the template from the current directory, which is where the extraction
# cell unpacks it (extractall('.')); '../working/' only points to the same
# place when the current directory happens to be named 'working'.
sample = pd.read_csv('sample_submission.csv')

# `preds` is in the row order of `test`, and is written into `sample` by
# position, so the ids must line up exactly.
assert np.array_equal(sample['id'].to_numpy(), test.index.to_numpy()), \
    "sample_submission ids do not match the order of the test set"

pred1 = preds * PRICE_SCALE
sample['price_doc'] = pred1
sample.to_csv('model2.csv', index=False)
sample.head()

Unnamed: 0,id,price_doc
0,30474,5509215.0
1,30475,8219325.0
2,30476,5391261.0
3,30477,6270258.0
4,30478,5169080.0
