In [3]:
from sklearn import linear_model
import pandas as pd 
from datetime import datetime    
import numpy as np


In [11]:
# Read the processed extract and apply basic cleaning: drop identifier-style
# columns, fill gaps, and replace the date with numeric day/month/year parts.
df = pd.read_csv('processed2022-10-31.csv')
df = df.drop(columns=['Unnamed: 0', 'ID'])

# Fill missing values with 0 in every column.
# NOTE(review): this runs before the date is parsed, so a missing 'date' would
# be parsed as a real timestamp instead of staying missing -- confirm the
# extract has no missing dates.
df = df.fillna(0)

# Parse the date column. The unit must be explicit: pandas >= 2.0 raises a
# TypeError for the unit-less 'datetime64' dtype in astype().
df['date'] = df['date'].astype('datetime64[ns]')

# Split the date into day, month and year. Direct assignment keeps any
# already-present column in its current position, so feature order is stable.
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df = df.drop(columns=['date', 'address'])

# Remove offices (rows whose type is 'O'). The copy makes df an independent
# frame rather than a filtered selection of the previous one.
df = df[df['type'] != 'O'].copy()

In [12]:
#Adjust for house price trend
def adjust_for_trend(df):
    '''Rescale ``value`` so that every year's mean equals the latest year's mean.

    For each year the mean of ``value`` is divided by the mean of the most
    recent year to give an adjustment ratio; each row's ``value`` is then
    divided by the ratio for its year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``year`` column and a numeric ``value`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and ``value`` rescaled.
        The input frame is not modified. The result comes from a merge, so
        its index is a fresh default index. If ``df`` has no rows (or no
        usable ``year`` values) an unchanged copy is returned.
    '''
    if df.empty:
        return df.copy()

    yearly_mean = df.groupby('year')['value'].mean()
    if yearly_mean.empty:
        # Every year was missing, so there is nothing to normalise against.
        return df.copy()

    # Pick the latest year explicitly instead of relying on row position, and
    # avoid float() on a one-element Series, which pandas has deprecated.
    latest_value = float(yearly_mean.loc[yearly_mean.index.max()])

    adjustment = (yearly_mean / latest_value).rename('adjustment').reset_index()

    # Explicit join key: only 'year' should ever be used to match rows.
    adjusted = df.merge(adjustment, on='year')
    adjusted['value'] = adjusted['value'] / adjusted['adjustment']
    return adjusted.drop(columns='adjustment')

# Switch for the trend adjustment; when False the values are left untouched.
adjust = True
df = adjust_for_trend(df) if adjust else df



In [19]:
# Export the per-year mean value and its ratio to the latest year's mean.
#
# The table gets its own name so it does not overwrite the boolean `adjust`
# flag used by the trend-adjustment cell.
#
# NOTE(review): when the notebook is run top to bottom with adjust=True,
# df['value'] has already been trend-adjusted by this point, so every ratio
# computed here comes out at ~1.0. If this file is meant to hold the raw
# adjustment factors, it must be computed before the adjustment is applied --
# confirm the intent.
price_adjustments = df[['year', 'value']].groupby('year').mean().reset_index()
# .iloc[-1] selects the last (latest-year) row as a scalar; float() on a
# one-element Series is deprecated in pandas.
latest_value = float(price_adjustments['value'].iloc[-1])
price_adjustments['adjustment'] = price_adjustments['value'] / latest_value
price_adjustments.to_csv('house_price_adjustments.csv')

In [13]:
# Split by time rather than at random: rows up to and including year_split
# are used for training, later rows for testing.
year_split = 2019
train_rows = df[df['year'] <= year_split]
test_rows = df[df['year'] > year_split]
X_train, y_train = train_rows.drop(columns='value'), train_rows['value']
X_test, y_test = test_rows.drop(columns='value'), test_rows['value']

In [14]:
from catboost import CatBoostRegressor

# Gradient-boosted regressor with a fixed seed so the fit is repeatable.
model = CatBoostRegressor(
    n_estimators=300,
    random_state=42,
    max_depth=10,
    l2_leaf_reg=10,
)

# Columns passed to CatBoost as categorical features.
cat_features = [
    'zip_code',
    'type',
    'commercial',
    'lease',
    'name_number',
    'flat',
    'road',
    'postcode_area',
    'postcode_sector',
    'day',
]

model.fit(X_train, y_train, cat_features=cat_features, silent=True)

<catboost.core.CatBoostRegressor at 0x7f08e0749520>

In [15]:
# Print each training column next to the importance the fitted model gave it.
for feature_name, importance in zip(X_train.columns, model.feature_importances_):
    print(feature_name, importance)

zip_code 4.916644253132373
type 19.37927787588498
commercial 0.18771443012605007
lease 1.9792103142121822
name_number 7.594804392670155
flat 10.787517381125717
road 5.087092458370975
year 7.910321601776155
month 2.9469805855267057
postcode_area 15.691438267815354
postcode_sector 16.17720879343436
day 7.341789645924947


In [16]:
# Score both splits with the fitted model.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Attach the predictions and the true values to the test features so they can
# be inspected and exported together (y_test is aligned on the index).
X_test = X_test.assign(predictions=test_predictions, actual=y_test)

In [17]:
def r_squared(actual, predicted):
    '''Return 1 - (residual sum of squares / total sum of squares).'''
    residual_ss = np.sum((predicted - actual) ** 2)
    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    return 1 - residual_ss / total_ss


print("Train R Squared ", r_squared(y_train, train_predictions))
print("Test R Squared ", r_squared(y_test, test_predictions))

Train R Squared  0.7380931430825565
Test R Squared  0.5661560765288804


In [None]:
# Write the test rows (features plus any columns added to X_test) to disk.
X_test.to_csv(path_or_buf='predictions.csv')