In [35]:
from datetime import datetime

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Shared random seed: passed as `random_state` to both train_test_split and
# RandomForestRegressor inside get_y_pred so splits and forests are repeatable.
seed = 6

In [2]:
# Load the one-hot-encoded dataset and report how long the read takes.
start = datetime.now()
df = pd.read_csv(filepath_or_buffer='data/ohenc.csv')
end = datetime.now()
load_duration = end - start
print(load_duration)
# Preview the first rows as the cell's rich output.
df.head()

0:03:01.989829


Unnamed: 0.1,Unnamed: 0,Price,Date_of_Transfer,County,Property_Type_T,Property_Type_S,Property_Type_F,Property_Type_O,Old_New_Y,Duration_L,...,County_BEDFORD,County_CITY OF DERBY,County_MEDWAY,County_CITY OF PLYMOUTH,County_MONMOUTHSHIRE,"County_BOURNEMOUTH, CHRISTCHURCH AND POOLE",County_ISLES OF SCILLY,County_WEST NORTHAMPTONSHIRE,County_NORTH NORTHAMPTONSHIRE,PPD_Category_Type_B
0,0,70000,1995-07-07 00:00,MILTON KEYNES,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,44500,1995-02-03 00:00,TYNE AND WEAR,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,56500,1995-01-13 00:00,ESSEX,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,58000,1995-07-28 00:00,WEST MIDLANDS,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,51000,1995-06-28 00:00,WEST MIDLANDS,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Drop every leftover "Unnamed: ..." column, not just 'Unnamed: 0'.
# The loaded frame shows both 'Unnamed: 0.1' and 'Unnamed: 0', each holding
# 0, 1, 2, ... in the preview -- presumably row indices written by earlier
# to_csv calls (confirm against how ohenc.csv was produced). Dropping only one
# of them leaves the other in the frame, where it would be picked up as a
# model feature because get_y_pred uses every non-Price, non-County column.
# Selecting by prefix also keeps this cell safe to re-run: a second run simply
# finds nothing to drop instead of raising KeyError.
start = datetime.now()
leftover_index_cols = df.columns[df.columns.str.startswith('Unnamed')]
df = df.drop(columns=leftover_index_cols)
end = datetime.now()
print(end - start)

0:00:24.791135


In [9]:
# Remove the one-hot encoded county indicator columns (prefix 'County_');
# the plain 'County' column does not match the prefix and is kept.
is_county_dummy = df.columns.str.startswith('County_')
county_ohenc_cols = df.columns[is_county_dummy]
df = df.drop(columns=county_ohenc_cols)
# Show the remaining (rows, columns) as the cell output.
df.shape

(26467225, 11)

In [11]:
# Drop the transfer date so it is not passed to the model as a feature
# (get_y_pred trains on every remaining column except Price and County).
# errors='ignore' keeps the cell re-runnable: `df` is reassigned in place, so
# without it a second execution would raise KeyError on the missing column.
df = df.drop(columns=['Date_of_Transfer'], errors='ignore')
df.head()

Unnamed: 0,Price,County,Property_Type_T,Property_Type_S,Property_Type_F,Property_Type_O,Old_New_Y,Duration_L,Duration_U,PPD_Category_Type_B
0,70000,MILTON KEYNES,0,0,0,0,0,0,0,0
1,44500,TYNE AND WEAR,1,0,0,0,0,0,0,0
2,56500,ESSEX,1,0,0,0,0,0,0,0
3,58000,WEST MIDLANDS,1,0,0,0,0,0,0,0
4,51000,WEST MIDLANDS,0,1,0,0,0,0,0,0


In [32]:
def get_y_pred(county_df):
    """Fit a random forest on one county's rows and predict a 10% hold-out.

    Prints the distinct 'County' values of the input as a progress marker,
    then trains on every column except 'County' and 'Price' to predict
    'Price'. Both the split and the forest use the module-level `seed`.

    Args:
        county_df: DataFrame with a 'County' column, a 'Price' column and the
            feature columns.

    Returns:
        Tuple ``(y_pred, y_test)``: the model's predictions for the held-out
        rows, followed by the held-out true prices. Note the order --
        predictions first -- which is the reverse of the ``(y_true, y_pred)``
        order taken by scikit-learn metric functions.
    """
    print(county_df['County'].unique())
    county_data = county_df.drop(columns=['County'])

    target = county_data['Price']
    features = county_data.drop('Price', axis=1)

    # Hold out 10% of the county's rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.1, random_state=seed
    )

    model = RandomForestRegressor(max_depth=200, random_state=seed)
    model.fit(X_train, y_train)
    return (model.predict(X_test), y_test)

In [33]:
# Train one model per county; each value is the (y_pred, y_test) tuple
# returned by get_y_pred, keyed by the county name.
county_preds = {}
for county, county_df in df.groupby('County'):
    county_preds[county] = get_y_pred(county_df)


['AVON']
['BATH AND NORTH EAST SOMERSET']
['BEDFORD']
['BEDFORDSHIRE']
['BERKSHIRE']
['BLACKBURN WITH DARWEN']
['BLACKPOOL']
['BLAENAU GWENT']
['BOURNEMOUTH']
['BOURNEMOUTH, CHRISTCHURCH AND POOLE']
['BRACKNELL FOREST']
['BRIDGEND']
['BRIGHTON AND HOVE']
['BUCKINGHAMSHIRE']
['CAERPHILLY']
['CAMBRIDGESHIRE']
['CARDIFF']
['CARMARTHENSHIRE']
['CENTRAL BEDFORDSHIRE']
['CEREDIGION']
['CHESHIRE']
['CHESHIRE EAST']
['CHESHIRE WEST AND CHESTER']
['CITY OF BRISTOL']
['CITY OF DERBY']
['CITY OF KINGSTON UPON HULL']
['CITY OF NOTTINGHAM']
['CITY OF PETERBOROUGH']
['CITY OF PLYMOUTH']
['CLEVELAND']
['CLWYD']
['CONWY']
['CORNWALL']
['COUNTY DURHAM']
['CUMBRIA']
['DARLINGTON']
['DENBIGHSHIRE']
['DERBYSHIRE']
['DEVON']
['DORSET']
['DURHAM']
['DYFED']
['EAST RIDING OF YORKSHIRE']
['EAST SUSSEX']
['ESSEX']
['FLINTSHIRE']
['GLOUCESTERSHIRE']
['GREATER LONDON']
['GREATER MANCHESTER']
['GWENT']
['GWYNEDD']
['HALTON']
['HAMPSHIRE']
['HARTLEPOOL']
['HEREFORD AND WORCESTER']
['HEREFORDSHIRE']
['HERTFORDSHIRE

In [37]:
# Per-county evaluation metrics, stored as parallel lists (one entry per county).
counties = []
mse = []
r2 = []
mape = []

# county_preds values are (y_pred, y_test) tuples -- predictions first.
for county, (y_pred, y_test) in county_preds.items():
    counties.append(county)
    # scikit-learn metrics take (y_true, y_pred). Unpacking the tuple with
    # `*tup` passed the predictions as the ground truth; MSE is symmetric and
    # unaffected, but R2 and MAPE normalise by the true values and came out
    # wrong. Pass the arguments explicitly in the correct order.
    mse.append(mean_squared_error(y_test, y_pred))
    r2.append(r2_score(y_test, y_pred))
    mape.append(mean_absolute_percentage_error(y_test, y_pred))

In [44]:
# Collect the per-county metric lists into one table (one row per county)
# and write it to disk without the positional index.
results = dict(County=counties, MSE=mse, R2=r2, MAPE=mape)

rdf = pd.DataFrame(data=results)
rdf.to_csv('RF_per_county.csv', index=False)