In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('data.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')] #removing the first (unnamed) column as it just corresponds to the index of each row
cols = list(df.columns.values) #need to save column labels for later (after imputer applied)
df.head()

Unnamed: 0,bedroom,reception,bathroom,price,address
0,3.0,,1.0,100000,DL14
1,2.0,2.0,1.0,79000,DL4
2,3.0,3.0,,225000,SR8
3,4.0,,,230000,DL2
4,3.0,1.0,1.0,48000,TS24


In [3]:
indexes = np.where(df['price']=="OA")[0] #price column has 5 instances of value "OA", these instances are removed
df.drop(indexes, inplace=True)
df['price'] = df['price'].astype(float) #set price column type to float

In [4]:
imputer = KNNImputer(n_neighbors=2)
address = df.pop("address") #remove address before fitting with imputer (add back to the dataframe later)
df = pd.DataFrame(imputer.fit_transform(df))
df = df.join(pd.DataFrame(address))
df.columns = cols #reset column labels after applying imputer 
df

Unnamed: 0,bedroom,reception,bathroom,price,address
0,3.0,1.0,1.0,100000.0,DL14
1,2.0,2.0,1.0,79000.0,DL4
2,3.0,3.0,1.0,225000.0,SR8
3,4.0,2.0,1.5,230000.0,DL2
4,3.0,1.0,1.0,48000.0,TS24
...,...,...,...,...,...
1814,1.0,1.0,1.0,49950.0,DL15
1815,2.0,1.0,1.0,45000.0,DH9
1816,2.0,1.0,1.0,129950.0,TS18
1817,3.0,1.0,2.0,140000.0,TS20


In [5]:
def drop(data, column, value, regex=True): #regex flag is whether to match via first n characters or exact match to inputted string
    if regex == True:
        indexes = np.where(data[column].str[:len(value)] == value)[0]
    else:
        indexes = np.where(data[column] == value)[0]
        
    # print(df.loc[indexes])
    data.drop(indexes, inplace=True)
    
    return data.reset_index(drop=True) #resetting the row indexes for the dataframe after rows have been removed. 

In [6]:
df = drop(df, "address", "Durham")
df = drop(df, "address", "EH")
df = drop(df, "address", "NE")
df = drop(df, "address", "TS2", False) #we want an exact match to TS2 to be removed, hence regex=False
df = drop(df, "address", "TS5", False)
df

Unnamed: 0,bedroom,reception,bathroom,price,address
0,3.0,1.0,1.0,100000.0,DL14
1,2.0,2.0,1.0,79000.0,DL4
2,3.0,3.0,1.0,225000.0,SR8
3,4.0,2.0,1.5,230000.0,DL2
4,3.0,1.0,1.0,48000.0,TS24
...,...,...,...,...,...
1801,1.0,1.0,1.0,49950.0,DL15
1802,2.0,1.0,1.0,45000.0,DH9
1803,2.0,1.0,1.0,129950.0,TS18
1804,3.0,1.0,2.0,140000.0,TS20


In [7]:
add_columns = pd.get_dummies(df['address'])
add_columns

Unnamed: 0,DH1,DH2,DH3,DH4,DH5,DH6,DH7,DH8,DH9,DL1,...,TS20,TS21,TS22,TS23,TS24,TS25,TS26,TS27,TS28,TS29
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1802,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1803,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1804,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [8]:
encoded = df.drop('address', axis=1)
encoded = encoded.join(add_columns)
encoded

Unnamed: 0,bedroom,reception,bathroom,price,DH1,DH2,DH3,DH4,DH5,DH6,...,TS20,TS21,TS22,TS23,TS24,TS25,TS26,TS27,TS28,TS29
0,3.0,1.0,1.0,100000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,2.0,1.0,79000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,3.0,1.0,225000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,2.0,1.5,230000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,1.0,1.0,48000.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801,1.0,1.0,1.0,49950.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1802,2.0,1.0,1.0,45000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1803,2.0,1.0,1.0,129950.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1804,3.0,1.0,2.0,140000.0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [9]:
X = encoded.drop('price', axis=1)
X

Unnamed: 0,bedroom,reception,bathroom,DH1,DH2,DH3,DH4,DH5,DH6,DH7,...,TS20,TS21,TS22,TS23,TS24,TS25,TS26,TS27,TS28,TS29
0,3.0,1.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,2.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,3.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,2.0,1.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,1.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801,1.0,1.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1802,2.0,1.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1803,2.0,1.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1804,3.0,1.0,2.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
for col in X.columns:
    print(col)

print(len(X.columns))

bedroom
reception
bathroom
DH1
DH2
DH3
DH4
DH5
DH6
DH7
DH8
DH9
DL1
DL12
DL13
DL14
DL15
DL16
DL17
DL2
DL3
DL4
DL5
SR7
SR8
TS16
TS17
TS18
TS19
TS20
TS21
TS22
TS23
TS24
TS25
TS26
TS27
TS28
TS29
39


In [11]:
len(['DH1', 'DH2', 'DH3', 'DH4', 'DH5', 'DH6', 'DH7', 'DH8', 'DH9', 'DL1', 'DL12', 'DL13', 'DL14', 'DL15', 'DL16', 'DL17', 'DL2', 'DL3', 'DL4', 'DL5', 'SR7', 'SR8', 'TS16', 'TS17', 'TS18', 'TS19', 'TS20', 'TS21', 'TS22', 'TS23', 'TS24', 'TS25', 'TS26', 'TS27', 'TS28', 'TS29'])

36

In [12]:
y = encoded.pop('price')
y.head()

0    100000.0
1     79000.0
2    225000.0
3    230000.0
4     48000.0
Name: price, dtype: float64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [14]:
def return_encoded_postcode(postcode_str):
   encoded = np.zeros(36)
   postcodes = ['DH1', 'DH2', 'DH3', 'DH4', 'DH5', 'DH6', 'DH7', 'DH8', 'DH9', 'DL1', 'DL12', 'DL13', 'DL14', 'DL15', 'DL16', 'DL17', 'DL2', 'DL3', 'DL4', 'DL5', 'SR7', 'SR8', 'TS16', 'TS17', 'TS18', 'TS19', 'TS20', 'TS21', 'TS22', 'TS23', 'TS24', 'TS25', 'TS26', 'TS27', 'TS28', 'TS29']
   i = 0
   for i in range(36):
      if postcode_str == postcodes[i]:
         encoded[i] = 1
   return list(encoded)

In [15]:
return_encoded_postcode('TS22')

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [16]:
arguments = pd.Series([5, 3, 1.5, 459950] + return_encoded_postcode("TS22"))
arguments = pd.DataFrame(arguments.values.reshape((1,-1)))

In [17]:
forest = RandomForestRegressor()
scaler = StandardScaler()
scaler = scaler.fit_transform(X_train)
forest.fit(X_train, y_train)

RandomForestRegressor()

In [18]:
val = forest.predict(arguments)



ValueError: X has 40 features, but RandomForestRegressor is expecting 39 features as input.

In [None]:
val[0]