In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


# Formatting the dataset

This file converts the dataset into a format that can be read by the models

In [3]:

# Read the raw London listings, then echo the column index so the
# available fields are visible in the cell output.
source_csv = 'london_house_prices.csv'
data = pd.read_csv(source_csv)

data.columns


Index(['id', 'bedrooms', 'bathrooms', 'tenure', 'garden', 'street',
       'size_sqft', 'price_pounds', 'nearest_station_name',
       'nearest_station_miles', 'postcode_outer'],
      dtype='object')

In [4]:


#Functions to remove unnecessary characters 
# Stops "St. John's" street and "St. Johns" street being read as different 

def tidy_up_string(string):
    """Normalise a name so that punctuation variants compare equal.

    Strips line breaks (Windows and Unix style, plus stray carriage
    returns), apostrophes, backticks and full stops. Only the punctuation
    is removed, never the letters around it, so "St. John's" and
    "St. Johns" both come out as "St Johns".

    Parameters
    ----------
    string : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # The two-character Windows line ending is listed before its parts so
    # it is consumed whole; the lone carriage return catches leftovers.
    unwanted_fragments = ("\r\n", "\n", "\r", "'", "`", ".")

    for fragment in unwanted_fragments:
        string = string.replace(fragment, '')

    return string
    




## Target Encoding 

Replaces each categorical variable with the average score for that category 

In [5]:
def target_encoding(data, unique_values, name_of_col):
    """Replace each category in ``name_of_col`` with its mean ``price_pounds``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``name_of_col`` and a ``price_pounds`` column.
        The column is overwritten on this frame (in place).
    unique_values : iterable
        The category values to encode. Values that are missing (NaN/None)
        or that do not occur in the column are skipped instead of causing
        a division by zero.
    name_of_col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``name_of_col`` encoded.

    Notes
    -----
    Rows whose category is missing stay missing. Rows whose category was
    not listed in ``unique_values`` keep their original value. Missing
    prices are ignored when averaging.
    """
    column = data[name_of_col]

    # One grouped pass over the frame instead of one full scan per category.
    # groupby leaves out rows whose category is missing.
    category_means = data.groupby(name_of_col)['price_pounds'].mean()

    # Only encode the requested values that actually have rows behind them.
    values_to_replace = {
        value: category_means.loc[value]
        for value in unique_values
        if value in category_means.index
    }

    encoded = column.map(values_to_replace)

    # map() blanks anything without an entry; put back the original value
    # for real (non-missing) categories that were not asked to be encoded.
    not_encoded = column.notna() & ~column.isin(list(values_to_replace))
    if not_encoded.any():
        encoded = encoded.astype(object)
        encoded[not_encoded] = column[not_encoded]

    data[name_of_col] = encoded

    return data


## Fix the address column 

The addresses in the dataset came in many different formats. An address typically contains 5 lines, but sometimes some lines were omitted.
<br><br> Addressees' name / House number
<br> street name
<br>Locality name 
<br>TOWN 
<br>FULL POSTCODE 

The only consistent part was the street name, so I extracted the street name from each address. I used .split(",") to split the address up by section. Where the street was located in the string depended on how many lines of the address were present. I pulled the street out of the address and ran it through the tidy up function. Finally, I replaced the original address with it in the column.


In [6]:
# Build a lookup from every distinct raw address to the single component
# that is kept for it, then apply the lookup to the whole column.
unique_streets = data.street.unique()
area_replace_list = {}

for street in unique_streets:

    parts = street.split(", ")
    part_count = len(parts)

    if 3 <= part_count <= 5:
        # For 3-, 4- and 5-part addresses the component kept is always
        # the second-to-last one.
        area_replace_list[street] = tidy_up_string(parts[part_count - 2])

    elif part_count == 2:
        # Two-part addresses are all given the same fixed label.
        area_replace_list[street] = "London"

    elif part_count == 1:
        # No ", " separator found: retry on bare commas and keep the
        # leading piece.
        leading_piece = parts[0].split(',')[0]
        area_replace_list[street] = tidy_up_string(leading_piece)

    # Addresses with more than five parts get no entry and are left as-is.


data['street'] = data['street'].replace(area_replace_list)


Do one-hot encoding on the tenure feature <br>
Drop unneeded columns<br>
Fill in NaNs

In [7]:

# One-hot encode the tenure feature as two boolean flags, then discard the
# columns that are no longer needed.
data = (
    data
    .assign(
        tenure_freehold=data["tenure"] == "freehold",
        tenure_leasehold=data["tenure"] == "leasehold",
    )
    .drop(columns=["tenure", "size_sqft", "id"])
)

# Fill in NaNs.
# - postcode_outer: a separate "X" category is acceptable because most of
#   the categories are located in central London.
# - bedrooms / bathrooms: assume every property has at least one of each.
data = data.fillna({"postcode_outer": "X", "bedrooms": 1, "bathrooms": 1})


## Call the Target Encoding Function

In [8]:
# Target-encode every remaining categorical column, one after another,
# in the same order as before.
for categorical_col in ('nearest_station_name', 'street', "postcode_outer"):
    data = target_encoding(data, data[categorical_col].unique(), categorical_col)






## Save data in a new file 

In [9]:
# Write the encoded frame to disk without the pandas row index.
output_csv = 'london_house_prices_ajusted.csv'
data.to_csv(output_csv, index=False)