In [2]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)
import sklearn

In [3]:
# Import the dataset and check layout
# NOTE(review): "/London.csv" is an absolute path at the filesystem root, while the
# traceback recorded for this cell names 'London.csv' — confirm where the file actually
# lives; every later cell depends on df1 being loaded here.
df1 = pd.read_csv("/London.csv")
df1.head()

FileNotFoundError: [Errno 2] No such file or directory: 'London.csv'

In [None]:
# Checking for number of rows
df1.shape

In [None]:
# Checking for number of locations
len(df1.Location.unique())

In [None]:
# Creating an 'Area' column to better divide London locations: it holds the part of
# 'Postal Code' that comes before the first space. Note this adds the column to df1 itself.
df1['Area'] = df1['Postal Code'].str.split(' ').str[0]

In [None]:
# Checking number of areas
area_count =df1.groupby('Area')['Area'].agg('count').sort_values(ascending=False)
area_count

In [None]:
# Counting the areas that have 10 or fewer properties (the filter is <= 10, so 10 is included)
len(area_count[area_count <= 10])

In [None]:
# Keep the per-area property counts for areas with 10 or fewer properties.
# Despite the name, the filter is <= 10, and the result is a Series indexed by area
# (a slice of area_count), not a plain list.
area_less_than_10 = area_count[area_count <= 10]
area_less_than_10

In [None]:
# Turn Areas with less than 10 properties into other area
# df1.Location = df1.Location.apply(lambda x: 'other' if x in area_less_than_10 else x)

In [None]:
# Checking for null values
df1.isnull().sum()

In [None]:
# Null handling: the dropna() step below is commented out, so rows containing nulls are
# NOT dropped here — df2 is simply an independent copy of df1.
# df2 = df1.dropna()
# df2.isnull().sum()

df2 =df1.copy()

In [None]:
# Checking for any possible incorrectly input of data
df2['No. of Bedrooms'].unique()

In [None]:
# Checking to see if properties with large numbers of bedrooms are valid
df2.loc[df2['No. of Bedrooms'] > 8]

In [None]:
# Checking for the reason for properties to have zero bedrooms
df2.loc[df2['No. of Bedrooms'] == 0]

In [None]:
# Testing to see any incorrectly input area values
df2['Area in sq ft'].unique()

In [None]:
# Creating a function to identify if values are a float (correct format)
def is_float(x):
    """Return True if ``x`` can be converted by ``float()``, otherwise False.

    Used to flag 'Area in sq ft' entries that are not plain numbers (for example
    a range written as text). Anything ``float()`` accepts counts as valid, which
    includes strings such as "nan" and "inf".

    Only conversion failures are treated as "not a float": ``ValueError`` for
    unparsable text, ``TypeError`` for unsupported types such as ``None``, and
    ``OverflowError`` for integers too large for a float. Other exceptions
    (e.g. ``KeyboardInterrupt``) propagate instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [None]:
#  Applying function to create a table of all values which are not float values 
df2[~df2['Area in sq ft'].apply(is_float)]

In [None]:
# Derive a new dataframe from df2 that additionally carries the price per square foot
# (price divided by floor area) of every property
df3 = df2.assign(**{'Price per sqft': df2['Price'] / df2['Area in sq ft']})

In [None]:
df3.head()

In [None]:
# Checking for properties below the threshold of 300 sq ft per bedroom
df3[df3['Area in sq ft']/df3['No. of Bedrooms']<300].head()

In [None]:
# Remove outliers below the 300 sq ft per bedroom threshold.
# The filter is written as ~(ratio < 300) rather than ratio >= 300: a ratio that is
# inf or NaN (e.g. a bedroom count of 0, or a missing value) is not "< 300", so those
# rows are kept rather than removed.
df4 = df3[~(df3['Area in sq ft']/df3['No. of Bedrooms']<300)]
df4.shape

In [None]:
# Check for outliers in Price per sq ft
df4['Price per sqft'].describe()

In [None]:
# Create function to remove price per sq ft outliers of two standard deviations (outside 95%)
def remove_pps_outliers(df):
    """Drop rows whose 'Price per sqft' is more than two standard deviations
    away from the mean of their own 'Area'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Area' and 'Price per sqft'.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped area by area (in groupby order) with a fresh
        0..n-1 index. Rows whose 'Area' is missing are not part of any group and
        are therefore not returned. An input with no groups yields an empty
        frame with the input's columns.

    Notes
    -----
    * The standard deviation is the population one (ddof=0), as before.
    * Both bounds are inclusive. With the previous strict lower bound, an area
      whose standard deviation is 0 (a single listing, or identical prices)
      lost every row, because no value is strictly greater than the mean.
    """
    kept = []
    for _, subdf in df.groupby('Area'):
        pps = subdf['Price per sqft']
        m = pps.mean()
        st = pps.std(ddof=0)
        kept.append(subdf[pps.between(m - 2 * st, m + 2 * st)])

    if not kept:
        # pd.concat raises on an empty list; return an empty frame of the same layout.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)

In [None]:
# Apply function to dataframe
df5 = remove_pps_outliers(df4)
df5.shape

In [None]:
def plot_scatter(df,location):
    """Scatter-plot price against floor area for the 2- and 3-bedroom
    properties of one area.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Area', 'No. of Bedrooms', 'Area in sq ft' and 'Price'.
    location : str
        Value of the 'Area' column to plot; also used as the plot title.

    2-bedroom properties are drawn in blue and 3-bedroom ones in red on the
    current pyplot figure. Nothing is returned.
    """
    in_area = df['Area'] == location
    bedrooms2 = df[in_area & (df['No. of Bedrooms'] == 2)]
    bedrooms3 = df[in_area & (df['No. of Bedrooms'] == 3)]

    # Apply the 15x10 figure size only while this plot is drawn; rc_context restores
    # the previous rcParams on exit, so calling this function no longer changes the
    # figure size of every later plot in the notebook.
    with plt.rc_context({'figure.figsize': (15, 10)}):
        plt.scatter(bedrooms2['Area in sq ft'], bedrooms2['Price'], color='blue', label=' 2 Bedrooms', s=50)
        plt.scatter(bedrooms3['Area in sq ft'], bedrooms3['Price'], color='red', label=' 3 Bedrooms', s=50)
        plt.xlabel("Total Square Feet Area")
        plt.ylabel("Price")
        plt.title(location)
        plt.legend()
    

In [None]:
plot_scatter(df5, 'SW11')

In [None]:
def remove_bhk_outliers(df):
    """Drop properties that are cheaper per square foot than the average
    property with one bedroom fewer in the same area.

    For every 'Area', the mean 'Price per sqft' and the row count are computed
    per 'No. of Bedrooms'. A property with n bedrooms is removed when the
    (n - 1)-bedroom group of the same area has more than 5 rows and the
    property's 'Price per sqft' is below that group's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Area', 'No. of Bedrooms' and 'Price per sqft'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the excluded rows; the original index labels
        are preserved.
    """
    # Collect labels in a plain list. Accumulating them with np.append onto
    # np.array([]) turned integer labels into float64, which is lossy for large
    # labels and reallocates the array on every append.
    exclude_indices = []
    for _, location_df in df.groupby('Area'):
        by_bedrooms = list(location_df.groupby('No. of Bedrooms'))
        bedrooms_stats = {
            bedrooms: {
                'mean': bedrooms_df['Price per sqft'].mean(),
                'count': bedrooms_df.shape[0],
            }
            for bedrooms, bedrooms_df in by_bedrooms
        }
        for bedrooms, bedrooms_df in by_bedrooms:
            stats = bedrooms_stats.get(bedrooms - 1)
            # Only compare against a smaller-bedroom group that is large enough
            # (more than 5 rows) for its mean to be meaningful.
            if stats is not None and stats['count'] > 5:
                too_cheap = (bedrooms_df['Price per sqft'] < stats['mean']).to_numpy()
                exclude_indices.extend(bedrooms_df.index[too_cheap])
    return df.drop(index=exclude_indices)
df6 = remove_bhk_outliers(df5)

df6.shape

In [None]:
plot_scatter(df6, 'SW11')

In [None]:
# Checking where the majority of the remaining properties lie in terms of price per square foot
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df6["Price per sqft"], rwidth =0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

In [None]:
# Checking for outliers in the amount of bathrooms in a property
df6["No. of Bathrooms"].unique()

In [None]:
# Checking for properties that break the rule of thumb that a property should not have
# more bathrooms than its number of bedrooms + 2
df6[df6["No. of Bathrooms"]> df6["No. of Bedrooms"]+2]

In [None]:
# Select only the columns required for the model: the target ('Price'), the four numeric
# features and the categorical 'Area'
df7 = df6.loc[:, ['Price','Area in sq ft','No. of Bedrooms', 'No. of Bathrooms', 'No. of Receptions', 'Area']]

In [None]:
# Encoding the categoric variable Area to be used for regression
dummies = pd.get_dummies(df7.Area)

In [None]:
# Joining the encoded 'Area' column to the original dataframe
df8 = pd.concat([df7,dummies],axis = 'columns')
df8.head()

In [None]:
# Removing original 'Area' column
df9 = df8.drop('Area', axis ='columns')
df9.head()

In [None]:
# Creating X data without price as that is what is being predicted 
X = df9.drop('Price', axis='columns')
X.head()

In [None]:
 # Creating y data made up of the property prices needed to be predicted
y = df9.Price
y.head()

In [None]:
# Creating a testing and training datatset for the model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [None]:
# Fitting a linear regression on the training split and scoring it on the held-out test split.
# For a regressor, .score() returns the R^2 coefficient of determination rather than a
# classification accuracy.
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

In [None]:
# Carrying out cross validation to measure how well the model generalises:
# 5 random shuffle splits with 20% held out each time (fixed seed), each split scored
# with the estimator's default score (R^2 for LinearRegression)
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

# Negative values can be ignored and positive values lie between 69.5% and 80.9% accuracy
# NOTE(review): these figures are R^2 scores rather than accuracy percentages, and a
# negative R^2 means that split did worse than always predicting the mean price —
# confirm that ignoring such splits is justified.

In [None]:
# Finding the best model between lasso regression and decision tree, using GridSearchCV to perform hyper-parameter tuning
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    """Grid-search a Lasso and a decision-tree regressor and report the best of each.

    Every candidate parameter combination is evaluated with 5 shuffle splits
    (20% held out, fixed seed) using the estimator's default score.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns 'model', 'best_score' and
        'best_params'.
    """
    # The decision-tree criterion 'mse' was renamed to 'squared_error' in
    # scikit-learn 1.0 and the old spelling was removed in 1.2, where it raises an
    # error. Use whichever name the installed major version understands.
    if int(sklearn.__version__.split('.')[0]) >= 1:
        squared_error = 'squared_error'
    else:
        squared_error = 'mse'

    algos = {
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : [squared_error,'friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

# From the results the lasso regression model is the most accurate model

In [None]:
# Creating the lasso regression model with the optimum parameters and fitting it on all the data.
# selection='random' picks the coefficient to update at random, so random_state is fixed
# to make the fitted (and later pickled) model reproducible from run to run.
lrm = Lasso(alpha=1.0, selection = 'random', tol=1e-1, random_state=0)
lrm.fit(X, y)

In [None]:
# Creating a function to produce a price prediction
def predict_price(Area,sqft,bed,bath,rec):
    """Predict the price of a property with the fitted model ``lrm``.

    Parameters
    ----------
    Area : str
        Area code; expected to match one of the one-hot columns of ``X``.
    sqft : float
        Floor area in square feet.
    bed, bath, rec : int
        Number of bedrooms, bathrooms and receptions.

    Returns
    -------
    float
        The predicted price.

    Notes
    -----
    The first four columns of ``X`` are 'Area in sq ft', 'No. of Bedrooms',
    'No. of Bathrooms' and 'No. of Receptions', in that order; every following
    column is a one-hot area indicator.

    If ``Area`` is not a column of ``X``, no indicator is set and the prediction
    is made from the numeric features alone. The old ``[0][0]`` lookup raised an
    ``IndexError`` in that case, so its ``loc_index >= 0`` guard could never
    take effect.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bed
    x[2] = bath
    x[3] = rec

    loc_matches = np.where(X.columns == Area)[0]
    if loc_matches.size > 0:
        x[loc_matches[0]] = 1

    # Predict on a one-row frame with the training column names so the input is
    # matched to the features the model was fitted on.
    return lrm.predict(pd.DataFrame([x], columns=X.columns))[0]

In [None]:
# Testing the price predictor
predict_price('SW11',1000,2,2,2)

In [None]:
# Write the fitted model `lrm` to 'londonhouse__prices_model.pickle' (binary mode,
# relative to the current working directory).
# NOTE: loading a pickle can execute arbitrary code — only unpickle this file when it
# comes from a trusted source.
import pickle
with open('londonhouse__prices_model.pickle','wb') as f:
    pickle.dump(lrm,f)