In [4]:
import seaborn as sns
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import random

In [5]:
housing_data = pd.DataFrame.from_csv('cleaned.csv')
prices = housing_data.filter(['SalePrice'])

In [19]:
# Calculate the baseline for linear regression
avg_price = prices.mean()

In [7]:
# Identify the relevant features
relevant_features = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'YrSold']

feature_set = housing_data.filter(relevant_features)



# Deliberately ignoring features like 1st floor, 2nd floor, garage, masonry veneer area, because a pricey house !=
# high values in all such categories. It depends strongly on the housing type as well. LotArea is a catch-all for those
# values

#housing_features = housing_data[:, []]

#determine strength of correlation for features of interest
#X = pd.DataFrame(housing_data,columns=['OverallQual', 'GarageCars', 'GarageArea', 'YearRemodAdd', 'YearBuilt', 'WoodDeckSF', 'OpenPorchSF', 'Lot Area', 'SalePrice']) 
#corr = np.corrcoef(X,rowvar=False) #rowvar = False b/c columns are features
#print (corr)

In [8]:
splits = [(0.9, 0.1), (0.8, 0.2), (0.7, 0.3), (0.6, 0.4), (0.5, 0.5)]
random_states = [42, 1337, 420, 90210, 24]

In [35]:
def create_and_test_model(dataset, target, split, seed):
    np.random.seed(seed)
    index = np.random.rand(len(dataset)) < split[0]
    train_features = dataset[index]
    test_features = dataset[~index]
    
    train_target = target[index]
    test_target = target[~index]
    model = LinearRegression()
    model = model.fit(X=train_features, y=train_target)
    predictions = model.predict(X=test_features)
    MSE = sklearn.metrics.mean_squared_error(y_true = test_target, y_pred = predictions)
    l = ((test_target - predictions)**2).sum() / len(test_target)
    num_targets = len(test_target)
    baseline_pred = [avg_price] * num_targets
    BaselineMSE = sklearn.metrics.mean_squared_error(y_true = test_target, y_pred = baseline_pred)
    return model, MSE, BaselineMSE

In [10]:
# Create and test model for linear regression for 5 different train_test splits

In [37]:
for i in range(5):
    split = splits[i]
    seed = random_states[i]
    model, MSE, BaselineMSE = create_and_test_model(feature_set, prices, split, seed)
    print("The MSE for the split {} is {}".format(split, MSE))
    improvement = (BaselineMSE - MSE) * 100 / BaselineMSE 
    print("This MSE is {:.2f}% better than the baseline model".format(improvement))
    print("\n")

The MSE for the split (0.9, 0.1) is 3215765127.6261687
This MSE is 55.21% better than the baseline model


The MSE for the split (0.8, 0.2) is 2230191638.078183
This MSE is 65.04% better than the baseline model


The MSE for the split (0.7, 0.3) is 1763258783.6486974
This MSE is 70.23% better than the baseline model


The MSE for the split (0.6, 0.4) is 2377900151.7894597
This MSE is 60.20% better than the baseline model


The MSE for the split (0.5, 0.5) is 2137031026.2583961
This MSE is 66.24% better than the baseline model




In [11]:
# Calculate the baseline for KNN model

In [None]:
# Identify the relevant features

In [9]:
# Create and test KNN model for 5 different train_test splits