In [73]:
import seaborn as sns
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix
import random

In [56]:
# Load the cleaned housing table.
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv
# with index_col=0 and parse_dates=True reproduces its default behaviour
# (first column becomes the index, and date parsing is attempted on it).
housing_data = pd.read_csv('cleaned.csv', index_col=0, parse_dates=True)

In [57]:
# Calculate the baseline for linear regression


In [58]:
# Predictor columns for the SalePrice linear regression.
relevant_features = [
    'LotFrontage',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'YrSold',
]

# Per-area measurements (1st floor, 2nd floor, garage, masonry veneer area)
# are left out on purpose: an expensive house does not necessarily score high
# in every one of them, and which ones matter depends strongly on the housing
# type. LotArea is used as a catch-all for those values.

# Both results are DataFrames; `prices` is a single-column frame.
feature_set = housing_data.filter(items=relevant_features)
prices = housing_data.filter(items=['SalePrice'])

# Correlation strength for the features of interest was inspected with the
# snippet below (rowvar=False because columns are the features):
#X = pd.DataFrame(housing_data,columns=['OverallQual', 'GarageCars', 'GarageArea', 'YearRemodAdd', 'YearBuilt', 'WoodDeckSF', 'OpenPorchSF', 'Lot Area', 'SalePrice'])
#corr = np.corrcoef(X,rowvar=False)
#print (corr)

In [59]:
# (train fraction, test fraction) pairs evaluated by every experiment below.
splits = [
    (0.9, 0.1),
    (0.8, 0.2),
    (0.7, 0.3),
    (0.6, 0.4),
    (0.5, 0.5),
]

# One RNG seed per split, matched by position with `splits`.
random_states = [
    42,
    1337,
    420,
    90210,
    24,
]

In [60]:
def create_and_test_model(dataset, target, split, seed):
    """Fit a linear regression on a random train/test partition and score it.

    Every row is assigned to the training set independently with probability
    ``split[0]``; the remaining rows form the test set. The realised train
    fraction is therefore only approximately ``split[0]``.

    Parameters
    ----------
    dataset : object supporting boolean-mask row selection (e.g. DataFrame)
        Feature rows.
    target : object supporting boolean-mask row selection
        Target values aligned row-for-row with ``dataset``.
    split : sequence
        Only ``split[0]`` (the train probability) is read.
    seed : int
        Seed for the random row assignment.

    Returns
    -------
    tuple
        ``(model, MSE)``: the fitted ``LinearRegression`` and the mean squared
        error of its predictions on the held-out rows.
    """
    # A private RandomState produces the same draws as np.random.seed(seed)
    # followed by np.random.rand(...), but does not reseed the global NumPy
    # RNG as a hidden side effect of calling this function.
    rng = np.random.RandomState(seed)
    in_train = rng.rand(len(dataset)) < split[0]

    train_features, test_features = dataset[in_train], dataset[~in_train]
    train_target, test_target = target[in_train], target[~in_train]

    model = LinearRegression().fit(X=train_features, y=train_target)
    predictions = model.predict(X=test_features)
    MSE = mean_squared_error(y_true=test_target, y_pred=predictions)

    return model, MSE

Correlations with SalePrice are as follows: OverallQual (.79), GarageCars (.64), GarageArea (.62), YearRemodAdd (.51), YearBuilt (.52), WoodDeckSF (.32), OpenPorchSF (.32), LotArea (.26)

In [61]:
# Create and test model for linear regression for 5 different train_test splits

In [62]:
# Evaluate the regression once per (split, seed) pair. Iterating with zip
# keeps the loop in step with the lists instead of a hard-coded count of 5.
for split, seed in zip(splits, random_states):
    model, MSE = create_and_test_model(feature_set, prices, split, seed)
    print("The MSE for the split {} is {}".format(split, MSE))

The MSE for the split (0.9, 0.1) is 3215765127.6261983
The MSE for the split (0.8, 0.2) is 2230191638.0781784
The MSE for the split (0.7, 0.3) is 1763258783.6486988
The MSE for the split (0.6, 0.4) is 2377900151.789456
The MSE for the split (0.5, 0.5) is 2137031026.2583964


In [63]:
# Calculate the baseline for KNN model



In [64]:
# Create and test KNN model for 5 different train_test splits

In [65]:
# Boolean label column for the KNN classifier: True when YearBuilt is strictly
# greater than 1970, False otherwise (a house built in 1970 itself is False).
housing_data['After1970'] = housing_data['YearBuilt'].gt(1970)

In [69]:
# Predictor columns for the After1970 KNN classifier.
# TODO: more feature engineering is still needed here.
KNN_relevant_features = [
    'YearRemodAdd',
    'OverallQual',
    'GarageCars',
]

# Feature frame and single-column label frame, both taken from housing_data.
KNN_feature_set = housing_data.filter(items=KNN_relevant_features)
After1970 = housing_data.filter(items=['After1970'])

# Correlation strength for the features of interest was inspected with the
# snippet below (rowvar=False because columns are the features):
# X = pd.DataFrame(housing_data,columns=['YearRemodAdd', 'OverallQual', 'GarageCars','YearBuilt'])
# corr = np.corrcoef(X,rowvar=False)
# print (corr)

Correlations with YearBuilt are as follows: YearRemodAdd (.59), OverallQual (.57), GarageCars (.54)

In [79]:
def create_and_test_KNN_model(dataset, target, split, seed):
    """Fit a 1-nearest-neighbour classifier on a random partition and score it.

    Every row is assigned to the training set independently with probability
    ``split[0]``; the remaining rows form the test set. The realised train
    fraction is therefore only approximately ``split[0]``.

    Parameters
    ----------
    dataset : object supporting boolean-mask row selection (e.g. DataFrame)
        Feature rows.
    target : object supporting boolean-mask row selection (e.g. 1-D array)
        Class labels aligned row-for-row with ``dataset``.
    split : sequence
        Only ``split[0]`` (the train probability) is read.
    seed : int
        Seed for the random row assignment.

    Returns
    -------
    tuple
        ``(model, confusion)``: the fitted ``KNeighborsClassifier`` and the
        confusion matrix of its predictions on the held-out rows, computed as
        ``confusion_matrix(test_target, predictions)``.
    """
    # A private RandomState produces the same draws as np.random.seed(seed)
    # followed by np.random.rand(...), but does not reseed the global NumPy
    # RNG as a hidden side effect of calling this function.
    rng = np.random.RandomState(seed)
    in_train = rng.rand(len(dataset)) < split[0]

    train_features, test_features = dataset[in_train], dataset[~in_train]
    train_target, test_target = target[in_train], target[~in_train]

    model = KNeighborsClassifier(n_neighbors=1).fit(X=train_features, y=train_target)
    predictions = model.predict(X=test_features)
    confusion = confusion_matrix(test_target, predictions)

    return model, confusion

In [83]:
# Use the same splits and seeds as the linear regression above.
# np.ravel flattens the single-column label frame to a 1-D array (avoids a
# data type conversion warning); it does not change between iterations, so it
# is computed once before the loop.
after_1970_labels = np.ravel(After1970)

# Iterating with zip keeps the loop in step with the lists instead of a
# hard-coded count of 5.
for split, seed in zip(splits, random_states):
    model, confusion = create_and_test_KNN_model(KNN_feature_set, after_1970_labels, split, seed)
    print("Confusion matrix for split {} is: \n {}".format(split, confusion))

Confusion matrix for split (0.9, 0.1) is: 
 [[63 11]
 [ 9 68]]
Confusion matrix for split (0.8, 0.2) is: 
 [[116  23]
 [ 21 140]]
Confusion matrix for split (0.7, 0.3) is: 
 [[153  33]
 [ 31 204]]
Confusion matrix for split (0.6, 0.4) is: 
 [[217  27]
 [ 49 258]]
Confusion matrix for split (0.5, 0.5) is: 
 [[291  43]
 [ 39 357]]
