In [125]:
import seaborn as sns
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix
import random
from scipy.stats import boxcox

In [126]:
housing_data = pd.DataFrame.from_csv('cleaned.csv')

In [127]:
# Calculate the baseline for linear regression
prices = housing_data.filter(['SalePrice'])
avg_price = prices.mean()

In [128]:
# Identify the relevant features

columns = housing_data.columns
column_dtypes = housing_data.dtypes
numerical_types = [np.int64, np.int32, np.float32, np.float64]

valid_col = [columns[i] for i in range(len(columns)) if column_dtypes[i] in numerical_types]
X = pd.DataFrame(housing_data, columns = valid_col)
corr = np.corrcoef(X, rowvar=False)

# Get SalePrice corrcoef with everything else
sale_price_corr_coef = corr[-1][:-1]

significant_col = []
significant_col_values = []
sig_threshold = 0.5 # arbitrarily setting it at >=0.5
print("Below is the correlation coefficients for SalePrice with each of the other numerical features:\n")
for i in range(len(sale_price_corr_coef)):
    corr_coef = sale_price_corr_coef[i]
    print("{}. {}: {}".format(i+1, valid_col[i], corr_coef))
    if abs(corr_coef) >= sig_threshold:
        significant_col.append(valid_col[i])
        significant_col_values.append(corr_coef)

print("\nBelow are the columns (and correlation coefficients) that have been deemed significant (absolute coefficient >= 0.5) and will be used " +
      "in the linear regression model")

for i in range(len(significant_col)):
    print("{}. {}: {}".format(i+1, significant_col[i], significant_col_values[i]))
    
feature_set = housing_data.filter(significant_col)



Below is the correlation coefficients for SalePrice with each of the other numerical features:

1. MSSubClass: -0.08353247161509611
2. LotFrontage: 0.209137300361708
3. LotArea: 0.2639554484435689
4. OverallQual: 0.7908701032855194
5. OverallCond: -0.07747076838683664
6. YearBuilt: 0.5229250946272646
7. YearRemodAdd: 0.507087387087986
8. BsmtFinSF1: 0.3858519610870544
9. BsmtFinSF2: -0.013811048393577966
10. BsmtUnfSF: 0.21285275404422782
11. TotalBsmtSF: 0.6150306002600914
12. 1stFlrSF: 0.6056811685298112
13. 2ndFlrSF: 0.32045452502532046
14. LowQualFinSF: -0.025516478206112737
15. GrLivArea: 0.7088612518306577
16. BsmtFullBath: 0.22645875233836776
17. BsmtHalfBath: -0.01664413437848131
18. FullBath: 0.5606660611742326
19. HalfBath: 0.2853242963661683
20. BedroomAbvGr: 0.16817449421857766
21. KitchenAbvGr: -0.13580936177277597
22. TotRmsAbvGrd: 0.5338292886465191
23. Fireplaces: 0.46676213136684097
24. GarageCars: 0.6399663562678245
25. GarageArea: 0.622866688949611
26. WoodDeckSF: 0.

In [129]:
splits = [0.1, 0.2, 0.3, 0.4, 0.5]
random_states = [42, 1337, 420, 90210, 24]

In [130]:
def create_and_test_model(dataset, target, test_size, seed):
    X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size = test_size, random_state=seed)
    model = LinearRegression()
    model = model.fit(X=X_train, y=y_train)
    predictions = model.predict(X=X_test)
    MSE = sklearn.metrics.mean_squared_error(y_true = y_test, y_pred = predictions)
    RMSE = np.sqrt(MSE)
    num_targets = len(y_test)
    baseline_pred = [avg_price] * num_targets
    BaselineMSE = sklearn.metrics.mean_squared_error(y_true = y_test, y_pred = baseline_pred)
    BaselineRMSE = np.sqrt(BaselineMSE)
    return model, RMSE, BaselineRMSE

In [131]:
# Create and test model for linear regression for 5 different train_test splits

In [132]:
for i in range(5):
    test_size = splits[i]
    seed = random_states[i]
    model, RMSE, BaselineRMSE = create_and_test_model(feature_set, prices, test_size, seed)
    print("The RMSE for the split {} is {}".format(test_size, RMSE))
    improvement = (BaselineRMSE - RMSE) * 100 / BaselineRMSE 
    print("This model performs {:.2f}% better than the baseline model".format(improvement))
    print("\n")

The RMSE for the split 0.1 is 31899.34418813559
This model performs 59.70% better than the baseline model


The RMSE for the split 0.2 is 34524.61572284951
This model performs 55.12% better than the baseline model


The RMSE for the split 0.3 is 41071.365101906216
This model performs 45.40% better than the baseline model


The RMSE for the split 0.4 is 38010.686585263145
This model performs 54.15% better than the baseline model


The RMSE for the split 0.5 is 42517.341371332455
This model performs 47.46% better than the baseline model




In [133]:
#add column 'After1970' to dataframe to specify if house was built after 1970 (True) or before (False)
housing_data['After1970'] = housing_data['YearBuilt'] > 1970

In [134]:
#Baseline for KNN:
#This should be the value that is most common in the dependent variable.
baseline = (housing_data['After1970'].describe()['freq'])/(housing_data['After1970'].describe()['count'])
print('KNN Baseline Value:', baseline)

KNN Baseline Value: 0.530542210021


In [135]:
# Create and test KNN model for 5 different train_test splits

In [136]:
#determine strength of correlation for features of interest
# X = pd.DataFrame(housing_data,columns=['YearRemodAdd', 'OverallQual', 'GarageCars','YearBuilt']) 
# corr = np.corrcoef(X,rowvar=False) #rowvar = False b/c columns are features
# print (corr)

# Identify the relevant features
KNN_relevant_features = ['YearRemodAdd', 'OverallQual', 'GarageCars']

# Normalize relevant features to between 0 and 1 for KNN distance calculation
KNN_feature_set = housing_data.filter(KNN_relevant_features)
KNN_feature_set['YearRemodAdd'] = (KNN_feature_set['YearRemodAdd'] - KNN_feature_set['YearRemodAdd'].min())/(KNN_feature_set['YearRemodAdd'].max() - KNN_feature_set['YearRemodAdd'].min())
KNN_feature_set['OverallQual'] = (KNN_feature_set['OverallQual'] - KNN_feature_set['OverallQual'].min())/(KNN_feature_set['OverallQual'].max() - KNN_feature_set['OverallQual'].min())
KNN_feature_set['GarageCars'] = (KNN_feature_set['GarageCars'] - KNN_feature_set['GarageCars'].min())/(KNN_feature_set['GarageCars'].max() - KNN_feature_set['GarageCars'].min())

After1970 = housing_data.filter(['After1970'])

Correlations with YearBuilt are as follows: YearRemodAdd (.59), OverallQual (.57), GarageCars (.54)

In [137]:
def create_and_test_KNN_model(dataset, target, test_size, seed):
    X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size = test_size, random_state=seed)
    model = KNeighborsClassifier(n_neighbors=5)
    model = model.fit(X=X_train, y=y_train)
    predictions = model.predict(X=X_test)
    confusion = confusion_matrix(y_test, predictions)
    
    return model, confusion

In [140]:
#use same splits as in linear regression above
#ravel converts column to 1_D array (avoids a data type conversion warning)

for i in range(0, 5):
    test_size = splits[i] 
    seed = random_states[i]
    model, confusion = create_and_test_KNN_model(KNN_feature_set, np.ravel(After1970), split, seed)
    print("Confusion matrix for split {} is: \n {}".format(test_size, confusion))
    TP = confusion[0,0]
    FN = confusion[1,0]
    FP = confusion[0,1]
    TN = confusion[1,1]
    accuracy = (TP + TN)/(TP + FN + FP + TN)
    improvement = (accuracy - baseline) * 100 / baseline 
    print("This model performs {:.2f}% better than the baseline model".format(improvement))
    print("\n")
    

Confusion matrix for split 0.1 is: 
 [[279  56]
 [ 27 367]]
This model performs 67.03% better than the baseline model


Confusion matrix for split 0.2 is: 
 [[298  43]
 [ 25 363]]
This model performs 70.90% better than the baseline model


Confusion matrix for split 0.3 is: 
 [[295  55]
 [ 23 356]]
This model performs 68.32% better than the baseline model


Confusion matrix for split 0.4 is: 
 [[310  42]
 [ 32 345]]
This model performs 69.35% better than the baseline model


Confusion matrix for split 0.5 is: 
 [[299  42]
 [ 43 345]]
This model performs 66.51% better than the baseline model


