# INFO 1998 Project B
## Caleb Chiam, Fareeza Hasan, and Sarah Nadeau

In [44]:
import seaborn as sns
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix
import random
from scipy.stats import boxcox

In [45]:
#import cleaned dataset from last project
housing_data = pd.DataFrame.from_csv('cleaned.csv')

## Linear Regression Model

In [46]:
# Calculate the baseline for linear regression
prices = housing_data.filter(['SalePrice'])
avg_price = prices.mean()

In [47]:
# Identify the relevant features

columns = housing_data.columns
column_dtypes = housing_data.dtypes
numerical_types = [np.int64, np.int32, np.float32, np.float64]

valid_col = [columns[i] for i in range(len(columns)) if column_dtypes[i] in numerical_types]
print(valid_col)
X = pd.DataFrame(housing_data, columns = valid_col)
corr = np.corrcoef(X, rowvar=False)

# Get SalePrice corrcoef with everything else
sale_price_corr_coef = corr[-1][:-1]

significant_col = []
significant_col_values = []
sig_threshold = 0.5 # arbitrarily setting it at >=0.5
print("Below is the correlation coefficients for SalePrice with each of the other numerical features:\n")
for i in range(len(sale_price_corr_coef)):
    corr_coef = sale_price_corr_coef[i]
    print("{}. {}: {}".format(i+1, valid_col[i], corr_coef))
    if abs(corr_coef) >= sig_threshold:
        significant_col.append(valid_col[i])
        significant_col_values.append(corr_coef)

print("\nBelow are the columns (and correlation coefficients) that have been deemed significant (absolute coefficient >= 0.5) and will be used " +
      "in the linear regression model")

for i in range(len(significant_col)):
    print("{}. {}: {}".format(i+1, significant_col[i], significant_col_values[i]))
    
feature_set = housing_data.filter(significant_col)



['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
Below is the correlation coefficients for SalePrice with each of the other numerical features:

1. MSSubClass: -0.08353247161509614
2. LotFrontage: 0.20913730036170808
3. LotArea: 0.26395544844356883
4. OverallQual: 0.790870103285519
5. OverallCond: -0.07747076838683668
6. YearBuilt: 0.5229250946272648
7. YearRemodAdd: 0.5070873870879856
8. BsmtFinSF1: 0.38585196108705444
9. BsmtFinSF2: -0.013811048393577989
10. BsmtUnfSF: 0.2128527540442278
11. TotalBsmtSF: 0.6150306002600916
12. 1stFlrSF: 0.6056811685298111


In [48]:
splits = [0.1, 0.2, 0.3, 0.4, 0.5]
random_states = [42, 1337, 420, 90210, 24]

In [49]:
def create_and_test_model(dataset, target, test_size, seed):
    X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size = test_size, random_state=seed)
    model = LinearRegression()
    model = model.fit(X=X_train, y=y_train)
    predictions = model.predict(X=X_test)
    MSE = sklearn.metrics.mean_squared_error(y_true = y_test, y_pred = predictions)
    RMSE = np.sqrt(MSE)
    num_targets = len(y_test)
    baseline_pred = [avg_price] * num_targets
    BaselineMSE = sklearn.metrics.mean_squared_error(y_true = y_test, y_pred = baseline_pred)
    BaselineRMSE = np.sqrt(BaselineMSE)
    return model, RMSE, BaselineRMSE

In [50]:
# Create and test model for linear regression for 5 different train_test splits

In [51]:
for i in range(5):
    test_size = splits[i]
    seed = random_states[i]
    model, RMSE, BaselineRMSE = create_and_test_model(feature_set, prices, test_size, seed)
    print("The RMSE for the split {} is {}".format(test_size, RMSE))
    improvement = (BaselineRMSE - RMSE) * 100 / BaselineRMSE 
    print("This model performs {:.2f}% better than the baseline model".format(improvement))
    print("\n")

The RMSE for the split 0.1 is 31899.344188135525
This model performs 59.70% better than the baseline model


The RMSE for the split 0.2 is 34524.61572284954
This model performs 55.12% better than the baseline model


The RMSE for the split 0.3 is 41071.36510190622
This model performs 45.40% better than the baseline model


The RMSE for the split 0.4 is 38010.6865852632
This model performs 54.15% better than the baseline model


The RMSE for the split 0.5 is 42517.34137133242
This model performs 47.46% better than the baseline model




## K - Nearest Neighbors Model

In [52]:
# Add target variable column 'After1970' to dataframe 
# Specifies if house was built after 1970 (1) or before (0)
housing_data['After1970'] = (housing_data['YearBuilt'] > 1970).astype(int)

In [53]:
# Calculate the baseline for KNN
baseline = housing_data['After1970'].describe()['mean']
print('KNN Baseline Value:', baseline)

KNN Baseline Value: 0.530542210021


In [54]:
def create_and_test_KNN_model(dataset, target, test_size, seed):
    X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size = test_size, random_state=seed)
    model = KNeighborsClassifier(n_neighbors=5)
    model = model.fit(X=X_train, y=y_train)
    predictions = model.predict(X=X_test)
    confusion = confusion_matrix(y_test, predictions)
    
    return model, confusion

In [55]:
# Identify the relevant features

# Inspected all valid numerical features (YearBuilt is excluded) for correlation with After1970
# Retained features with high correlation in KNN_relevant_features

valid_col.remove('YearBuilt')
valid_col.append('After1970')
X = pd.DataFrame(housing_data, columns = valid_col)
corr = np.corrcoef(X, rowvar=False)

after1970_corr_coef = corr[-1][:-1]
print(after1970_corr_coef)

significant_col = []
significant_col_values = []
sig_threshold = 0.5 # arbitrarily setting it at >=0.5
print("Below is the correlation coefficients for SalePrice with each of the other numerical features:\n")
for i in range(len(after1970_corr_coef)):
    corr_coef = after1970_corr_coef[i]
    print("{}. {}: {}".format(i+1, valid_col[i], corr_coef))
    if abs(corr_coef) >= sig_threshold:
        significant_col.append(valid_col[i])
        significant_col_values.append(corr_coef)

print("\nBelow are the columns (and correlation coefficients) that have been deemed significant (absolute coefficient >= 0.5) and will be used " +
      "in the linear regression model")

for i in range(len(significant_col)):
    print("{}. {}: {}".format(i+1, significant_col[i], significant_col_values[i]))
    
KNN_feature_set = housing_data.filter(significant_col)



# corr = np.corrcoef(X,rowvar=False) #rowvar = False b/c columns are features

KNN_relevant_features = ['YearRemodAdd', 'OverallQual', 'GarageCars','FullBath']

# Normalize relevant features to between 0 and 1 for KNN distance calculation
#KNN_feature_set = housing_data.filter(KNN_relevant_features)
KNN_feature_set['YearRemodAdd'] = (KNN_feature_set['YearRemodAdd'] - KNN_feature_set['YearRemodAdd'].min())/(KNN_feature_set['YearRemodAdd'].max() - KNN_feature_set['YearRemodAdd'].min())
KNN_feature_set['OverallQual'] = (KNN_feature_set['OverallQual'] - KNN_feature_set['OverallQual'].min())/(KNN_feature_set['OverallQual'].max() - KNN_feature_set['OverallQual'].min())
KNN_feature_set['GarageCars'] = (KNN_feature_set['GarageCars'] - KNN_feature_set['GarageCars'].min())/(KNN_feature_set['GarageCars'].max() - KNN_feature_set['GarageCars'].min())
KNN_feature_set['FullBath'] = (KNN_feature_set['FullBath'] - KNN_feature_set['FullBath'].min())/(KNN_feature_set['FullBath'].max() - KNN_feature_set['FullBath'].min())

After1970 = housing_data.filter(['After1970'])

[ 0.14715621 -0.00746384 -0.03101074  0.57309214 -0.30223309  0.59491449
  0.20601538 -0.11157267  0.13269904  0.30913261  0.1951212   0.14708876
 -0.12117517  0.25431315  0.13801007 -0.06082107  0.5114557   0.26401224
 -0.03629578 -0.11899307  0.14417634  0.13272778  0.49871043  0.42287349
  0.24892266  0.19849122 -0.27556487  0.01837421 -0.07161011  0.02192539
 -0.04267713 -0.00146904 -0.00809406  0.48378875]
Below is the correlation coefficients for SalePrice with each of the other numerical features:

1. MSSubClass: 0.14715621065093162
2. LotFrontage: -0.007463844971474709
3. LotArea: -0.031010744583879948
4. OverallQual: 0.5730921439851868
5. OverallCond: -0.3022330914318072
6. YearRemodAdd: 0.5949144882373884
7. BsmtFinSF1: 0.20601538219421847
8. BsmtFinSF2: -0.11157267269857586
9. BsmtUnfSF: 0.1326990420272374
10. TotalBsmtSF: 0.30913261386603597
11. 1stFlrSF: 0.19512120455917478
12. 2ndFlrSF: 0.14708876055601192
13. LowQualFinSF: -0.12117516554286616
14. GrLivArea: 0.2543131513

In [56]:
# Use same splits as in linear regression above
# Ravel converts column to 1_D array (avoids a data type conversion warning)

for i in range(0, 5):
    test_size = splits[i] 
    seed = random_states[i]
    model, confusion = create_and_test_KNN_model(KNN_feature_set, np.ravel(After1970), test_size, seed)
    print("Confusion matrix for split {} is: \n {}".format(test_size, confusion))
    TP = confusion[0,0]
    FN = confusion[1,0]
    FP = confusion[0,1]
    TN = confusion[1,1]
    accuracy = (TP + TN)/(TP + FN + FP + TN)
    improvement = (accuracy - baseline) * 100 / baseline 
    print("This model performs {:.2f}% better than the baseline model".format(improvement))
    print("\n")
    

Confusion matrix for split 0.1 is: 
 [[47 16]
 [18 65]]
This model performs 44.59% better than the baseline model


Confusion matrix for split 0.2 is: 
 [[ 96  28]
 [ 44 124]]
This model performs 42.01% better than the baseline model


Confusion matrix for split 0.3 is: 
 [[147  60]
 [ 56 175]]
This model performs 38.57% better than the baseline model


Confusion matrix for split 0.4 is: 
 [[196  83]
 [ 74 230]]
This model performs 37.73% better than the baseline model


Confusion matrix for split 0.5 is: 
 [[270  71]
 [ 89 299]]
This model performs 47.12% better than the baseline model


