In [171]:
import seaborn as sns
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, preprocessing, svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from collections import Counter
import random

In [172]:
#import cleaned dataset from last project - https://github.com/SarahNadeau/datascience-projectB/blob/master/cleaned.csv
data = pd.DataFrame.from_csv('cleaned.csv')

In [173]:
# Decision Tree Model
# Take a look at the target variable distribution and calculate baseline
c = Counter(data['Neighborhood'])
num_col = len(c)
baseline = max(c.values()) / sum(c.values())
print(c)
print("Number of Neighborhood types: " + str(num_col))
print("Baseline accuracy: " + str(baseline))

Counter({'NAmes': 225, 'CollgCr': 149, 'OldTown': 113, 'Edwards': 100, 'Somerst': 86, 'Gilbert': 79, 'NridgHt': 76, 'Sawyer': 74, 'NWAmes': 73, 'SawyerW': 59, 'BrkSide': 58, 'Crawfor': 51, 'Mitchel': 49, 'NoRidge': 41, 'IDOTRR': 37, 'Timber': 37, 'ClearCr': 28, 'StoneBr': 25, 'SWISU': 25, 'MeadowV': 17, 'Blmngtn': 17, 'BrDale': 16, 'Veenker': 11, 'NPkVill': 9, 'Blueste': 2})
Number of Neighborhood types: 25
Baseline accuracy: 0.15442690459849004


In [174]:
# Encode the dataset with labels
enc = LabelEncoder()

for i in data.columns:
    data[i] = enc.fit_transform(data[i])

In [175]:
# Store target variable (Neighborhood) as y
y = data['Neighborhood']
X = data.drop('Neighborhood', axis=1)

In [176]:
splits = [0.1, 0.2, 0.3, 0.4, 0.5]
random_states = [421, 1337, 420, 90210, 4]

In [177]:
def create_and_test_decision_tree(dataset, target, test_size, seed):
    X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size = test_size, random_state=seed)
    model = DecisionTreeClassifier(max_leaf_nodes=100)
    model = model.fit(X=X_train, y=y_train)
    with open("decisiontree.txt", 'w') as f:
        export_graphviz(model, out_file=f, feature_names=list(X))
    tree_pred = model.predict(X_test)
    score = accuracy_score(y_true = y_test, y_pred = tree_pred)
    percent_imprv = 100 * (score - baseline) / baseline 
    return model, score, percent_imprv

In [178]:
# Create and test Decision Tree Classifierfor 5 different train_test splits
for i in range(5):
    test_size = splits[i]
    seed = random_states[i]
    model, score, imprv = create_and_test_decision_tree(X, y, test_size, seed)
    print("The decision tree for the train-test-split of {}-{} has an accuracy score of:\n{}"
          .format(1-test_size, test_size, score))
    print("Thus, this model is {:.2f}% more accurate than the baseline model".format(imprv))
    print("\n")

The decision tree for the train-test-split of 0.9-0.1 has an accuracy score of:
0.547945205479452
Thus, this model is 254.82% more accurate than the baseline model


The decision tree for the train-test-split of 0.8-0.2 has an accuracy score of:
0.5547945205479452
Thus, this model is 259.26% more accurate than the baseline model


The decision tree for the train-test-split of 0.7-0.3 has an accuracy score of:
0.58675799086758
Thus, this model is 279.96% more accurate than the baseline model


The decision tree for the train-test-split of 0.6-0.4 has an accuracy score of:
0.48027444253859347
Thus, this model is 211.00% more accurate than the baseline model


The decision tree for the train-test-split of 0.5-0.5 has an accuracy score of:
0.48148148148148145
Thus, this model is 211.79% more accurate than the baseline model




# SVM Model

In [179]:
# Store the target variable (HouseStyle) as Y - and encode its labels
data = pd.DataFrame.from_csv('cleaned.csv')

Y = data['HouseStyle']

encodings = {'1Story': 1, '1.5Unf': 2, '1.5Fin': 3, '2Story': 4, '2.5Unf': 5, '2.5Fin': 6, 'SFoyer': 7, 'SLvl': 8}
for key in encodings:
    Y = Y.replace(key, encodings[key])

In [180]:
# Perform feature engineering to determine features to include
X = pd.DataFrame()

data = pd.DataFrame.from_csv('cleaned.csv')

columns = data.columns
column_dtypes = data.dtypes
numerical_types = [np.int64, np.int32, np.float32, np.float64]
valid_col = [columns[i] for i in range(len(columns)) if column_dtypes[i] in numerical_types]
valid_data = pd.DataFrame(data, columns = valid_col)

In [181]:
# Calculate the baseline for SVM
SVM_baseline = max(Y.value_counts())/len(Y)
print('SVM baseline value:', SVM_baseline)

SVM baseline value: 0.497597803706


In [182]:
# Assign features with strong correlation to target variable to feature set X 
# Correlation cutoff arbitrarily set to .4

for column in valid_data.columns.values:
    corr = abs(valid_data[column].corr(Y))
    if corr > .4:
        print(column + ':', corr)
        X[column] = valid_data[column]

MSSubClass: 0.47217525491
2ndFlrSF: 0.546533790195


In [183]:
# Implement SVM model with default settings (RBF kerenel) 
model = svm.SVC()

In [184]:
for i in range(0,5):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=splits[i], random_state=random_states[i])
    model.fit(X_train, Y_train)
    model.predict(X_test)
    score = model.score(X_test, Y_test)
    improvement = 100*(score - SVM_baseline)/SVM_baseline
    print("The SVM for a train-test-split of {}-{} has an accuracy score of:\n{:.4f}"
          .format(1-splits[i], splits[i], score))
    print("Thus, this model is {:.2f}% more accurate than the baseline model."
          .format(improvement))
    print("\n")

The SVM for a train-test-split of 0.9-0.1 has an accuracy score of:
0.8973
Thus, this model is 80.32% more accurate than the baseline model.


The SVM for a train-test-split of 0.8-0.2 has an accuracy score of:
0.9110
Thus, this model is 83.07% more accurate than the baseline model.


The SVM for a train-test-split of 0.7-0.3 has an accuracy score of:
0.8904
Thus, this model is 78.94% more accurate than the baseline model.


The SVM for a train-test-split of 0.6-0.4 has an accuracy score of:
0.8834
Thus, this model is 77.53% more accurate than the baseline model.


The SVM for a train-test-split of 0.5-0.5 has an accuracy score of:
0.8765
Thus, this model is 76.15% more accurate than the baseline model.




# Logistic Regression Model

In [185]:
#import cleaned dataset from last project
data = pd.DataFrame.from_csv('cleaned.csv')

BsmtQual = data['BsmtQual']
encodings = {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NP' :1}
for key in encodings:
    BsmtQual = BsmtQual.replace(key, encodings[key])
data['BsmtQual'] = BsmtQual
    
ExterQual = data['ExterQual']
encodings = {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NP' :1}
for key in encodings:
    ExterQual = ExterQual.replace(key, encodings[key])
data['ExterQual'] = ExterQual
    
ExterCond = data['ExterCond']
encodings = {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NP' :1}
for key in encodings:
    ExterCond = ExterCond.replace(key, encodings[key])
data['ExterCond'] = ExterCond
    
BsmtCond = data['BsmtCond']
encodings = {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NP' :1}
for key in encodings:
    BsmtCond = BsmtCond.replace(key, encodings[key])
data['BsmtCond'] = BsmtCond
    
KitchenQual = data['KitchenQual']
encodings = {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NP' :1}
for key in encodings:
    KitchenQual = KitchenQual.replace(key, encodings[key])
data['KitchenQual'] = KitchenQual
    
GarageQual = data['GarageQual']
encodings = {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NP' :1}
for key in encodings:
    GarageQual = GarageQual.replace(key, encodings[key])
data['GarageQual'] = GarageQual

encoded = ['BsmtQual', 'ExterQual', 'ExterCond', 'BsmtCond', 'KitchenQual', 'GarageQual']

In [186]:
# Store target variable (qualAboveAverage) as y 
# Specifies if house quality is above agerage (1) or below average(0)
Y = (data['OverallQual'] > 5).astype(int)

columns = data.columns 
columns_dtypes = data.dtypes
numerical_types = [np.int64, np.int32, np.float32, np.float64]
valid_col = [columns[i] for i in range(len(columns)) if columns_dtypes[i] in numerical_types]
valid_col.remove('OverallQual')
valid_data = pd.DataFrame(data, columns = valid_col)

enc = LabelEncoder()
for i in valid_data.columns:
    valid_data[i] = enc.fit_transform(valid_data[i])

In [187]:
for column in valid_data.columns.values:
    corr = abs(valid_data[column].corr(Y))
    if corr > .4:
        print(column + ':', corr)
        X[column] = valid_data[column]

YearBuilt: 0.493020979675
YearRemodAdd: 0.437861757046
ExterQual: 0.506235761975
BsmtQual: 0.521354932126
GrLivArea: 0.490761427974
FullBath: 0.487962875878
KitchenQual: 0.46337623216
GarageCars: 0.469083851474
SalePrice: 0.636310119244


In [188]:
# Calculate the baseline for Logistic Regression model
logit_baseline = max(Y.value_counts())/len(Y)
print('Logistic Regression Baseline Value:', logit_baseline)

Logistic Regression Baseline Value: 0.631434454358


In [189]:
# build model and fit on train set
model = LogisticRegression()

for i in range(0,5):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=splits[i], random_state=random_states[i])
    model.fit(X_train, Y_train)
    score = model.score(X_test, Y_test)
    improvement = 100*(score - logit_baseline)/logit_baseline
    print("The Logistic Regression for a train-test-split of {}-{} has an accuracy score of:\n{:.4f}"
          .format(1-splits[i], splits[i], score))
    print("Thus, this model is {:.2f}% more accurate than the baseline model."
          .format(improvement))
    print("\n")

The Logistic Regression for a train-test-split of 0.9-0.1 has an accuracy score of:
0.8288
Thus, this model is 31.25% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.8-0.2 has an accuracy score of:
0.8425
Thus, this model is 33.42% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.7-0.3 has an accuracy score of:
0.8425
Thus, this model is 33.42% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.6-0.4 has an accuracy score of:
0.8508
Thus, this model is 34.74% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.5-0.5 has an accuracy score of:
0.8244
Thus, this model is 30.56% more accurate than the baseline model.


