In [682]:
import seaborn as sns
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, preprocessing, svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from collections import Counter
import random

In [683]:
# Import the cleaned dataset from the last project -
# https://github.com/SarahNadeau/datascience-projectB/blob/master/cleaned.csv
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults (first column
# becomes the index, and date parsing is attempted on it).
data = pd.read_csv('cleaned.csv', index_col=0, parse_dates=True)

In [684]:
# Decision Tree Model
# Tally how often each Neighborhood value occurs. The share of rows held by the
# most frequent value is the accuracy of always guessing that value, which is
# used as the baseline the decision tree has to beat.
c = Counter(data['Neighborhood'])
num_col = len(c)
class_counts = c.values()
baseline = max(class_counts) / sum(class_counts)

print(c)
print("Number of Neighborhood types: {}".format(num_col))
print("Baseline accuracy: {}".format(baseline))

Counter({'NAmes': 225, 'CollgCr': 149, 'OldTown': 113, 'Edwards': 100, 'Somerst': 86, 'Gilbert': 79, 'NridgHt': 76, 'Sawyer': 74, 'NWAmes': 73, 'SawyerW': 59, 'BrkSide': 58, 'Crawfor': 51, 'Mitchel': 49, 'NoRidge': 41, 'IDOTRR': 37, 'Timber': 37, 'ClearCr': 28, 'StoneBr': 25, 'SWISU': 25, 'MeadowV': 17, 'Blmngtn': 17, 'BrDale': 16, 'Veenker': 11, 'NPkVill': 9, 'Blueste': 2})
Number of Neighborhood types: 25
Baseline accuracy: 0.15442690459849004


In [685]:
# Replace the contents of every column with integer label codes, in place.
# A single encoder is refit per column, so after the loop `enc` only remembers
# the mapping of the last column.
# NOTE(review): the loop covers all columns, so numeric columns are turned into
# codes as well - confirm that this is intended.
enc = LabelEncoder()

for column_name in data.columns:
    encoded_values = enc.fit_transform(data[column_name])
    data[column_name] = encoded_values

In [686]:
# Features are every column except the target; the target (Neighborhood) is y
X = data.drop(labels=['Neighborhood'], axis='columns')
y = data['Neighborhood']

In [687]:
# Test-set fractions to evaluate. Entry k is used together with seed k from
# `random_states`, so the two lists are read in parallel by the model loops.
splits = [
    0.1,
    0.2,
    0.3,
    0.4,
    0.5,
]
# One train_test_split seed per entry in `splits`
random_states = [
    42,
    1337,
    420,
    90210,
    24,
]

In [688]:
def create_and_test_decision_tree(dataset, target, test_size, seed):
    """Fit a decision tree on one train/test split and score it on the test part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns. Its column names are used to label the exported tree;
        if the object has no ``columns`` attribute the export falls back to
        scikit-learn's generic feature names.
    target : array-like
        Class labels, one per row of ``dataset``.
    test_size : float
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int
        Used as ``random_state`` for both the split and the tree, so repeated
        calls with the same arguments give the same model and score.

    Returns
    -------
    tuple
        ``(model, score, percent_imprv)``: the fitted classifier, its accuracy
        on the test part, and the relative gain over the baseline in percent.

    Notes
    -----
    Reads the module-level ``baseline`` and overwrites ``decisiontree.txt`` in
    the working directory on every call.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        dataset, target, test_size=test_size, random_state=seed)

    # Seed the tree as well: without random_state the fitted tree (and so the
    # reported score) can differ between runs.
    model = DecisionTreeClassifier(max_leaf_nodes=100, random_state=seed)
    model.fit(X=X_train, y=y_train)

    # Label nodes with the columns of the data actually passed in, not with
    # whatever the global X happens to hold when the function is called.
    columns = getattr(dataset, "columns", None)
    feature_names = list(columns) if columns is not None else None
    with open("decisiontree.txt", 'w') as f:
        export_graphviz(model, out_file=f, feature_names=feature_names)

    tree_pred = model.predict(X_test)
    score = accuracy_score(y_true=y_test, y_pred=tree_pred)
    # Relative improvement over always predicting the most common class
    percent_imprv = 100 * (score - baseline) / baseline
    return model, score, percent_imprv

In [689]:
# Create and test a Decision Tree Classifier for 5 different train-test splits.
# The first five split sizes are paired position-by-position with the seeds.
for test_size, seed in zip(splits[:5], random_states[:5]):
    model, score, imprv = create_and_test_decision_tree(X, y, test_size, seed)

    accuracy_msg = (
        "The decision tree for the train-test-split of {}-{} has an accuracy score of:\n{}"
    ).format(1-test_size, test_size, score)
    improvement_msg = (
        "Thus, this model is {:.2f}% more accurate than the baseline model"
    ).format(imprv)

    print(accuracy_msg)
    print(improvement_msg)
    print("\n")

The decision tree for the train-test-split of 0.9-0.1 has an accuracy score of:
0.5958904109589042
Thus, this model is 285.87% more accurate than the baseline model


The decision tree for the train-test-split of 0.8-0.2 has an accuracy score of:
0.547945205479452
Thus, this model is 254.82% more accurate than the baseline model


The decision tree for the train-test-split of 0.7-0.3 has an accuracy score of:
0.593607305936073
Thus, this model is 284.39% more accurate than the baseline model


The decision tree for the train-test-split of 0.6-0.4 has an accuracy score of:
0.4734133790737564
Thus, this model is 206.56% more accurate than the baseline model


The decision tree for the train-test-split of 0.5-0.5 has an accuracy score of:
0.5116598079561042
Thus, this model is 231.33% more accurate than the baseline model




# SVM Model

In [690]:
# The SVM predicts HouseStyle; keep that column as the target Y
Y = data.loc[:, 'HouseStyle']

In [691]:
# Baseline for the SVM: the fraction of rows that carry the most common
# HouseStyle value, i.e. the accuracy of always predicting that value.
SVM_baseline = Y.value_counts().max() / len(Y)
print('SVM baseline value:', SVM_baseline)

SVM baseline value: 0.497597803706


In [692]:
# Perform feature engineering to determine features to include.
# A column is kept when the absolute value of its correlation with HouseStyle
# is above the cutoff; the cutoff of .4 was chosen arbitrarily.
# NOTE(review): the correlations are computed on all rows of `data`, before any
# train/test split is made - confirm this is acceptable for the evaluation.
HOUSE_STYLE_CORR_CUTOFF = .4

house_style = data['HouseStyle']
selected = []
for column in data.columns.values:
    if column == 'HouseStyle':
        # The target itself is never a feature
        continue
    # Compute the correlation once and reuse it for both the test and the print
    corr = data[column].corr(house_style)
    if abs(corr) > HOUSE_STYLE_CORR_CUTOFF:
        print(column + ':', corr)
        selected.append(column)

# Build the feature set in one step instead of growing a frame column by column
X = data[selected].copy()

MSSubClass: 0.445623148876
2ndFlrSF: 0.510942821246
HalfBath: 0.413121689392


In [693]:
# Support vector classifier using the RBF kernel (the library default, spelled
# out here); every other hyperparameter is left at its default.
model = svm.SVC(kernel='rbf')

In [694]:
# Fit and score the SVM on the first five split-size / seed pairs.
for test_size, seed in zip(splits[:5], random_states[:5]):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)
    model.fit(X_train, Y_train)
    # model.score predicts on X_test itself, so the separate predict call whose
    # result was thrown away has been removed.
    score = model.score(X_test, Y_test)
    # Relative improvement over always predicting the most common class
    improvement = 100*(score - SVM_baseline)/SVM_baseline
    print("The SVM for a train-test-split of {}-{} has an accuracy score of:\n{:.4f}"
          .format(1-test_size, test_size, score))
    print("Thus, this model is {:.2f}% more accurate than the baseline model."
          .format(improvement))
    print("\n")

The SVM for a train-test-split of 0.9-0.1 has an accuracy score of:
0.9247
Thus, this model is 85.82% more accurate than the baseline model.


The SVM for a train-test-split of 0.8-0.2 has an accuracy score of:
0.9281
Thus, this model is 86.51% more accurate than the baseline model.


The SVM for a train-test-split of 0.7-0.3 has an accuracy score of:
0.9087
Thus, this model is 82.61% more accurate than the baseline model.


The SVM for a train-test-split of 0.6-0.4 has an accuracy score of:
0.9057
Thus, this model is 82.01% more accurate than the baseline model.


The SVM for a train-test-split of 0.5-0.5 has an accuracy score of:
0.8930
Thus, this model is 79.46% more accurate than the baseline model.




# Logistic Regression Model

In [695]:
# Re-import the cleaned dataset from the last project so this section starts
# from the original, un-encoded values.
# DataFrame.from_csv was deprecated and later removed from pandas; read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults.
data = pd.read_csv('cleaned.csv', index_col=0, parse_dates=True)

In [696]:
# Binary target for the logistic regression: 1 where OverallQual is greater
# than 5, 0 otherwise. It is kept as a standalone Series (Y); no column is
# added to the dataframe.
Y = data['OverallQual'].gt(5).astype(int)

# Start from an empty feature frame
X = pd.DataFrame()

# Baseline: the fraction of rows in the more common of the two classes
logit_baseline = Y.value_counts().max() / len(Y)
print('Logistic Regression Baseline Value:', logit_baseline)

Logistic Regression Baseline Value: 0.631434454358


In [697]:
# Print BsmtQual's value counts before and after encoding so the mapping from
# category to integer code can be read off the two outputs.
print(Counter(data['BsmtQual']))

# Replace the contents of every column with integer label codes
enc = LabelEncoder()
for column_name in data.columns:
    data[column_name] = enc.fit_transform(data[column_name])

# OverallQual is what the target Y was derived from, so it is removed from the
# candidate features. errors="ignore" keeps this cell re-runnable once the
# column has already been dropped (a plain drop would raise KeyError).
data = data.drop("OverallQual", axis=1, errors="ignore")
print(Counter(data['BsmtQual']))

# Keep every column whose absolute correlation with Y exceeds the .4 cutoff
selected = []
for column in data.columns.values:
    corr = abs(data[column].corr(Y))
    if corr > .4:
        print(column + ':', corr)
        selected.append(column)

# Rebuild X from scratch so it holds exactly the columns selected above, with
# nothing left over from an earlier run of this cell.
X = data[selected].copy()

Counter({'TA': 649, 'Gd': 615, 'Ex': 121, 'NP': 37, 'Fa': 35})
Counter({4: 649, 2: 615, 0: 121, 3: 37, 1: 35})
YearBuilt: 0.493020979675
YearRemodAdd: 0.437861757046
ExterQual: 0.415983469238
BsmtQual: 0.453880347052
GrLivArea: 0.490761427974
FullBath: 0.487962875878
GarageCars: 0.469083851474
SalePrice: 0.636310119244


In [698]:
model = LogisticRegression()

In [699]:
# Fit and score the logistic regression on the first five split-size / seed
# pairs, reporting accuracy and the relative gain over the baseline.
for test_size, seed in zip(splits[:5], random_states[:5]):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)
    model.fit(X_train, Y_train)
    score = model.score(X_test, Y_test)

    # Calculate % improvement over the baseline
    improvement = 100*(score - logit_baseline)/logit_baseline

    accuracy_msg = (
        "The Logistic Regression for a train-test-split of {}-{} has an accuracy score of:\n{:.4f}"
    ).format(1-test_size, test_size, score)
    improvement_msg = (
        "Thus, this model is {:.2f}% more accurate than the baseline model."
    ).format(improvement)

    print(accuracy_msg)
    print(improvement_msg)
    print("\n")

The Logistic Regression for a train-test-split of 0.9-0.1 has an accuracy score of:
0.8699
Thus, this model is 37.76% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.8-0.2 has an accuracy score of:
0.8527
Thus, this model is 35.05% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.7-0.3 has an accuracy score of:
0.8425
Thus, this model is 33.42% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.6-0.4 has an accuracy score of:
0.8439
Thus, this model is 33.65% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.5-0.5 has an accuracy score of:
0.8326
Thus, this model is 31.87% more accurate than the baseline model.


