In [1]:
import seaborn as sns
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, preprocessing, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from collections import Counter
import random

In [2]:
# Import the cleaned dataset produced by the previous project.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv
# with index_col=0 keeps the first CSV column as the row index, as from_csv did.
# from_csv also tried to parse the index as dates (parse_dates=True); the index
# here is the integer Id column, so that option is not carried over.
data = pd.read_csv('cleaned.csv', index_col=0)

In [3]:
# Decision Tree Model
# Inspect how the target classes are distributed and derive the baseline:
# the accuracy obtained by always predicting the most frequent Neighborhood.
c = Counter(data['Neighborhood'])
num_col = len(c)
total_rows = sum(c.values())
# most_common(1) yields a one-element list [(label, count)] for the majority class
majority_count = c.most_common(1)[0][1]
baseline = majority_count / total_rows
print(c)
print("Number of Neighborhood types: " + str(num_col))
print("Baseline accuracy: " + str(baseline))

Counter({'NAmes': 225, 'CollgCr': 149, 'OldTown': 113, 'Edwards': 100, 'Somerst': 86, 'Gilbert': 79, 'NridgHt': 76, 'Sawyer': 74, 'NWAmes': 73, 'SawyerW': 59, 'BrkSide': 58, 'Crawfor': 51, 'Mitchel': 49, 'NoRidge': 41, 'IDOTRR': 37, 'Timber': 37, 'ClearCr': 28, 'StoneBr': 25, 'SWISU': 25, 'MeadowV': 17, 'Blmngtn': 17, 'BrDale': 16, 'Veenker': 11, 'NPkVill': 9, 'Blueste': 2})
Number of Neighborhood types: 25
Baseline accuracy: 0.15442690459849004


In [4]:
# Replace every column of `data`, in place, with its label-encoded values.
# The single encoder is refit for each column, so once the loop finishes `enc`
# only holds the mapping of the last column processed.
enc = LabelEncoder()

for column_name in data.columns:
    encoded_values = enc.fit_transform(data[column_name])
    data[column_name] = encoded_values

In [5]:
# Target variable: the Neighborhood column
y = data['Neighborhood']

# Idea behind a hand-picked feature subset: houses in the same neighborhood were
# likely built in the same period, have similar prices, share a zone, etc.
# Narrower feature sets are left here, disabled, for reference.
# NOTE(review): in the list below 'Street' 'Sale' has no comma between the two
# strings, so Python would join them into 'StreetSale' if this were re-enabled.
#relevant = ['MSSubClass', 'YearBuilt', 'LotArea', 'MSZoning', 'LotFrontage', 'LotArea', 'Street' 'Sale', 'BldgType']
#X = data.filter(relevant)
#X = data.drop(['Neighborhood', 'Utilities', 'LandSlope', 'Condition1', 'Condition2'], axis=1)

# Active choice: use every column except the target as a feature.
X = data.drop(labels=['Neighborhood'], axis=1)
print(X)

      MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
Id                                                                          
1              5         3           37      327       1      1         3   
2              0         3           52      498       1      1         3   
3              5         3           40      700       1      1         0   
4              6         3           32      489       1      1         0   
5              5         3           56      922       1      1         0   
6              4         3           57      912       1      1         0   
7              0         3           47      551       1      1         3   
8              5         3            0      592       1      1         0   
9              4         4           23      138       1      1         3   
10            14         3           22      222       1      1         3   
11             0         3           42      689       1      1         3   

In [6]:
# Test-set fractions to evaluate, and the random seed paired with each one.
# The evaluation loop indexes both lists with the same position, so they must
# stay the same length.
splits = [0.1, 0.2, 0.3, 0.4, 0.5]
random_states = [42, 1337, 420, 90210, 24]

In [7]:
def create_and_test_decision_tree(dataset, target, test_size, seed):
    """Fit a decision tree on one train/test split and score it.

    Parameters
    ----------
    dataset : feature matrix accepted by ``train_test_split``; when it exposes
        ``.columns`` (e.g. a DataFrame) those names label the exported tree.
    target : class label for each row of ``dataset``.
    test_size : forwarded to ``train_test_split`` as the test-set size.
    seed : seeds both the split and the tree, so repeated calls with the same
        arguments give the same model and score.

    Returns
    -------
    tuple ``(model, score, percent_imprv)``: the fitted classifier, its accuracy
    on the held-out rows, and the relative gain in percent over the
    notebook-level ``baseline`` accuracy.

    Side effect: writes the fitted tree in Graphviz format to
    ``decisiontree.txt``, overwriting whatever an earlier call left there.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        dataset, target, test_size=test_size, random_state=seed)

    # Seed the classifier as well: without random_state the fitted tree, and
    # therefore the reported accuracy, can differ from run to run.
    model = DecisionTreeClassifier(max_leaf_nodes=100, random_state=seed)
    model.fit(X=X_train, y=y_train)

    # Take feature names from the data actually used for training rather than
    # from the global X, which is rebound later in the notebook.
    columns = getattr(X_train, 'columns', None)
    feature_names = list(columns) if columns is not None else None
    with open("decisiontree.txt", 'w') as f:
        export_graphviz(model, out_file=f, feature_names=feature_names)

    tree_pred = model.predict(X_test)
    score = accuracy_score(y_true=y_test, y_pred=tree_pred)
    # `baseline` is the majority-class accuracy computed earlier in the notebook.
    percent_imprv = 100 * (score - baseline) / baseline
    return model, score, percent_imprv

In [8]:
# Create and test a Decision Tree Classifier for 5 different train/test splits,
# pairing each test fraction with its own random seed.
for test_size, seed in zip(splits, random_states):
    model, score, imprv = create_and_test_decision_tree(X, y, test_size, seed)
    train_size = 1-test_size
    print("The decision tree for the train-test-split of {}-{} has an accuracy score of:\n{}"
          .format(train_size, test_size, score))
    print("Thus, this model is {:.2f}% more accurate than the baseline model".format(imprv))
    print("\n")

NameError: name 'prices' is not defined

# SVM Model

In [9]:
# Store the target variable (HouseStyle) as Y.
# `data` has already been label-encoded in place at this point, so Y holds the
# encoded values; the capital Y keeps it distinct from the decision-tree target `y`.
Y = data['HouseStyle']

In [10]:
# Baseline for the SVM: the share of rows belonging to the most frequent
# HouseStyle value, i.e. the accuracy of always predicting that value.
style_counts = Y.value_counts()
SVM_baseline = style_counts.max() / len(Y)
print('SVM baseline value:', SVM_baseline)

SVM baseline value: 0.497597803706


In [11]:
# Perform feature engineering to determine features to include.
# NOTE: this rebinds X, which earlier in the notebook held the decision-tree
# feature matrix.
X = pd.DataFrame()

# Keep every column whose correlation with the target exceeds the cutoff.
# The cutoff is arbitrary; the comparison is on the signed coefficient, so only
# positively correlated columns are selected.
CORR_CUTOFF = .4
house_style = data['HouseStyle']

for column in data.columns.values:
    if column == 'HouseStyle':
        continue  # the target is never one of its own features
    # Compute the coefficient once and reuse it for the test and the report.
    correlation = data[column].corr(house_style)
    if correlation > CORR_CUTOFF:
        print(column + ':', correlation)
        X[column] = data[column]

MSSubClass: 0.445623148876
2ndFlrSF: 0.510942821246
HalfBath: 0.413121689392


In [14]:
# Alternative kept for reference: SVM model with a linear kernel
# model = svm.SVC(kernel='linear', gamma=1)

# Active choice: SVM model with the default settings (RBF kernel).
# NOTE: this rebinds `model`, the name the decision-tree loop also assigns to.
model = svm.SVC()

In [15]:
# Define training/test splits and the random states used to generate them
# (paired by position: each test fraction has its own seed).
splits = [0.1, 0.2, 0.3, 0.4, 0.5]
random_states = [42, 1337, 420, 90210, 24]

for test_fraction, split_seed in zip(splits, random_states):
    # Split dataset into training and test data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_fraction, random_state=split_seed)
    # Fit the model to the training data (refitting replaces the previous fit)
    model.fit(X_train, Y_train)
    # score() predicts on X_test itself and returns the mean accuracy, so a
    # separate predict() call whose result is discarded is not needed.
    score = model.score(X_test, Y_test)
    # Calculate % improvement over the baseline
    improvement = 100*(score - SVM_baseline)/SVM_baseline
    print("The SVM for a train-test-split of {}-{} has an accuracy score of:\n{:.4f}"
          .format(1-test_fraction, test_fraction, score))
    print("Thus, this model is {:.2f}% more accurate than the baseline model."
          .format(improvement))
    print("\n")

The SVM for a train-test-split of 0.9-0.1 has an accuracy score of:
0.9247
Thus, this model is 85.82% more accurate than the baseline model.


The SVM for a train-test-split of 0.8-0.2 has an accuracy score of:
0.9281
Thus, this model is 86.51% more accurate than the baseline model.


The SVM for a train-test-split of 0.7-0.3 has an accuracy score of:
0.9087
Thus, this model is 82.61% more accurate than the baseline model.


The SVM for a train-test-split of 0.6-0.4 has an accuracy score of:
0.9057
Thus, this model is 82.01% more accurate than the baseline model.


The SVM for a train-test-split of 0.5-0.5 has an accuracy score of:
0.8930
Thus, this model is 79.46% more accurate than the baseline model.




For our SVM model, we identified candidate feature columns that correlate well with the target variable, HouseStyle. We tried correlation thresholds of 0.2 (10 features) and 0.4 (3 features). The model performed very poorly with a threshold of 0.2 (about -2% to 10% improvement over the baseline), while a threshold of 0.4 produced acceptable results. Additionally, both linear and RBF kernels were tried for the SVM model. Results improved from about 69% to 76% improvement over the baseline with a linear kernel to about 79% to 87% improvement with an RBF kernel (accuracy scores of roughly 0.89 to 0.93, as shown in the output above).