In [9]:
# import necessary packages
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [10]:
#import cleaned dataset from last project
data = pd.DataFrame.from_csv('cleaned.csv')

# convert all features to categorical integer values
enc = LabelEncoder()
for i in data.columns:
    data[i] = enc.fit_transform(data[i])

In [11]:
# Add target variable column 'qualAboveAverage' to dataframe 
# Specifies if house quality is above agerage (1) or below average(0)
data['qualAboveAverage'] = (data['OverallQual'] > 5).astype(int)

In [12]:
# Store target variable (qualAboveAverage) as y
Y = data['qualAboveAverage']
        
# Calculate the baseline for Logistic Regression model
logit_baseline = max(Y.value_counts())/len(Y)
print('Logistic Regression Baseline Value:', logit_baseline)

Logistic Regression Baseline Value: 0.625257378174


In [19]:
#To choose appropriate features, correlation coeffient was used to choose columns. However, it did not yield  
#a desirable output. Therefore, the method below was used to find suitable features.
# Limit the feature set to intuitively relevant features 
#Houses that are newer are better quality; good neighborhood - better quality houses; better quality houses are 
#generally sold at higher prices.
relevant = ['YearRemodAdd', 'YearBuilt', 'MSZoning', 'Neighborhood','SalePrice', 'Condition1', 'Condition2']
X = data.filter(relevant)
X = data.drop(['OverallQual', 'qualAboveAverage', 'LandSlope', 'RoofStyle', 'Fireplaces'], axis=1)
print(X)


      MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \
Id                                                                          
1              5         3           37      327       1      1         3   
2              0         3           52      498       1      1         3   
3              5         3           40      700       1      1         0   
4              6         3           32      489       1      1         0   
5              5         3           56      922       1      1         0   
6              4         3           57      912       1      1         0   
7              0         3           47      551       1      1         3   
8              5         3            0      592       1      1         0   
9              4         4           23      138       1      1         3   
10            14         3           22      222       1      1         3   
11             0         3           42      689       1      1         3   

In [14]:
splits = [0.1, 0.2, 0.3, 0.4, 0.5]
random_states = [98, 1560, 999, 105002, 77393]

In [15]:
# build model and fit on train set
model = LogisticRegression()

for i in range(0,5):
    # Split dataset into training and test data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=splits[i], random_state=random_states[i])
    # Fit the model to the training data
    model.fit(X_train, Y_train)
    # Predict on the testing data
    model.predict(X_test)
    # Test the score of the model
    score = model.score(X_test, Y_test)
    # Calculate % improvement over the baseline
    improvement = 100*(score - logit_baseline)/logit_baseline
    print("The Logistic Regression for a train-test-split of {}-{} has an accuracy score of:\n{:.4f}"
          .format(1-splits[i], splits[i], score))
    print("Thus, this model is {:.2f}% more accurate than the baseline model."
          .format(improvement))
    print("\n")

The Logistic Regression for a train-test-split of 0.9-0.1 has an accuracy score of:
1.0000
Thus, this model is 59.93% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.8-0.2 has an accuracy score of:
0.9966
Thus, this model is 59.39% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.7-0.3 has an accuracy score of:
1.0000
Thus, this model is 59.93% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.6-0.4 has an accuracy score of:
0.9983
Thus, this model is 59.66% more accurate than the baseline model.


The Logistic Regression for a train-test-split of 0.5-0.5 has an accuracy score of:
0.9986
Thus, this model is 59.71% more accurate than the baseline model.


