In [None]:
# Basic Libraries
# numpy  -> reshaping the confusion-matrix annotation labels
# pandas -> loading train.csv and building the one-column frames
# seaborn / pyplot -> every chart in this notebook
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
# NOTE(review): recent seaborn releases document `set` as an alias of
# `set_theme` -- confirm against the installed version before renaming.
sb.set() # set the default Seaborn style for graphics

In [None]:
# Load the housing data; keep the response (CentralAir) as a one-column frame
houseData = pd.read_csv("train.csv")
Central = houseData["CentralAir"].to_frame()

In [None]:
# problem 1
# Predictor for this section: SalePrice, as a one-column frame
Sale = houseData["SalePrice"].to_frame()

In [None]:
# problem 1 part a
# Count plot of the CentralAir classes, then the class counts as a table
sb.catplot(data=Central, y="CentralAir", kind="count")
Central.CentralAir.value_counts()

In [None]:
# problem 1 part b
# Put SalePrice and CentralAir side by side in one frame
CentralSale = pd.concat([Sale, Central], axis="columns").reindex(Sale.index)

# Horizontal boxplot: SalePrice spread within each CentralAir class
f = plt.figure(figsize=(18, 6))
sb.boxplot(data=CentralSale, x="SalePrice", y="CentralAir", orient="h")

In [None]:
# problem 1 part c
from sklearn.tree import DecisionTreeClassifier

In [None]:
# problem 1 part d
# Positional hold-out split: the first 1100 rows train the model and the
# last 360 rows test it. The Central_* frames built here are reused as the
# response by every later section.
# Train Set : 1100 samples
Central_train = pd.DataFrame(Central[:1100])
Sale_train = pd.DataFrame(Sale[:1100])

# Test Set : 360 samples
Central_test = pd.DataFrame(Central[-360:])
Sale_test = pd.DataFrame(Sale[-360:])

# The slice sizes are hard-coded, so fail loudly if the dataset is not the
# size they assume: a shorter file would put rows in both train and test
# (leakage), a longer one would silently drop rows.
assert len(Central_train) + len(Central_test) == len(Central), \
    "train/test slices do not partition CentralAir; check the dataset size"
assert len(Sale_train) + len(Sale_test) == len(Sale), \
    "train/test slices do not partition SalePrice; check the dataset size"
# Predictor and response must describe the same rows.
assert Sale_train.index.equals(Central_train.index), "train rows are misaligned"
assert Sale_test.index.equals(Central_test.index), "test rows are misaligned"

# Check the sample sizes
print("Train Set :", Central_train.shape, Sale_train.shape)
print("Test Set  :", Central_test.shape, Sale_test.shape)

In [None]:
# problem 1 part e
# Depth-2 classification tree: SalePrice -> CentralAir, fitted on the train rows
dectree = DecisionTreeClassifier(max_depth=2)
dectree.fit(X=Sale_train, y=Central_train)

In [None]:
# problem 1 part f
# Draw the fitted SalePrice tree
from sklearn.tree import plot_tree

f = plt.figure(figsize=(12, 12))
plot_tree(
    dectree,
    feature_names=["SalePrice"],
    class_names=["No", "Yes"],
    filled=True,
    rounded=True,
)

In [None]:
# problem 1 part g
from sklearn.metrics import confusion_matrix

# Predicted CentralAir for the SalePrice train rows
Central_train_pred = dectree.predict(Sale_train)

# Two-way confusion matrix (actual vs predicted) as a count heatmap
sb.heatmap(
    confusion_matrix(Central_train, Central_train_pred),
    annot=True,
    fmt=".0f",
    annot_kws={"size": 18},
)

In [None]:
# problem 1 part h
# Training accuracy of the SalePrice tree
print("Classification Accuracy \t:", dectree.score(Sale_train, Central_train))

# Cell captions: quadrant name plus that quadrant's share of the train rows
group_names = ['True Neg','False Pos', 'False Neg', 'True Pos']
train_cm = confusion_matrix(Central_train, Central_train_pred)
group_percentages = [f"{share:.2%}" for share in train_cm.flatten() / len(Central_train)]
labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)

# Confusion-matrix heatmap annotated with the captions
sb.heatmap(train_cm, annot=labels, fmt="", annot_kws={"size": 18})

In [None]:
# problem 1 part i
from sklearn.metrics import confusion_matrix

# Predicted CentralAir for the held-out SalePrice rows
Central_test_pred = dectree.predict(Sale_test)

# Two-way confusion matrix (actual vs predicted) as a count heatmap
sb.heatmap(
    confusion_matrix(Central_test, Central_test_pred),
    annot=True,
    fmt=".0f",
    annot_kws={"size": 18},
)

In [None]:
# problem 1 part j
# Test accuracy of the SalePrice tree
print("Classification Accuracy \t:", dectree.score(Sale_test, Central_test))

# Cell captions: quadrant name plus that quadrant's share of the test rows
group_names = ['True Neg','False Pos', 'False Neg', 'True Pos']
test_cm = confusion_matrix(Central_test, Central_test_pred)
group_percentages = [f"{share:.2%}" for share in test_cm.flatten() / len(Central_test)]
labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)

# Confusion-matrix heatmap annotated with the captions
sb.heatmap(test_cm, annot=labels, fmt="", annot_kws={"size": 18})

In [None]:
# problem 2 for GrLivArea
# Predictor for this section: GrLivArea, as a one-column frame
GrLiv = houseData["GrLivArea"].to_frame()

In [None]:
# problem 2 for GrLivArea part a
# Count plot of the CentralAir classes, then the class counts as a table
sb.catplot(data=Central, y="CentralAir", kind="count")
Central.CentralAir.value_counts()

In [None]:
# problem 2 for GrLivArea part b
# Put GrLivArea and CentralAir side by side in one frame
CentralGrLiv = pd.concat([GrLiv, Central], axis="columns").reindex(GrLiv.index)

# Horizontal boxplot: GrLivArea spread within each CentralAir class
f = plt.figure(figsize=(18, 6))
sb.boxplot(data=CentralGrLiv, x="GrLivArea", y="CentralAir", orient="h")

In [None]:
# problem 2 for GrLivArea part c
from sklearn.tree import DecisionTreeClassifier

In [None]:
# problem 2 for GrLivArea part d
# Positional hold-out split of the predictor, using the same slices as the
# CentralAir response: first 1100 rows train, last 360 rows test.
# Train Set : 1100 samples
GrLiv_train = pd.DataFrame(GrLiv[:1100])

# Test Set : 360 samples
GrLiv_test = pd.DataFrame(GrLiv[-360:])

# The slice sizes are hard-coded, so fail loudly if the dataset is not the
# size they assume: a shorter file would put rows in both train and test
# (leakage), a longer one would silently drop rows.
assert len(GrLiv_train) + len(GrLiv_test) == len(GrLiv), \
    "train/test slices do not partition GrLivArea; check the dataset size"
# Predictor and response must describe the same rows.
assert GrLiv_train.index.equals(Central_train.index), "train rows are misaligned"
assert GrLiv_test.index.equals(Central_test.index), "test rows are misaligned"

# Check the sample sizes
print("Train Set :", Central_train.shape, GrLiv_train.shape)
print("Test Set  :", Central_test.shape, GrLiv_test.shape)

In [None]:
# problem 2 for GrLivArea part e
# Depth-2 classification tree: GrLivArea -> CentralAir, fitted on the train rows
dectree = DecisionTreeClassifier(max_depth=2)
dectree.fit(X=GrLiv_train, y=Central_train)

In [None]:
# problem 2 for GrLivArea part f
# Draw the fitted GrLivArea tree
from sklearn.tree import plot_tree

f = plt.figure(figsize=(12, 12))
plot_tree(
    dectree,
    feature_names=["GrLivArea"],
    class_names=["No", "Yes"],
    filled=True,
    rounded=True,
)

In [None]:
# problem 2 for GrLivArea part g
from sklearn.metrics import confusion_matrix

# Predicted CentralAir for the GrLivArea train rows
Central_train_pred = dectree.predict(GrLiv_train)

# Two-way confusion matrix (actual vs predicted) as a count heatmap
sb.heatmap(
    confusion_matrix(Central_train, Central_train_pred),
    annot=True,
    fmt=".0f",
    annot_kws={"size": 18},
)

In [None]:
# problem 2 for GrLivArea part h
# Training accuracy of the GrLivArea tree
print("Classification Accuracy \t:", dectree.score(GrLiv_train, Central_train))

# Cell captions: quadrant name plus that quadrant's share of the train rows
group_names = ['True Neg','False Pos', 'False Neg', 'True Pos']
train_cm = confusion_matrix(Central_train, Central_train_pred)
group_percentages = [f"{share:.2%}" for share in train_cm.flatten() / len(Central_train)]
labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)

# Confusion-matrix heatmap annotated with the captions
sb.heatmap(train_cm, annot=labels, fmt="", annot_kws={"size": 18})

In [None]:
# problem 2 for GrLivArea part i
from sklearn.metrics import confusion_matrix

# Predicted CentralAir for the held-out GrLivArea rows
Central_test_pred = dectree.predict(GrLiv_test)

# Two-way confusion matrix (actual vs predicted) as a count heatmap
sb.heatmap(
    confusion_matrix(Central_test, Central_test_pred),
    annot=True,
    fmt=".0f",
    annot_kws={"size": 18},
)

In [None]:
# problem 2 for GrLivArea part j
# Test accuracy of the GrLivArea tree
print("Classification Accuracy \t:", dectree.score(GrLiv_test, Central_test))

# Cell captions: quadrant name plus that quadrant's share of the test rows
group_names = ['True Neg','False Pos', 'False Neg', 'True Pos']
test_cm = confusion_matrix(Central_test, Central_test_pred)
group_percentages = [f"{share:.2%}" for share in test_cm.flatten() / len(Central_test)]
labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)

# Confusion-matrix heatmap annotated with the captions
sb.heatmap(test_cm, annot=labels, fmt="", annot_kws={"size": 18})

In [None]:
# problem 2 for OverallQual
# Predictor for this section: OverallQual, as a one-column frame
Overall = houseData["OverallQual"].to_frame()

In [None]:
# problem 2 for OverallQual part a
# Count plot of the CentralAir classes, then the class counts as a table
sb.catplot(data=Central, y="CentralAir", kind="count")
Central.CentralAir.value_counts()

In [None]:
# problem 2 for OverallQual part b
# Put OverallQual and CentralAir side by side in one frame
CentralOverall = pd.concat([Overall, Central], axis="columns").reindex(Overall.index)

# Horizontal boxplot: OverallQual spread within each CentralAir class
f = plt.figure(figsize=(18, 6))
sb.boxplot(data=CentralOverall, x="OverallQual", y="CentralAir", orient="h")

In [None]:
# problem 2 for OverallQual part c
from sklearn.tree import DecisionTreeClassifier

In [None]:
# problem 2 for OverallQual part d
# Positional hold-out split of the predictor, using the same slices as the
# CentralAir response: first 1100 rows train, last 360 rows test.
# Train Set : 1100 samples
Overall_train = pd.DataFrame(Overall[:1100])

# Test Set : 360 samples
Overall_test = pd.DataFrame(Overall[-360:])

# The slice sizes are hard-coded, so fail loudly if the dataset is not the
# size they assume: a shorter file would put rows in both train and test
# (leakage), a longer one would silently drop rows.
assert len(Overall_train) + len(Overall_test) == len(Overall), \
    "train/test slices do not partition OverallQual; check the dataset size"
# Predictor and response must describe the same rows.
assert Overall_train.index.equals(Central_train.index), "train rows are misaligned"
assert Overall_test.index.equals(Central_test.index), "test rows are misaligned"

# Check the sample sizes
print("Train Set :", Central_train.shape, Overall_train.shape)
print("Test Set  :", Central_test.shape, Overall_test.shape)

In [None]:
# problem 2 for OverallQual part e
# Depth-2 classification tree: OverallQual -> CentralAir, fitted on the train rows
dectree = DecisionTreeClassifier(max_depth=2)
dectree.fit(X=Overall_train, y=Central_train)

In [None]:
# problem 2 for OverallQual part f
# Draw the fitted OverallQual tree
from sklearn.tree import plot_tree

f = plt.figure(figsize=(12, 12))
plot_tree(
    dectree,
    feature_names=["OverallQual"],
    class_names=["No", "Yes"],
    filled=True,
    rounded=True,
)

In [None]:
# problem 2 for OverallQual part g
from sklearn.metrics import confusion_matrix

# Predicted CentralAir for the OverallQual train rows
Central_train_pred = dectree.predict(Overall_train)

# Two-way confusion matrix (actual vs predicted) as a count heatmap
sb.heatmap(
    confusion_matrix(Central_train, Central_train_pred),
    annot=True,
    fmt=".0f",
    annot_kws={"size": 18},
)

In [None]:
# problem 2 for OverallQual part h
# Training accuracy of the OverallQual tree
print("Classification Accuracy \t:", dectree.score(Overall_train, Central_train))

# Cell captions: quadrant name plus that quadrant's share of the train rows
group_names = ['True Neg','False Pos', 'False Neg', 'True Pos']
train_cm = confusion_matrix(Central_train, Central_train_pred)
group_percentages = [f"{share:.2%}" for share in train_cm.flatten() / len(Central_train)]
labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)

# Confusion-matrix heatmap annotated with the captions
sb.heatmap(train_cm, annot=labels, fmt="", annot_kws={"size": 18})

In [None]:
# problem 2 for OverallQual part i
from sklearn.metrics import confusion_matrix

# Predicted CentralAir for the held-out OverallQual rows
Central_test_pred = dectree.predict(Overall_test)

# Two-way confusion matrix (actual vs predicted) as a count heatmap
sb.heatmap(
    confusion_matrix(Central_test, Central_test_pred),
    annot=True,
    fmt=".0f",
    annot_kws={"size": 18},
)

In [None]:
# problem 2 for OverallQual part j
# Test accuracy of the OverallQual tree
print("Classification Accuracy \t:", dectree.score(Overall_test, Central_test))

# Cell captions: quadrant name plus that quadrant's share of the test rows
group_names = ['True Neg','False Pos', 'False Neg', 'True Pos']
test_cm = confusion_matrix(Central_test, Central_test_pred)
group_percentages = [f"{share:.2%}" for share in test_cm.flatten() / len(Central_test)]
labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)

# Confusion-matrix heatmap annotated with the captions
sb.heatmap(test_cm, annot=labels, fmt="", annot_kws={"size": 18})

In [None]:
# problem 2 for YearBuilt
# Predictor for this section: YearBuilt, as a one-column frame
Year = houseData["YearBuilt"].to_frame()

In [None]:
# problem 2 for YearBuilt part a
# Count plot of the CentralAir classes, then the class counts as a table
sb.catplot(data=Central, y="CentralAir", kind="count")
Central.CentralAir.value_counts()

In [None]:
# problem 2 for YearBuilt part b
# Put YearBuilt and CentralAir side by side in one frame
CentralYear = pd.concat([Year, Central], axis="columns").reindex(Year.index)

# Horizontal boxplot: YearBuilt spread within each CentralAir class
f = plt.figure(figsize=(18, 6))
sb.boxplot(data=CentralYear, x="YearBuilt", y="CentralAir", orient="h")

In [None]:
# problem 2 for YearBuilt part c
from sklearn.tree import DecisionTreeClassifier

In [None]:
# problem 2 for YearBuilt part d
# Positional hold-out split of the predictor, using the same slices as the
# CentralAir response: first 1100 rows train, last 360 rows test.
# Train Set : 1100 samples
Year_train = pd.DataFrame(Year[:1100])

# Test Set : 360 samples
Year_test = pd.DataFrame(Year[-360:])

# The slice sizes are hard-coded, so fail loudly if the dataset is not the
# size they assume: a shorter file would put rows in both train and test
# (leakage), a longer one would silently drop rows.
assert len(Year_train) + len(Year_test) == len(Year), \
    "train/test slices do not partition YearBuilt; check the dataset size"
# Predictor and response must describe the same rows.
assert Year_train.index.equals(Central_train.index), "train rows are misaligned"
assert Year_test.index.equals(Central_test.index), "test rows are misaligned"

# Check the sample sizes
print("Train Set :", Central_train.shape, Year_train.shape)
print("Test Set  :", Central_test.shape, Year_test.shape)

In [None]:
# problem 2 for YearBuilt part e
# Depth-2 classification tree: YearBuilt -> CentralAir, fitted on the train rows
dectree = DecisionTreeClassifier(max_depth=2)
dectree.fit(X=Year_train, y=Central_train)

In [None]:
# problem 2 for YearBuilt part f
# Draw the fitted YearBuilt tree
from sklearn.tree import plot_tree

f = plt.figure(figsize=(12, 12))
plot_tree(
    dectree,
    feature_names=["YearBuilt"],
    class_names=["No", "Yes"],
    filled=True,
    rounded=True,
)

In [None]:
# problem 2 for YearBuilt part g
from sklearn.metrics import confusion_matrix

# Predicted CentralAir for the YearBuilt train rows
Central_train_pred = dectree.predict(Year_train)

# Two-way confusion matrix (actual vs predicted) as a count heatmap
sb.heatmap(
    confusion_matrix(Central_train, Central_train_pred),
    annot=True,
    fmt=".0f",
    annot_kws={"size": 18},
)

In [None]:
# problem 2 for YearBuilt part h
# Training accuracy of the YearBuilt tree
print("Classification Accuracy \t:", dectree.score(Year_train, Central_train))

# Cell captions: quadrant name plus that quadrant's share of the train rows
group_names = ['True Neg','False Pos', 'False Neg', 'True Pos']
train_cm = confusion_matrix(Central_train, Central_train_pred)
group_percentages = [f"{share:.2%}" for share in train_cm.flatten() / len(Central_train)]
labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)

# Confusion-matrix heatmap annotated with the captions
sb.heatmap(train_cm, annot=labels, fmt="", annot_kws={"size": 18})

In [None]:
# problem 2 for YearBuilt part i
from sklearn.metrics import confusion_matrix

# Predicted CentralAir for the held-out YearBuilt rows
Central_test_pred = dectree.predict(Year_test)

# Two-way confusion matrix (actual vs predicted) as a count heatmap
sb.heatmap(
    confusion_matrix(Central_test, Central_test_pred),
    annot=True,
    fmt=".0f",
    annot_kws={"size": 18},
)

In [None]:
# problem 2 for YearBuilt part j
# Test accuracy of the YearBuilt tree
print("Classification Accuracy \t:", dectree.score(Year_test, Central_test))

# Cell captions: quadrant name plus that quadrant's share of the test rows
group_names = ['True Neg','False Pos', 'False Neg', 'True Pos']
test_cm = confusion_matrix(Central_test, Central_test_pred)
group_percentages = [f"{share:.2%}" for share in test_cm.flatten() / len(Central_test)]
labels = np.asarray(
    [f"{name}\n{pct}" for name, pct in zip(group_names, group_percentages)]
).reshape(2, 2)

# Confusion-matrix heatmap annotated with the captions
sb.heatmap(test_cm, annot=labels, fmt="", annot_kws={"size": 18})

In [None]:
# problem 3
# Compare the four single-predictor trees by classification accuracy.
# The accuracies are recomputed here rather than pasted in as string
# literals, so the summary cannot drift from the data or the model settings.
# `dectree` is overwritten by each section, so a fresh tree with the same
# configuration (max_depth = 2) is fitted per predictor.
predictor_splits = {
    "SalePrice": (Sale_train, Sale_test),
    "GrLivArea": (GrLiv_train, GrLiv_test),
    "OverallQual": (Overall_train, Overall_test),
    "YearBuilt": (Year_train, Year_test),
}

# predictor name -> (train accuracy, test accuracy)
accuracy_by_predictor = {}
for predictor, (X_train, X_test) in predictor_splits.items():
    summary_tree = DecisionTreeClassifier(max_depth = 2)
    summary_tree.fit(X_train, Central_train)
    train_acc = summary_tree.score(X_train, Central_train)
    test_acc = summary_tree.score(X_test, Central_test)
    accuracy_by_predictor[predictor] = (train_acc, test_acc)
    print(f"Classification Accuracy for {predictor}_train \t:", train_acc)
    print(f"Classification Accuracy for {predictor}_test \t:", test_acc)

# Rank by test accuracy first (held-out data), with train accuracy as the
# tie-breaker; reversing the (train, test) tuple gives that ordering.
best = max(accuracy_by_predictor, key=lambda p: accuracy_by_predictor[p][::-1])
best_train, best_test = accuracy_by_predictor[best]
best_on_train_too = all(best_train >= tr for tr, _ in accuracy_by_predictor.values())

print(f"From the accuracies, the best variable to predict CentralAir is {best}.")
if best_on_train_too:
    print(f"This is because {best}'s accuracy scores are the highest for both train data and test data among the variables.")
else:
    # Only claim what the numbers support.
    print(f"This is because {best}'s accuracy score is the highest for the test data among the variables.")
# NOTE(review): compare these accuracies with the majority-class share shown
# by Central["CentralAir"].value_counts() before reading much into the ranking.