In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
import pandas as pd
import kaggle
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from dataprep.eda import create_report, plot

In [None]:
def download_data():
    """Download the competition files from Kaggle.

    Creates a ``kaggle.KaggleApi`` client, authenticates it, and calls
    ``competition_download_files`` for the
    "house-prices-advanced-regression-techniques" competition.
    """
    api = kaggle.KaggleApi()
    api.authenticate()
    api.competition_download_files(
        competition="house-prices-advanced-regression-techniques"
    )

In [None]:
# Load both competition splits; the first CSV column becomes the row index.
train, test = (
    pd.read_csv(csv_name, index_col=0) for csv_name in ("train.csv", "test.csv")
)

In [None]:
# Report (rows, columns) of each split, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# Show the training frame as this cell's output.
train

In [None]:
# Build the dataprep.eda report for the whole training frame.
create_report(df=train)

In [None]:
# Working frame for engineered features, seeded with the raw MSSubClass column
# and sharing train's row index.
df_experimental = train[["MSSubClass"]].copy()

In [None]:
# MSSubClass codes in ascending order; a code's position in this list becomes
# its ordinal label (20 -> 0, 30 -> 1, ..., 190 -> 15).
values = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190]
subclass_to_ordinal = {code: position for position, code in enumerate(values)}

# Recode in one pass: every lookup is made against the column's current values,
# so a freshly written label can never be mistaken for a code that still has to
# be translated. Values that are not listed in `values` are left unchanged.
df_experimental["MSSubClass"] = df_experimental["MSSubClass"].replace(subclass_to_ordinal)

In [None]:
# List the distinct MSSubClass values now stored in the working frame.
df_experimental["MSSubClass"].unique()

In [None]:
# dataprep.eda plot of the MSSubClass column of the working frame.
plot(df_experimental, "MSSubClass")

In [None]:
# Bring the raw zoning column and the SalePrice column into the working frame.
df_experimental = df_experimental.assign(
    MSZoning=train["MSZoning"],
    SalePrice=train["SalePrice"],
)

In [None]:
# dataprep.eda plot of the MSZoning column of the working frame.
plot(df_experimental, "MSZoning")

In [None]:
# Residential = 1 for zoning codes "RL" or "RM", 0 for anything else
# (including missing values); the raw zoning column is then removed.
zoned_rl_or_rm = df_experimental["MSZoning"].isin(["RL", "RM"])
df_experimental["Residential"] = np.where(zoned_rl_or_rm, 1, 0)
df_experimental = df_experimental.drop(columns="MSZoning")

In [None]:
# Copy the two lot-size columns over unchanged.
df_experimental = df_experimental.assign(
    LotFrontage=train["LotFrontage"],
    LotArea=train["LotArea"],
)

In [None]:
# dataprep.eda plot of the Street column of the training data.
plot(train, "Street")

In [None]:
# Paved = 1 where Street is "Pave", 0 otherwise.
street_is_paved = train["Street"].eq("Pave")
df_experimental["Paved"] = np.where(street_is_paved, 1, 0)

In [None]:
# Show the working frame after adding the Paved flag.
df_experimental

In [None]:
# dataprep.eda plot of the Alley column of the training data.
plot(train, "Alley")

In [None]:
# Probably won't include Alley
# The experiment below is disabled with real comments rather than a bare
# triple-quoted string, so running the cell does not echo the text as output.
#
# df_experimental["Alley"] = train["Alley"]
# df_experimental["Alley"] = df_experimental["Alley"].fillna(0)
# df_experimental["Alley"] = np.where((df_experimental["Alley"] == "Grvl") | (df_experimental["Alley"] == "Pave"), 1, 0)

In [None]:
# Preview the first rows of the working frame.
df_experimental.head()

In [None]:
# dataprep.eda plot of the LotShape column of the training data.
plot(train, "LotShape")

In [None]:
# RegularLotShape = 1 where LotShape is "Reg", 0 otherwise. The raw LotShape
# column is not kept in the working frame.
lot_is_regular = train["LotShape"].eq("Reg")
df_experimental["RegularLotShape"] = np.where(lot_is_regular, 1, 0)
df_experimental["RegularLotShape"].unique()

In [None]:
# Preview the working frame after adding RegularLotShape.
df_experimental.head()

In [None]:
# dataprep.eda plot of the LandContour column of the training data.
plot(train, "LandContour")

In [None]:
# LevelContour = 1 where LandContour is "Lvl", 0 otherwise. The raw
# LandContour column is not kept in the working frame.
contour_is_level = train["LandContour"].eq("Lvl")
df_experimental["LevelContour"] = np.where(contour_is_level, 1, 0)
df_experimental.head()

In [None]:
# dataprep.eda plot of the Utilities column of the training data.
plot(train, "Utilities")

In [None]:
# AllUtilities = 1 where Utilities is "AllPub", 0 otherwise. A missing value
# never equals "AllPub", so it also yields 0. The raw Utilities column is not
# kept in the working frame.
has_all_public_utilities = train["Utilities"].eq("AllPub")
df_experimental["AllUtilities"] = np.where(has_all_public_utilities, 1, 0)
df_experimental.head()

In [None]:
# dataprep.eda plot of the LotConfig column of the training data.
plot(train, "LotConfig")

In [None]:
# InsideStreet = 1 where LotConfig is "Inside", 0 otherwise. The flag is read
# straight from train; LotConfig itself is not stored as a feature, so no
# temporary copy of it is added to (and then dropped from) the working frame.
df_experimental["InsideStreet"] = np.where(train["LotConfig"] == "Inside", 1, 0)
df_experimental.head()

In [None]:
# dataprep.eda plot of the BldgType column of the training data.
plot(train, "BldgType")

In [None]:
# SingleFam = 1 where BldgType is "1Fam", 0 otherwise.
building_is_single_family = train["BldgType"].eq("1Fam")
df_experimental["SingleFam"] = np.where(building_is_single_family, 1, 0)
df_experimental

In [None]:
# dataprep.eda plot of the YearBuilt column of the training data.
plot(train, "YearBuilt")

In [None]:
# Distinct construction years in ascending order.
years = np.sort(train["YearBuilt"].unique())
years

In [None]:
# NOTE(review): despite the column name, this stores the absolute number of
# years between YearBuilt and YearRemodAdd, not the years since construction.
# Confirm that this is the intended feature.
# The values are handed over as a plain list, so they are assigned by position.
build_to_remodel_gap = (train["YearBuilt"] - train["YearRemodAdd"]).abs()
df_experimental["Age"] = build_to_remodel_gap.tolist()
df_experimental.head()

In [None]:
# List the distinct values of the Age column.
df_experimental["Age"].unique()

In [None]:
# Copy the two overall rating columns over unchanged.
df_experimental = df_experimental.assign(
    OverallQual=train["OverallQual"],
    OverallCond=train["OverallCond"],
)

In [None]:
# dataprep.eda plot of the RoofStyle column of the training data.
plot(train, "RoofStyle")

In [None]:
# dataprep.eda plot of the ExterQual column of the training data.
plot(train, "ExterQual")

In [None]:
# Columns graded with the Ex/Gd/TA/Fa/Po codes, and the ordinal score given to
# each code (higher is better).
exter_columns = ["ExterQual", "ExterCond"]
exterqual_dict = {
    "Ex": 5,
    "Gd": 4,
    "TA": 3,
    "Fa": 2,
    "Po": 1
}

# Series.map translates every code in one vectorised pass and produces a
# numeric column. Overwriting the strings one code at a time would leave an
# object-dtype column that merely happens to contain integers.
# Any value that is not a key of exterqual_dict (including missing) becomes NaN.
for column in exter_columns:
    df_experimental[column] = train[column].map(exterqual_dict)

df_experimental

In [None]:
# Start (or restart) the combined basement score at zero for every row. A
# scalar is broadcast over the frame's index, so no row count is hard-coded.
bsmt_columns = ["BsmtQual", "BsmtCond"]
df_experimental["FinalBsmtQual"] = 0.0

# Add the ordinal score of each basement grade column, using the same
# code-to-score table as the exterior grades (exterqual_dict). Missing grades,
# and any code absent from the table, contribute 0. Mapping straight from train
# keeps the running total numeric and avoids temporary columns.
for column in bsmt_columns:
    df_experimental["FinalBsmtQual"] += train[column].map(exterqual_dict).fillna(0)
df_experimental

In [None]:
# Ordinal score for each basement exposure code (higher is more exposure).
bst_exposure_dict = {
    "Gd": 4,
    "Av": 3,
    "Mn": 2,
    "No": 1
}

# Missing exposure values (and any code not in the table) score 0, matching the
# fillna(0) treatment of the other basement columns. Without this, a missing
# value would turn the row's whole FinalBsmtQual total into NaN.
exposure_score = train["BsmtExposure"].map(bst_exposure_dict).fillna(0)

# This adds onto the existing total, so re-running this cell without first
# re-running the cell that resets FinalBsmtQual counts the exposure twice.
df_experimental["FinalBsmtQual"] += exposure_score

df_experimental

In [None]:
# Ordinal score for each basement finish-type code (higher is better).
bsmt_type_dict = {
    "GLQ": 6,
    "ALQ": 5,
    "BLQ": 4,
    "Rec": 3,
    "LwQ": 2,
    "Unf": 1
}

# Add the score of BsmtFinType1 and BsmtFinType2 to the combined basement
# score. Missing values (and codes not in the table) contribute 0. Mapping
# straight from train keeps the total numeric and needs no temporary columns.
# This adds onto the existing total, so re-running the cell counts it again.
for i in range(1, 3):
    finish_score = train[f"BsmtFinType{i}"].map(bsmt_type_dict).fillna(0)
    df_experimental["FinalBsmtQual"] += finish_score
df_experimental

In [None]:
# Copy TotalBsmtSF under the shorter name BsmtSF.
df_experimental = df_experimental.assign(BsmtSF=train["TotalBsmtSF"])

In [None]:
# HeatingQC uses the same Ex/Gd/TA/Fa/Po codes as the exterior grades, so it is
# scored with exterqual_dict. Series.map converts the column in one pass and
# yields a numeric column; values that are not keys of the table become NaN.
df_experimental["HeatingQual"] = train["HeatingQC"].map(exterqual_dict)

df_experimental

In [None]:
# CentralAir = 1 where the raw value is "Y", 0 otherwise.
has_central_air = train["CentralAir"].eq("Y")
df_experimental["CentralAir"] = np.where(has_central_air, 1, 0)

In [None]:
# Ordinal score for each electrical-system code. "Mix" shares the score of
# "FuseF".
electrical_dict = {
    "SBrkr": 4,
    "FuseA": 3,
    "FuseF": 2,
    "FuseP": 1,
    "Mix": 2
}

# Series.map converts the column in one pass and yields a numeric column.
# Missing values stay NaN; any code not listed above also becomes NaN.
df_experimental["ElectricQC"] = train["Electrical"].map(electrical_dict)

df_experimental

In [None]:
# Floor area as the sum of the first- and second-floor columns.
# TODO Add LowQualFinSF to this equation somehow
first_floor_sf = train["1stFlrSF"]
second_floor_sf = train["2ndFlrSF"]
df_experimental["TotalSF"] = first_floor_sf.add(second_floor_sf)

In [None]:
# Show the working frame after adding TotalSF.
df_experimental

In [None]:
# Each full bath counts as 1 and each half bath as 0.5.
# TODO Potentially add basement bathrooms and bedrooms
half_bath_share = train["HalfBath"].div(2)
df_experimental["Bathrooms"] = train["FullBath"].add(half_bath_share)
df_experimental

In [None]:
# Copy BedroomAbvGr under the shorter name Bedrooms.
df_experimental = df_experimental.assign(Bedrooms=train["BedroomAbvGr"])
df_experimental

In [None]:
# dataprep.eda plot of the KitchenQual column of the training data.
plot(train, "KitchenQual")

In [None]:
# Ordinal score for each kitchen quality code; the code-to-score pairs are the
# same as in exterqual_dict.
kitchen_dict = {
    "Ex": 5,
    "Gd": 4,
    "TA": 3,
    "Fa": 2,
    "Po": 1
}

# NOTE(review): this looks like an unfinished statement. It looks up a column
# whose label is the empty string, which no visible cell creates, and
# kitchen_dict is not used yet. TODO: confirm the intended column and mapping.
df_experimental[""]