In [659]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [660]:
path = "train.csv"
df = pd.read_csv(path)

In [661]:
# This takes the dataframe and removes bad features and then turns categorial variabels into boolean.
def data_conditioning(dataframe):
    dataframe.set_index("Id", inplace=True)
    categorical_columns = dataframe.select_dtypes(include=["object"]).columns

    for col in categorical_columns:
        dataframe = dataframe.drop(
            [col],
            axis=1,
        ).join(pd.get_dummies(df[col], prefix=f"{col}"))
    dataframe.fillna(dataframe.median(), inplace=True)
    return dataframe

In [662]:
# Data contitions and fills with median for missing values, and then scales the data
df = data_conditioning(dataframe=df)

columns_to_drop = [
    "Condition2_RRAe",
    "Condition2_RRAn",
    "Condition2_RRNn",
    "Electrical_Mix",
    "Exterior1st_ImStucc",
    "Exterior1st_Stone",
    "Exterior2nd_Other",
    "GarageQual_Ex",
    "Heating_Floor",
    "Heating_OthW",
    "HouseStyle_2.5Fin",
    "MiscFeature_TenC",
    "PoolQC_Fa",
    "RoofMatl_ClyTile",
    "RoofMatl_Membran",
    "RoofMatl_Metal",
    "RoofMatl_Roll",
    "Utilities_NoSeWa",
]

df = df.drop(columns=columns_to_drop)

scaler = StandardScaler()

# Separate features and target variable
X = df.drop(columns=["SalePrice"], axis=1)
y = df["SalePrice"]

In [663]:
# Correlation Matrix to choose which variables to focus on.
correlation_matrix = df.corr()
saleprice_correlation = (
    correlation_matrix["SalePrice"].abs().sort_values(ascending=False)
)

high_corr_vars_list = saleprice_correlation[saleprice_correlation > 0].index.tolist()

# Shortens the amount of variables to the ones with highest correlation.
high_corr_vars = df[high_corr_vars_list]
del high_corr_vars_list[0]
X = high_corr_vars.drop(columns=["SalePrice"], axis=1)
y = high_corr_vars["SalePrice"]

In [664]:
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train_S = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_S = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)

# Linear Regression Model
reg = LinearRegression()

reg.fit(X_train, y_train)

print(reg.score(X_train, y_train))
print(reg.score(X_test, y_test))

0.9173529626699538
-307409436.91678685


In [665]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()
B_forest = None
B_Score = 0

for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )

    X_train_S = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
    X_test_S = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)
    forest.fit(X_train_S, y_train)
    score = forest.score(X_test_S, y_test)
    if score > B_Score:
        B_Score = score
        B_forest = forest

print(B_Score)

0.9061492273609174


In [666]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [60],
    "max_features": [100],
}
grid_search = GridSearchCV(
    forest,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_log_error",
    return_train_score=True,
)
grid_search.fit(X_train_S, y_train)

best_forest = grid_search.best_estimator_
print(best_forest.score(X_test_S, y_test))
print(best_forest)
# Find out that the best one is the one I already have about.

0.8660359417163378
RandomForestRegressor(max_features=100, n_estimators=60)


In [669]:
path = "test.csv"
df = pd.read_csv(path)
df_index = df.Id

df_test = data_conditioning(dataframe=df)

df_test_S = pd.DataFrame(scaler.fit_transform(df_test), columns=df_test.columns)
df_test_S = df_test_S[high_corr_vars_list]

y_prediction = B_forest.predict(df_test_S)

output_df = pd.DataFrame({"Id": df_index, "SalePrice": y_prediction})
output_df.set_index("Id").to_csv("Housing_Price_Estimates_TP.csv")

Ideas 
- potential for multicolinearity (too many variables look to reduce some)
- Have missing values, better way to estimate the values of those.

In [670]:
y_pred = B_forest.predict(X_test_S)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

29013.30104528005
