In [None]:
import pickle
import pathlib

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Data folder: sibling "data" directory of the notebook's working directory.
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

# Load the cleaned dataset (presumably written by an earlier cleaning notebook).
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# this path must only ever point at a trusted, locally produced pickle.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean.pkl')
with clean_data_path.open('rb') as file:
    data = pickle.load(file)

In [None]:
def nonNull_percent(label, null_value = 0, frame = None):
    """Return the percentage of rows whose value in ``label`` is informative.

    A row counts as informative when its value is neither missing (NaN) nor
    equal to ``null_value``.

    Parameters
    ----------
    label : column name to inspect.
    null_value : value treated as "empty" for this column (default ``0``).
    frame : DataFrame to inspect. When omitted, the notebook-level ``data``
        frame is used, so existing calls keep working unchanged.

    Returns
    -------
    Percentage (0-100) relative to the total number of rows in the column.
    """
    source = data if frame is None else frame
    column = source[label]
    # ``count`` skips NaN, so missing values never reach the numerator even
    # though ``NaN != null_value`` evaluates to True in the mask.
    informative = column[column != null_value].count()
    return informative / column.shape[0] * 100

def heatMapCorr(labels, frame = None):
    """Show an annotated correlation heatmap of ``labels`` plus ``SalePrice``.

    Parameters
    ----------
    labels : a column name or any iterable of column names (list, tuple,
        pandas Index, ...).
    frame : DataFrame to read from. When omitted, the notebook-level ``data``
        frame is used, so existing calls keep working unchanged.
    """
    source = data if frame is None else frame
    if isinstance(labels, str):
        labels = [labels]
    # Build the column list explicitly: ``labels + [...]`` only works for
    # lists (it raises for tuples and concatenates element-wise for an Index).
    # dict.fromkeys drops repeats while keeping order, so listing "SalePrice"
    # in ``labels`` does not duplicate the target column.
    columns = list(dict.fromkeys([*labels, "SalePrice"]))
    correlation_matrix = source[columns].corr()

    # Create the heatmap on an explicit axes object.
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
    ax.set_title('Correlation Heatmap')
    plt.show()

In [None]:
# Work on a copy so the frame bound to `data` is not modified by the encoding.
model_data = data.copy()

# Split the pandas 'category' columns by whether their levels carry an order.
categorical_columns = []
ordinal_columns = []
for col in model_data.select_dtypes('category').columns:
    if model_data[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

# Ordinal columns are replaced by integer codes. The codes are taken from the
# copy being encoded (not from the global `data`), and the uniques returned by
# factorize are dropped by indexing instead of being bound to `_`, which is
# IPython's "last output" variable.
# NOTE(review): per the pandas docs factorize marks missing values with -1 and
# only numbers the values that actually occur; confirm this is preferred over
# `.cat.codes`, which keeps the position of every declared level.
for col in ordinal_columns:
    model_data[col] = pd.factorize(model_data[col], sort=True)[0]

# Illustration on one nominal column: full one-hot encoding next to the
# original values.
# NOTE(review): as laid out here, the two `.head().transpose()` expressions are
# not the last statement of the cell, so Jupyter will not render them.
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data)

aux_dataframe = encoded_data
aux_dataframe['Exterior'] = original_data.copy()

aux_dataframe.head().transpose()

# Same illustration with drop_first=True (the first level is dropped).
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data, drop_first=True)

aux_dataframe = encoded_data
aux_dataframe['Exterior'] = original_data.copy()

aux_dataframe.head().transpose()

# Encode every remaining nominal column at once and summarise the result.
model_data = pd.get_dummies(model_data, drop_first=True)
model_data.info()

# Report which dummy columns each nominal column produced; `dumm` collects the
# names of all generated dummy columns.
dumm = []
for cat in categorical_columns:
    prefix = cat + "_"
    dummies = [col for col in model_data.columns if col.startswith(prefix)]
    dumm.extend(dummies)
    dummies_str = ', '.join(f'"{col}"' for col in dummies)
    print(f'From column "{cat}" we made {dummies_str}\n')

In [None]:
# From here on `data` (the frame the helper functions read) is the encoded frame.
data = model_data
# Histogram of every column; the trailing print() keeps the axes array from
# being echoed as the cell output.
model_data.hist(figsize=(35, 35))
print()

**Checking the distribution and format of the data**

In [None]:
# Lot-related columns, correlated with each other and with SalePrice.
lot_cat = [
    "Lot.Frontage",
    "Lot.Area",
    "Lot.Shape",
]
heatMapCorr(lot_cat)

In [None]:
# All of them look relevant to the target, so inspect their distributions.
data.hist(column=lot_cat, bins=20)
print()

In [None]:
# Lot.Area looks like a candidate for a log transform: preview log10 of it.
np.log10(data["Lot.Area"]).to_frame().hist()
print()

In [None]:
# Look the coefficient up by label: `series[0]` on a label-indexed Series is a
# deprecated positional lookup.
print(f'correlation with target: {data[["Land.Slope","SalePrice"]].corr().loc["Land.Slope", "SalePrice"]}')
# not the highest correlation, so let's check how much of it isn't null or common:
# (printed explicitly: a bare expression in the middle of a cell is never shown)
print(nonNull_percent("Land.Slope"))
# only four percent of the data has a non-null value for this feature, so I will store it for now as a potential removal, as it also does not impact too much according to the correlation.
lessThan5p = ["Land.Slope"]

In [None]:
# Overall quality / condition columns.
over_cat = [
    "Overall.Qual",
    "Overall.Cond",
]
# These seem to reflect a rating.
data.hist(column=over_cat, bins=20)
print()

In [None]:
# Disabled experiment (combined rating), kept for reference:
# data["Overall.Rat"]  = data["Overall.Cond"] + data["Overall.Qual"]

heatMapCorr(over_cat)
# the overwhelming majority of houses in the dataset are 4 in cond, so let's see if the remainder is significant
print(nonNull_percent("Overall.Cond",4))
# it is, so the feature may still differ from one house to another in quite a few cases; no alterations will be done here and they are stored as ratings
# Copy the list: a plain `ratings = over_cat` would make both names share one
# list, so later additions to `ratings` would also change `over_cat`.
ratings = list(over_cat) # we may like to avoid transforming those.

# data = data.drop(columns=over_cat)

In [None]:
# Look the coefficient up by label: `series[0]` on a label-indexed Series is a
# deprecated positional lookup.
print(data[["Mas.Vnr.Area","SalePrice"]].corr().loc["Mas.Vnr.Area", "SalePrice"]) # high correlation...
data["Mas.Vnr.Area"].hist()
print(nonNull_percent("Mas.Vnr.Area")) # has a good non-null amount...
# Starts the list of columns that are power-transformed further down.
right_skewed = ["Mas.Vnr.Area"]

In [None]:
# Exterior quality / condition columns.
exter_cat = [
    "Exter.Qual",
    "Exter.Cond",
]
# These seem to reflect a more unbalanced rating.
data.hist(column=exter_cat, bins=20)
print()

In [None]:
# Disabled experiment (combined rating), kept for reference:
# data["Exter.Rat"] = data["Exter.Qual"] + data["Exter.Cond"]
heatMapCorr(exter_cat)
# the overwhelming majority of houses in the dataset are rated 2 on both, so let's see if the remainder is significant
print(f"qual: {nonNull_percent(exter_cat[0],2)}")
print(f"cond: {nonNull_percent(exter_cat[1],2)}")
# it is, so the feature may still differ from one house to another in quite a few cases; no alterations will be done here and they are stored as ratings
# Rebuild `ratings` as a new, de-duplicated list (order kept): re-running this
# cell no longer appends the same names twice, and any list that shares
# identity with `ratings` is left unmodified.
ratings = list(dict.fromkeys([*ratings, *exter_cat]))
# data = data.drop(columns=exter_cat)

In [None]:
# Basement area columns; check their relevance against SalePrice.
basement_cat = [
    "BsmtFin.SF.1",
    "BsmtFin.SF.2",
    "Bsmt.Unf.SF",
    "Total.Bsmt.SF",
]
heatMapCorr(basement_cat)

In [None]:
# Distribution and share of non-zero values for each basement area column.
data[basement_cat].hist()
for cat in basement_cat:
    print(cat, nonNull_percent(cat))

# Queue three of the basement columns for the power transform. The list is
# rebuilt without repeats (order kept) so that re-running this cell does not
# queue the same column twice.
right_skewed = list(dict.fromkeys(right_skewed + ["BsmtFin.SF.1","BsmtFin.SF.2", "Bsmt.Unf.SF"]))

In [None]:
# Heating and Electrical categories
he_cat = ["Electrical","Heating.QC"]
heatMapCorr(he_cat)  # seems relevant enough
data[he_cat].hist()
print(nonNull_percent(he_cat[0]))
print(nonNull_percent(he_cat[1]))
# Store the columns as a separate list: a plain `categories = he_cat` would
# make both names share one list, so extending one would change the other.
categories = list(he_cat)

In [None]:
# Floor-area columns (presumably first / second floor square footage; confirm
# against the dataset's data dictionary).
xcat = ["X1st.Flr.SF","X2nd.Flr.SF"]
heatMapCorr(xcat)
data[xcat].hist()
print(nonNull_percent(xcat[0]))
print(nonNull_percent(xcat[1]))
# No worries it seems, still, what is it?
# Queue the second column for the power transform; the membership check keeps
# the list free of repeats when this cell is re-run.
if xcat[1] not in right_skewed:
    right_skewed.append(xcat[1])

In [None]:
# Both figures are printed explicitly: as bare expressions ahead of the last
# statement they were computed but never shown. The coefficient is looked up
# by label because `series[0]` on a label index is a deprecated positional
# lookup.
print(data[["Low.Qual.Fin.SF","SalePrice"]].corr().loc["Low.Qual.Fin.SF", "SalePrice"])
print(nonNull_percent("Low.Qual.Fin.SF"))
# Flag the column as a removal candidate; the membership check keeps the list
# free of repeats when this cell is re-run.
if "Low.Qual.Fin.SF" not in lessThan5p:
    lessThan5p.append("Low.Qual.Fin.SF")

In [None]:
# Room-count columns and a derived feature: above-ground living area divided
# by the sum of the four room counts.
rooms = ["Full.Bath","Half.Bath", "Kitchen.AbvGr", "TotRms.AbvGrd"]
room_total = (
    data["TotRms.AbvGrd"]
    + data["Full.Bath"]
    + data["Half.Bath"]
    + data["Kitchen.AbvGr"]
)
data["SqFtPerRoom"] = data["Gr.Liv.Area"] / room_total

# Correlations and distributions of the counts, the new feature and the area.
room_view = rooms + ["SqFtPerRoom","Gr.Liv.Area"]
heatMapCorr(room_view)
data[room_view].hist()

In [None]:
# Porch area columns.
porch_cat = [
    "Enclosed.Porch",
    "Screen.Porch",
    "X3Ssn.Porch",
]
# Low relevance for every one of them.
heatMapCorr(porch_cat)

# Share of rows with a non-zero value, one line per porch column.
for porch_col in porch_cat:
    print(nonNull_percent(porch_col))

In [None]:
# Remove the two columns from the modelling frame. errors="ignore" makes the
# cell safe to re-run: without it a second execution raises KeyError because
# the columns are already gone.
data = data.drop(columns=["Pool.Area","Misc.Val"], errors="ignore")

In [None]:
# Replace Lot.Area with its base-10 logarithm.
# NOTE: this overwrites the column, so running the cell twice applies the
# logarithm twice. Restart and run all if the cell has to be executed again.
lot_area = data["Lot.Area"]
data["Lot.Area"] = np.log10(lot_area)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Use a de-duplicated, order-preserving copy of the queued columns: if one of
# the cells that fill `right_skewed` was run more than once, repeated names
# would produce duplicate columns and break the assignment below.
skewed_cols = list(dict.fromkeys(right_skewed))

# Distributions before the transform.
data[skewed_cols].hist()
# PowerTransformer is used with its default settings; the fitted object stays
# in `p`.
# NOTE: the columns are overwritten in place, so re-running this cell
# transforms already-transformed values.
p = PowerTransformer()
data[skewed_cols] = p.fit_transform(data[skewed_cols])
# Distributions after the transform.
data[skewed_cols].hist()

In [None]:
# Persist the engineered frame next to the cleaned one.
# Note that `clean_data_path` now refers to the output file, not the input.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean_eng.pkl')
with clean_data_path.open('wb') as file:
    pickle.dump(data, file)