# Load previously selected features

The file imported below was built in the previous exploratory notebook (2.1-rp-hcad-eda-appraised-value-perc-diff). These data have been cleaned, merged, and selected so we can start preparing them for modeling.

In [None]:
# Turn on IPython's autoreload extension (mode 2) so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from src.definitions import ROOT_DIR
from src.data.utils import save_pickle

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer ship the old names, so pick whichever of the
# two spellings this installation provides.
poster_style = (
    'seaborn-v0_8-poster'
    if 'seaborn-v0_8-poster' in plt.style.available
    else 'seaborn-poster'
)
plt.style.use(poster_style)

In [None]:
# Pickled feature table produced by the previous exploratory notebook.
features_fn = ROOT_DIR / 'data/interim/2016/features.pickle'

# Fail with the offending path rather than a bare AssertionError; asserts are
# also skipped entirely when Python runs with -O.
if not features_fn.exists():
    raise FileNotFoundError(f'Features file not found: {features_fn}')

In [None]:
# Load the selected-features table. Unpickling can execute arbitrary code, so
# only point this at files generated by this project's own notebooks.
features = pd.read_pickle(features_fn)

In [None]:
# Show ten randomly drawn rows; no random_state is passed, so the selection
# differs between runs.
features.sample(10)

# Look at the target data distribution

In [None]:
from scipy import stats

In [None]:
# Histogram (with KDE overlay) of the target. sns.histplot is axes-level, so
# it draws on the axes created here; the figure-level sns.displot opens its
# own figure and would leave this one blank.
fig, ax = plt.subplots(figsize=(8, 8))
sns.histplot(features['tot_appr_val'], kde=True, ax=ax)
ax.tick_params(axis='x', rotation=70)

# Normal probability (Q-Q) plot of the same column on a second figure.
fig, ax = plt.subplots(figsize=(8, 8))
res = stats.probplot(features['tot_appr_val'], plot=ax)

The target (total appraised value) is right-skewed; in the Q-Q plot, however, it appears to be normal.

In [None]:
# (rows, columns) of the loaded feature table.
features.shape

In [None]:
# Summary of the columns (dtypes and non-null counts).
features.info()

# Find neighbors

I'd like to see if my target house was reasonably appraised in 2016. If we plot it on the histogram of the `tot_appr_val` we can see it lands near the high end.

In [None]:
# Histogram of appraised values in thousands of dollars, with a red vertical
# marker at the target house's value (292707, also expressed in thousands).
target_value_k = 292707 / 1000

fig, ax = plt.subplots(figsize=(8, 8))
(features['tot_appr_val'] / 1000).hist(bins=7, ax=ax)
ax.vlines(target_value_k, 0, 16, color='red', label='Target house')
ax.set_title('Finished houses (2015) in the subdivision')
ax.set_xlabel('tot_appr_val x1000 ($)')
ax.set_ylabel('Count')
_ = ax.legend(loc='upper right')

But it could just be part of a larger distribution, so I'd like to group the properties by nearest neighbors (no pun intended!) to add this label as a new feature hoping to improve the target estimation.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# List the available columns before choosing which ones to cluster on.
features.columns

Let's drop the value columns to only group the houses by their characteristics (number of fixtures, areas).

In [None]:
# Independent copy of the feature table without the two appraised-value
# columns, so clustering sees only the remaining columns.
value_columns = ['tot_appr_val', 'prior_tot_appr_val']
no_values = features.drop(columns=value_columns).copy()

In [None]:
# Elbow analysis: fit K-means for k = 1..11 and record each model's inertia
# so the number of clusters can be chosen from the curve.
# NOTE(review): no_values is clustered unscaled, so columns with larger
# numeric ranges presumably dominate the distances -- confirm this is intended.
ks = range(1, 12)

# random_state pins the centroid initialisation so the curve is reproducible;
# n_init is spelled out because its default differs between scikit-learn
# releases.
inertias = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(no_values).inertia_
    for k in ks
]

# Plot ks vs inertias
plt.figure(figsize=(8, 8))
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

Seems like 4 clusters is a good choice.

In [None]:
# Final clustering with four clusters. The seed (and explicit n_init) keeps
# the cluster ids stable between runs; they are used later as plot colours
# and as a categorical feature.
model = KMeans(n_clusters=4, n_init=10, random_state=42)

In [None]:
# Fit the clustering on the value-free features and get one cluster id per row.
labels = model.fit_predict(no_values)

## TSNE
Now let's plot the selected labels using t-SNE to see whether these groups overlap or are separated.

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Seed t-SNE so the embedding, and the scatter plot drawn from it, is the same
# on every run.
# NOTE: this rebinds `model`, replacing the KMeans instance created earlier.
model = TSNE(learning_rate=100, random_state=42)

In [None]:
# Embed the same value-free features that were used for clustering.
transformed = model.fit_transform(no_values)

In [None]:
# One colour per cluster id (0-3), looked up for every row's label.
cluster_colors = ('black', 'red', 'blue', 'green')
color_map = dict(enumerate(cluster_colors))
colors = [color_map[label] for label in labels]

# Scatter the first two embedding coordinates, coloured by cluster.
xs, ys = transformed[:, 0], transformed[:, 1]
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(xs, ys, c=colors, alpha=0.8)
plt.show()

Excellent! They are well separated, indicating that these groups are meaningful.

Now let's add the labels found with K-means back to the features dataframe.

In [None]:
# Attach each row's cluster id as a new column.
# NOTE(review): the labels come from a K-means fit on all rows, i.e. before
# the train/test split later in the notebook -- confirm that is acceptable.
features['kmeans_label'] = labels

In [None]:
# Check that the new kmeans_label column is present.
features.head()

In [None]:
# Appraised value per cluster, with the target house's value as a reference
# line. axhline spans the full axes width, so the line covers every strip
# and does not depend on how many clusters there are.
ax = sns.stripplot(x='kmeans_label', y='tot_appr_val', data=features)
ax.axhline(292707, color='red', label='Target House')
ax.legend(loc='upper right')

So it looks like the target house is part of a group, but even there it lands at the high end of the appraised values.

# Prepare the data for modeling
Now that we have selected the features (areas and fixtures), built a new feature (K-means labels), verified that there are no null values, and selected the appropriate subset of the samples for the value modeling, we are ready to create the dummy features for the categorical variable (K-means labels), split the data, and scale it.

## Create dummy features for kmeans label

In [None]:
# One-hot encode the cluster label; drop_first=True omits the first level's
# indicator column.
feat_dummy = pd.get_dummies(features, columns=['kmeans_label'], drop_first=True)

In [None]:
# Inspect the encoded frame.
feat_dummy.head()

## Split the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Predictors are every column except the two appraised-value columns; the
# target is the total appraised value.
value_columns = ['tot_appr_val', 'prior_tot_appr_val']
y = feat_dummy['tot_appr_val']
X = feat_dummy.drop(columns=value_columns)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Standardize data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler for the feature matrix.
scaler = StandardScaler()

In [None]:
# Fit on the training rows only, so the test rows do not influence the
# scaling.
scaler.fit(X_train)

In [None]:
# Apply the training-fitted scaler to both splits.
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Separate scaler dedicated to the target values.
y_scaler = StandardScaler()

In [None]:
# The scaler expects a 2-D array, so present the training target as a single
# column before fitting.
y_train_column = y_train.to_numpy().reshape(-1, 1)
y_scaler.fit(y_train_column)

In [None]:
# Scale both target splits with the training-fitted scaler, reshaping each to
# a single column first.
y_train_scaled, y_test_scaled = (
    y_scaler.transform(split.to_numpy().reshape(-1, 1))
    for split in (y_train, y_test)
)