In [13]:
import numpy as np
import pandas as pd
import pickle

from scipy.spatial.distance import cdist
from mpl_toolkits.mplot3d.axes3d import Axes3D

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [15]:
# Notebook-wide display settings: a 12pt base font for matplotlib figures and
# five decimal places for every float that pandas renders.
matplotlib.rcParams['font.size'] = 12
pd.options.display.float_format = lambda value: '%.5f' % value

In [None]:
def apply_elbow_method(X, k_range=range(2, 15)):
    """Plot the mean distance to the nearest centroid against the number of clusters.

    For every candidate ``k`` a ``KMeans(n_clusters=k, random_state=42)`` model
    is fitted on ``X``. The distortion recorded for that ``k`` is the Euclidean
    distance from each sample to its closest cluster centre, averaged over all
    samples. The curve is drawn on a new 10x6 figure so the "elbow" can be read
    off by eye.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster. It is passed unchanged to ``KMeans.fit`` and to
        ``scipy.spatial.distance.cdist``.
    k_range : iterable of int, default ``range(2, 15)``
        Candidate numbers of clusters, evaluated and plotted in iteration order.

    Returns
    -------
    None
        The function only draws a figure.
    """
    # Materialise once so a one-shot iterable can be both looped over and plotted.
    candidate_ks = list(k_range)

    distortions = []
    for k in candidate_ks:
        model = KMeans(n_clusters=k, random_state=42).fit(X)
        # Row-wise minimum = distance from each sample to its closest centre.
        nearest_centre_dist = cdist(X, model.cluster_centers_, 'euclidean').min(axis=1)
        distortions.append(nearest_centre_dist.mean())

    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')


def display_clusters_distribution(unique_labels, labels_counts):
    """Draw a bar chart with one bar per cluster label.

    Parameters
    ----------
    unique_labels : sequence
        Cluster labels; used as bar positions and as the x-axis tick locations.
    labels_counts : sequence
        Bar heights, one per entry of ``unique_labels`` (same order).

    Returns
    -------
    None
        The function only draws a new 8x5 figure.
    """
    plt.figure(figsize=(8, 5))

    # Use the function's own arguments; the previous body read the unrelated
    # names ``unique`` / ``counts`` and so depended on leftover kernel globals.
    plt.bar(unique_labels, labels_counts)

    plt.xlabel('Clusters')
    plt.xticks(unique_labels)
    plt.ylabel('Count')
    plt.title('Clusters distribution')


def reduce_dims_to_2D_space_with_PCA(df):
    """Project ``df`` onto two PCA components.

    A fresh ``PCA(n_components=2)`` is fitted on ``df``. The transformed
    coordinates come back as a new DataFrame with columns ``component_1`` and
    ``component_2`` and a default integer index (the index of ``df`` is not
    carried over).
    """
    projected = PCA(n_components=2).fit_transform(df)
    column_names = ['component_1', 'component_2']
    return pd.DataFrame(projected, columns=column_names)


def reduce_dims_to_3D_space_with_PCA(df):
    """Project ``df`` onto three PCA components.

    A fresh ``PCA(n_components=3)`` is fitted on ``df``. The transformed
    coordinates come back as a new DataFrame with columns ``component_1``,
    ``component_2`` and ``component_3`` and a default integer index (the index
    of ``df`` is not carried over).
    """
    projected = PCA(n_components=3).fit_transform(df)
    column_names = ['component_1', 'component_2', 'component_3']
    return pd.DataFrame(projected, columns=column_names)


def reduce_dims_to_2D_space_with_TSNE(df):
    """Embed ``df`` into two dimensions with t-SNE.

    A fresh ``TSNE(n_components=2, random_state=42)`` is fitted on ``df``. The
    embedding comes back as a new DataFrame with columns ``component_1`` and
    ``component_2`` and a default integer index (the index of ``df`` is not
    carried over).
    """
    embedding = TSNE(n_components=2, random_state=42).fit_transform(df)
    column_names = ['component_1', 'component_2']
    return pd.DataFrame(embedding, columns=column_names)


def reduce_dims_to_3D_space_with_TSNE(df):
    """Embed ``df`` into three dimensions with t-SNE.

    A fresh ``TSNE(n_components=3, random_state=42)`` is fitted on ``df``. The
    embedding comes back as a new DataFrame with columns ``component_1``,
    ``component_2`` and ``component_3`` and a default integer index (the index
    of ``df`` is not carried over).
    """
    embedding = TSNE(n_components=3, random_state=42).fit_transform(df)
    column_names = ['component_1', 'component_2', 'component_3']
    return pd.DataFrame(embedding, columns=column_names)


def display_components_in_3D_space(components_df, labels=None):
    """Scatter-plot three components in 3D, optionally coloured by ``labels``.

    Parameters
    ----------
    components_df : pandas.DataFrame
        Must contain the columns ``component_1``, ``component_2`` and
        ``component_3``; they become the x, y and z coordinates.
    labels : pandas.Series, optional
        Passed to the scatter as the colour values ``c``. When omitted the
        points are drawn in the default colour and no colorbar is added.

    Returns
    -------
    None
        The function only draws a new 12x8 figure.
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')

    scatter_options = {'alpha': 0.5}
    if labels is not None:
        n_unique = labels.nunique()
        # Up to 10 distinct labels get one discrete 'jet' colour each; with more
        # than that the continuous 'jet' colormap is used.
        colormap = plt.get_cmap('jet') if n_unique > 10 else plt.get_cmap('jet', n_unique)
        scatter_options.update(c=labels, cmap=colormap)

    # Coordinates are read straight from components_df: the labels are handed
    # to matplotlib separately, so joining them onto the frame added nothing and
    # could introduce NaN rows when the two indexes differ.
    points = ax.scatter(components_df['component_1'],
                        components_df['component_2'],
                        components_df['component_3'],
                        **scatter_options)

    ax.set_xlabel('component_1')
    ax.set_ylabel('component_2')
    ax.set_zlabel('component_3')
    ax.set_title('3D mapping of objects')

    # Without labels the scatter carries no colour data, so a colorbar would
    # not describe anything.
    if labels is not None:
        fig.colorbar(points)


def display_components_in_2D_space(components_df, labels=None):
    """Scatter-plot two components in 2D, optionally coloured by ``labels``.

    Parameters
    ----------
    components_df : pandas.DataFrame
        Must contain the columns ``component_1`` and ``component_2``.
    labels : pandas.Series, optional
        Joined to ``components_df`` column-wise (aligned on the index) and used
        as the colour column. An unnamed Series is given the column name
        ``'labels'`` so that it can still be referenced for colouring.

    Returns
    -------
    None
        The function only draws a new 12x6 figure.
    """
    plot_options = {'kind': 'scatter', 'x': 'component_1', 'y': 'component_2',
                    'alpha': 0.5, 'figsize': (12, 6)}

    if labels is None:
        plot_frame = components_df
    else:
        # The colour column is referenced by name, so an unnamed Series
        # (name is None) would silently switch colouring off; give it a name.
        if labels.name is None:
            color_column = 'labels'
            named_labels = labels.rename(color_column)
        else:
            color_column = labels.name
            named_labels = labels
        plot_frame = pd.concat([components_df, named_labels], axis=1)

        n_unique = labels.nunique()
        # Up to 10 distinct labels get one discrete 'jet' colour each; with more
        # than that the continuous 'jet' colormap is used.
        colormap = plt.get_cmap('jet') if n_unique > 10 else plt.get_cmap('jet', n_unique)
        plot_options.update(c=color_column, cmap=colormap, sharex=False)

    ax = plot_frame.plot(**plot_options)

    ax.set_xlabel('component_1')
    ax.set_ylabel('component_2')
    ax.set_title('2D mapping of objects')


def _plot_true_vs_predicted(position, title, true_values, pred_values, diagonal_limit):
    """Draw one 'true vs predicted' scatter panel with a dashed reference diagonal."""
    plt.subplot(position)
    sns.scatterplot(x=pred_values, y=true_values)
    # Points on the y = x line are perfect predictions.
    plt.plot([0, diagonal_limit], [0, diagonal_limit], linestyle='--', color='black')
    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title(title)


def evaluate_preds(true_values_train, pred_values_train, true_values_test, pred_values_test,
                   diagonal_limit=500000):
    """Print R2 / MAE for train and test and plot true vs predicted values.

    Parameters
    ----------
    true_values_train, pred_values_train : array-like
        Ground-truth and predicted targets for the training split.
    true_values_test, pred_values_test : array-like
        Ground-truth and predicted targets for the test split.
    diagonal_limit : number, default 500000
        Upper end of the dashed ``y = x`` reference line drawn from the origin
        in both panels. The default keeps the previously hard-coded value.

    Returns
    -------
    None
        Metrics go to stdout (R2 rounded to 4 decimals, MAE to 3) and the two
        scatter panels are drawn side by side on a new 16x6 figure.
    """
    r2_train = round(r2_score(true_values_train, pred_values_train), 4)
    mae_train = round(mean_absolute_error(true_values_train, pred_values_train), 3)
    r2_test = round(r2_score(true_values_test, pred_values_test), 4)
    mae_test = round(mean_absolute_error(true_values_test, pred_values_test), 3)

    print(f'Train:\tR2 = {r2_train}\tMAE = {mae_train}\n'
          f'Test:\tR2 = {r2_test}\tMAE = {mae_test}')

    plt.figure(figsize=(16, 6))
    _plot_true_vs_predicted(121, 'Train: True vs Predicted values',
                            true_values_train, pred_values_train, diagonal_limit)
    _plot_true_vs_predicted(122, 'Test: True vs Predicted values',
                            true_values_test, pred_values_test, diagonal_limit)

# Relative path to the training dataset CSV. The code that loads it is not
# part of this section of the notebook.
TRAIN_DATASET_PATH = '../data/housing_prepared.csv'
# Alternative dataset file, currently disabled (uncomment to switch):
#TRAIN_DATASET_PATH = '../data/housing_train.csv'