## Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from ydata_profiling import ProfileReport
from sklearn.preprocessing import OrdinalEncoder
from scipy import stats

## Load data

In [None]:
# Load the raw dataset; the relative path is resolved against the notebook's
# working directory.
DATA = pd.read_csv('data/input.csv')

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Build a ydata-profiling report over the whole dataset; leaving `profile` as
# the cell's last expression displays it.
profile = ProfileReport(DATA, title="report")
profile

In [None]:
# Split the column names by dtype. The label column 'y' is left out of the
# categorical list; numerical_cols stays the pandas Index that select_dtypes
# returns.
numerical_cols = DATA.select_dtypes(include=['number']).columns
categorical_cols = [
    name
    for name in DATA.select_dtypes(include=['object', 'category']).columns
    if name != 'y'
]

In [None]:
# List the categorical feature names on a single line.
print(f"Columns categorical: {', '.join(categorical_cols)}")

In [None]:
# List the numerical feature names on a single line.
print('Columns numerical: ' + ', '.join(numerical_cols))

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
DATA[numerical_cols].describe()

In [None]:
def create_corr_matrix(data):
    """Draw the pairwise correlation matrix of ``data`` as an annotated heatmap.

    Correlations come from ``data.corr()`` with its default settings. Each
    cell is coloured with the 'coolwarm' colormap and labelled with its
    coefficient rounded to two decimals.

    Args:
        data: DataFrame whose columns are correlated against each other.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    corr = data.corr()
    labels = corr.columns
    ticks = np.arange(len(labels))

    fig, ax = plt.subplots()
    im = ax.imshow(corr, cmap='coolwarm')

    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    # Slant the x labels so long column names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # imshow puts matrix row i on the y axis and column j on the x axis, so
    # the annotation for corr[i, j] goes at (x=j, y=i).
    for (i, j), value in np.ndenumerate(corr.to_numpy()):
        ax.text(j, i, round(value, 2), ha="center", va="center", color="w")

    fig.colorbar(im, ax=ax)
    plt.show()

# Correlation heatmap restricted to the numeric columns.
create_corr_matrix(DATA[numerical_cols])

In [None]:
# One figure per numeric column, with the distribution coloured by the label
# column 'y'.
for c in numerical_cols:
    # kde=False: no density curve is overlaid on the distribution plot.
    sns.displot(DATA, x=c, hue="y", kde=False, height=5, aspect=2)
    plt.show()

In [None]:
def plot_dim_reduced(x, reductor, title, y_label):
    """Scatter-plot a 2-D projection of ``x`` coloured by the label column.

    Args:
        x: DataFrame of numeric features that also contains the ``y_label``
            column; that column is dropped before fitting.
        reductor: Object exposing ``fit_transform`` that yields two
            components per row (e.g. ``PCA(n_components=2)``).
        title: Prefix for the two component column names, which also become
            the axis labels (``f'{title}1'`` and ``f'{title}2'``).
        y_label: Name of the label column, used both to drop it from ``x``
            and as the hue of the plot.

    Returns:
        None. The figure is shown via ``plt.show()``.

    Note:
        Labels are read from the module-level ``DATA`` frame rather than from
        ``x``, so the legend shows the values stored there. ``x`` is expected
        to hold the same rows, in the same order, as ``DATA``.
    """
    first_axis, second_axis = f'{title}1', f'{title}2'
    features = x.drop(y_label, axis=1)
    projected = pd.DataFrame(
        reductor.fit_transform(features),
        columns=[first_axis, second_axis],
    )
    # Store the labels under y_label (not a hard-coded 'y') so hue=y_label
    # resolves for any label name; to_numpy() assigns by position instead of
    # relying on index alignment between the two frames.
    projected[y_label] = DATA[y_label].to_numpy()
    sns.lmplot(data=projected, x=first_axis, y=second_axis, hue=y_label, fit_reg=False)
    plt.show()

# Ordinal-encode every column so the reducers receive a fully numeric frame.
# NOTE(review): OrdinalEncoder is applied to the numeric columns as well, which
# replaces their values with category indices — confirm this is intended.
all_numerical_data = pd.DataFrame(OrdinalEncoder().fit_transform(DATA), columns=DATA.columns)
plot_dim_reduced(all_numerical_data, PCA(n_components=2), 'PCA', 'y')
# Title the t-SNE projection as such; it was previously labelled 'PCA'.
plot_dim_reduced(all_numerical_data, TSNE(n_components=2, random_state=0), 'TSNE', 'y')

In [None]:
# Cumulative explained variance of the first 8 principal components.
# NOTE(review): all_numerical_data still contains the encoded 'y' column here,
# whereas plot_dim_reduced drops it before fitting — confirm that is intended.
pca = PCA(n_components=8).fit(all_numerical_data)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against 1..k so the x position matches the 'number of components'
# label; the default 0-based index placed the first component at x=0.
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.gcf().set_size_inches(7, 5)

In [None]:
# Numeric feature columns plus an ordinal-encoded version of the label.
# .copy() makes n_df an independent frame before a column is added to it.
n_df = DATA.select_dtypes(include='number').copy()
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# The encoder is still fitted on the whole frame, but the label is selected by
# name instead of assuming 'y' is the last column.
ys = encoder.fit_transform(DATA)[:, DATA.columns.get_loc('y')]
n_df['y'] = ys

In [None]:
# Display the numeric frame together with the encoded label column.
n_df

In [None]:
# Pairwise plots of the columns in n_df, coloured by the encoded label 'y'.
sns.pairplot(n_df, hue='y')

In [None]:
# Outlier analysis
n_df_no_y = n_df.drop(['y'], axis=1)
# Absolute z-score of every cell. Wrap the result explicitly so z is a
# DataFrame aligned with n_df_no_y whether zscore hands back an ndarray or a
# DataFrame; the n_df_no_y[z > 2] lookups in the following cells rely on that.
z = pd.DataFrame(
    np.abs(stats.zscore(n_df_no_y)),
    index=n_df_no_y.index,
    columns=n_df_no_y.columns,
)
# Per column: how many values lie more than 2 standard deviations from the mean.
(z > 2).sum()

In [None]:
# 'Age' values with |z| > 2, sorted ascending. Masking with the boolean frame
# leaves NaN in every other cell, hence the dropna().
n_df_no_y[z > 2]['Age'].dropna().sort_values()

In [None]:
# 'Age' values with |z| <= 2, sorted ascending, for comparison with the
# flagged values; dropna() removes the cells the boolean mask blanked out.
n_df_no_y[z <= 2]['Age'].dropna().sort_values()

In [None]:
# 'FCVC' values with |z| > 2, sorted ascending. Masking with the boolean frame
# leaves NaN in every other cell, hence the dropna().
n_df_no_y[z > 2]['FCVC'].dropna().sort_values()

In [None]:
# 'FCVC' values with |z| <= 2, sorted ascending, for comparison with the
# flagged values; dropna() removes the cells the boolean mask blanked out.
n_df_no_y[z <= 2]['FCVC'].dropna().sort_values()

In [None]:
# 'NCP' values with |z| > 2, sorted ascending. Masking with the boolean frame
# leaves NaN in every other cell, hence the dropna().
n_df_no_y[z > 2]['NCP'].dropna().sort_values()

In [None]:
# 'NCP' values with |z| <= 2, sorted ascending, for comparison with the
# flagged values; dropna() removes the cells the boolean mask blanked out.
n_df_no_y[z <= 2]['NCP'].dropna().sort_values()

In [None]:
# 'FAF' values with |z| > 2, sorted ascending. Masking with the boolean frame
# leaves NaN in every other cell, hence the dropna().
n_df_no_y[z > 2]['FAF'].dropna().sort_values()

In [None]:
# 'FAF' values with |z| <= 2, sorted ascending, for comparison with the
# flagged values; dropna() removes the cells the boolean mask blanked out.
n_df_no_y[z <= 2]['FAF'].dropna().sort_values()

In [None]:
# 'TUE' values with |z| > 2, sorted ascending. Masking with the boolean frame
# leaves NaN in every other cell, hence the dropna().
n_df_no_y[z > 2]['TUE'].dropna().sort_values()

In [None]:
# 'TUE' values with |z| <= 2, sorted ascending, for comparison with the
# flagged values; dropna() removes the cells the boolean mask blanked out.
n_df_no_y[z <= 2]['TUE'].dropna().sort_values()