<h1 id="title" style="color:white;background:black;">
    <br>
    <center>
        [TPS - June 2021] Basic EDA
    </center>
</h1>

This notebook was inspired by [[TPS-May] Categorical EDA](https://www.kaggle.com/subinium/tps-may-categorical-eda) and simply applies the same default settings used in that notebook.

If you like this, check out the original notebook as well.

# Import Libraries 📚

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Default Setting

- https://www.kaggle.com/subinium/dark-mode-visualization-apple-version

In [None]:
from cycler import cycler


# Colour palettes as 0-255 RGB triples; they are rescaled to the 0-1 range
# that matplotlib expects further down.
raw_light_palette = [
    (0, 122, 255), # Blue
    (255, 149, 0), # Orange
    (52, 199, 89), # Green
    (255, 59, 48), # Red
    (175, 82, 222),# Purple
    (255, 45, 85), # Pink
    (88, 86, 214), # Indigo
    (90, 200, 250),# Teal
    (255, 204, 0)  # Yellow
]

raw_dark_palette = [
    (10, 132, 255), # Blue
    (255, 159, 10), # Orange
    (48, 209, 88),  # Green
    (255, 69, 58),  # Red
    (191, 90, 242), # Purple
    (94, 92, 230),  # Indigo
    (255, 55, 95),  # Pink
    (100, 210, 255),# Teal
    (255, 214, 10)  # Yellow
]

raw_gray_light_palette = [
    (142, 142, 147),# Gray
    (174, 174, 178),# Gray (2)
    (199, 199, 204),# Gray (3)
    (209, 209, 214),# Gray (4)
    (229, 229, 234),# Gray (5)
    (242, 242, 247),# Gray (6)
]

raw_gray_dark_palette = [
    (142, 142, 147),# Gray
    (99, 99, 102),  # Gray (2)
    (72, 72, 74),   # Gray (3)
    (58, 58, 60),   # Gray (4)
    (44, 44, 46),   # Gray (5)
    (28, 28, 30),   # Gray (6) -- was (28, 28, 39); every other gray here is near-neutral
]


# Rescale every palette from 0-255 integers to 0-1 floats.
light_palette = np.array(raw_light_palette)/255
dark_palette = np.array(raw_dark_palette)/255
gray_light_palette = np.array(raw_gray_light_palette)/255
gray_dark_palette = np.array(raw_gray_dark_palette)/255

# Second-darkest gray is the background; second-lightest gray is the foreground.
background_color = gray_dark_palette[-2]
white_color = gray_light_palette[-2]

mpl.rcParams.update({
    # Line/bar colours cycle through the dark palette.
    'axes.prop_cycle': cycler('color', dark_palette),
    # Dark background for the figure and the axes.
    'figure.facecolor': background_color,
    'figure.edgecolor': background_color,
    'axes.facecolor': background_color,
    # Light foreground for text, labels, spines and ticks.
    'text.color': white_color,
    'axes.labelcolor': white_color,
    'axes.edgecolor': white_color,
    'xtick.color': white_color,
    'ytick.color': white_color,
    # Render figures at a higher resolution.
    'figure.dpi': 200,
    # Hide the top and right spines.
    'axes.spines.top': False,
    'axes.spines.right': False,
})

In [None]:
def custom_palette(custom_colors):
    """Make ``custom_colors`` the seaborn default palette and preview it.

    Parameters
    ----------
    custom_colors : sequence of colors
        Anything accepted by ``sns.color_palette``.

    Returns
    -------
    None
        The function only has side effects: it sets the global seaborn
        palette and draws a swatch strip on a new figure.
    """
    # Build the palette once and reuse it for both calls.
    palette = sns.color_palette(custom_colors)
    sns.set_palette(palette)  # returns None, so there is nothing to keep
    sns.palplot(palette, size=0.8)
    # Hide tick marks and tick labels on the swatch strip.
    plt.tick_params(axis='both', labelsize=0, length=0)

In [None]:
# Set the dark palette as seaborn's default and draw a preview of its swatches.
custom_palette(dark_palette)

# Reading the csv📚

In [None]:
# Single place to change if the competition data lives somewhere else.
DATA_DIR = '../input/tabular-playground-series-jun-2021'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Preview the first rows of each table.
display(train_df.head(), test_df.head(), sample_submission.head())

# General Info.🃏

## Check Train & Test shape

In [None]:
# Report the (rows, columns) shape of each split.
shape_report = {
    'Rows and Columns in train dataset:': train_df.shape,
    'Rows and Columns in test dataset:': test_df.shape,
}
for label, shape in shape_report.items():
    print(label, shape)

## Missing values

In [None]:
# Total number of missing cells in each split: per-column counts summed
# into a single number.
for split_df in (train_df, test_df):
    display(split_df.isnull().sum().sum())

# Basic EDA 🏕️

In [None]:
# Column names selected by substring: predictors contain 'feature',
# submission class columns contain 'Class'.
features = list(filter(lambda column: 'feature' in column, train_df.columns))
target_class = list(filter(lambda column: 'Class' in column, sample_submission.columns))

## Target Distribution

In [None]:
# Show the classes in sorted order rather than order of appearance.
class_order = sorted(train_df['target'].unique())

fig, ax = plt.subplots()
sns.countplot(data=train_df, x='target', order=class_order, ax=ax)
ax.set(ylim=(0, 63000))
ax.set_title('Target Distribution', weight='bold')
plt.show()

## Train - Describe()

In [None]:
# One row per column: summary statistics with an in-cell bar for the mean
# and colour gradients for the standard deviation and the median.
train_summary = train_df.describe().T
(
    train_summary.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

## Test - Describe()

In [None]:
# One row per column: summary statistics with an in-cell bar for the mean
# and colour gradients for the standard deviation and the median.
test_summary = test_df.describe().T
(
    test_summary.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

## Check Statistics in train and test

In [None]:
def diff_color(x):
    """Return a CSS ``color`` declaration reflecting the sign of ``x``.

    Negative values map to red, positive values to green, and everything
    else (zero, or a value that compares neither below nor above zero)
    to black.
    """
    if x < 0:
        color = 'red'
    elif x > 0:
        color = 'green'
    else:
        color = 'black'
    return f'color: {color}'

# Element-wise difference between the train and test summary statistics.
stat_gap = train_df.describe() - test_df.describe()
# Keep only the columns that exist in the test set, put one column per row,
# then drop the first row and the first statistic
# (presumably the id row and the count column -- confirm against the data).
stat_gap = stat_gap[test_df.columns].T.iloc[1:, 1:]
(
    stat_gap.style
    .bar(subset=['mean', 'std'], align='mid', color=['#d65f5f', '#5fba7d'])
    .applymap(diff_color, subset=['min', 'max'])
)

In [None]:
# Number of distinct values per feature in each split.
feature_cols = [f'feature_{i}' for i in range(75)]
y = train_df[feature_cols].nunique().to_numpy()
y2 = test_df[feature_cols].nunique().to_numpy()
comp = y - y2

# Split the difference by sign so each part can be drawn in its own colour.
surplus = np.where(comp > 0, comp, 0)   # train has more distinct values
deficit = np.where(comp < 0, comp, 0)   # train has fewer distinct values (negative heights)

fig, ax = plt.subplots(1, 1, figsize=(12, 6))
positions = range(75)

ax.bar(positions, y2, alpha=0.7, color=gray_dark_palette[0], label='Test Dataset')
ax.bar(positions, surplus, bottom=y2, color=dark_palette[2], alpha=0.7, label='Train > Test')
ax.bar(positions, deficit, bottom=y2 - deficit, color=dark_palette[3], alpha=0.7, label='Train < Test')

ax.set_yticks(range(0, 80, 5))
ax.margins(0.02)
ax.grid(axis='y', linestyle='--', zorder=5)
ax.set_title('# of Features Unique Values (Train/Test)', loc='left', fontweight='bold')
ax.legend()
plt.show()

In [None]:
# Distinct-value count per feature, first in column order and then ascending.
y = [train_df[f'feature_{i}'].nunique() for i in range(75)]
raw_counts = list(y)
y.sort()

fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Left panel: column order. Right panel: sorted counts.
for ax, counts in zip(axes, (raw_counts, y)):
    ax.bar(range(75), counts, zorder=10)
    ax.set_yticks(range(0, 80, 5))
    ax.margins(0.02)
    ax.grid(axis='y', linestyle='--', zorder=5)

axes[0].set_title('# of Features Unique Values (Raw)', loc='left', fontweight='bold')
axes[1].set_title('# of Features Unique Values (Sorted)', loc='left', fontweight='bold')

plt.show()

In [None]:
# One small KDE panel per feature, split by target class.
n_features = 75
n_cols = 4
n_rows = -(-n_features // n_cols)  # ceiling division: 19 rows for 75 features

fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 16))
panels = axes.flatten()

for idx, ax in zip(range(n_features), panels):
    sns.kdeplot(x=f'feature_{idx}',
                hue='target', hue_order=target_class,
                data=train_df,
                alpha=0.5,
                linewidth=0.6, fill=True,
                legend=False,
                ax=ax)

    # Strip ticks and labels; the panels are only meant to show shape.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    # The number of distinct values is shown in the panel title.
    cnt = train_df[f'feature_{idx}'].nunique()
    ax.set_title(f'Feature_{idx}({cnt})', loc='right', weight='bold', fontsize=11)

# Switch off only the panels that received no feature. Hard-coding the last
# two panels also stripped the axis decorations from feature_74's panel.
for spare_ax in panels[n_features:]:
    spare_ax.axis('off')

fig.supxlabel('Distribution by class (by feature)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
# Percentage of rows equal to zero for every feature column, reversed so that
# feature_0 ends up at the top of the horizontal bar chart.
# Selecting by name avoids the positional slice iloc[:, :75], which picked up
# the leading `id` column and left out the last feature.
zero_data = ((train_df[features] == 0).sum() / len(train_df) * 100)[::-1]
fig, ax = plt.subplots(1,1,figsize=(10, 19))

# Light full-width bars act as the 100 % reference behind the actual values.
ax.barh(zero_data.index, 100, color='#dadada', height=0.6)
barh = ax.barh(zero_data.index, zero_data, color=light_palette[1], height=0.6)
ax.bar_label(barh, fmt='%.01f %%', color='black')
ax.spines[['left', 'bottom']].set_visible(False)

ax.set_xticks([])

# The bars show percentages, so the title says "%" rather than "#".
ax.set_title('% of Zeros (by feature)', loc='center', fontweight='bold', fontsize=15)
plt.show()

In [None]:
# One small bar chart per feature showing the per-class mean.
n_features = 75
n_cols = 4
# Ceiling division: 19 rows for 75 features. The earlier 13x4 grid only had
# 52 panels, so features 52-74 were never drawn.
n_rows = -(-n_features // n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 16))
panels = axes.flatten()

mean = train_df.groupby('target').mean().sort_index()
# One colour per class instead of a fixed four that would repeat.
class_colors = dark_palette[:len(mean)]

for idx, ax in zip(range(n_features), panels):
    ax.bar(mean[f'feature_{idx}'].index, mean[f'feature_{idx}'],
           color=class_colors, width=0.6)
    # Strip ticks and labels; the panels are only meant to show shape.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.margins(0.1)
    ax.spines['left'].set_visible(False)
    ax.set_title(f'Feature_{idx}', loc='right', weight='bold', fontsize=11)

# Switch off only the panels that received no feature.
for spare_ax in panels[n_features:]:
    spare_ax.axis('off')

fig.supxlabel('Average by class (by feature)', ha='center', fontweight='bold')

fig.tight_layout()
plt.show()

## Correlation

In [None]:
# Absolute pairwise correlation between the feature columns.
corr = train_df[features].corr().abs()
# Mask the upper triangle (including the diagonal) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 14))

# Plot the heatmap of the lower triangle.
sns.heatmap(corr,
            square=True, center=0, linewidth=0.2,
            cmap=sns.diverging_palette(240, 10, as_cmap=True),
            mask=mask, ax=ax)

ax.set_title('Correlation of features', loc='left', fontweight='bold')
plt.show()

Most features are only weakly correlated with each other: `0.14` or less.

# Class distribution visualization with t-SNE

In [None]:
%%time
# Unpack a pre-packaged RAPIDS environment and make it importable from this kernel.
import sys
# Copy the RAPIDS archive from the attached input dataset into the conda envs directory.
!cp ../input/rapids/rapids.0.19.0 /opt/conda/envs/rapids.tar.gz
# Extract it in place; tar's verbose file listing is discarded.
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the extracted directories to sys.path so they are searched before
# the default locations. Each line prepends, so the last one ends up first.
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
# Copy the environment's libxgboost.so into the main conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
# cuML is imported here rather than at the top of the notebook because the
# RAPIDS directories are only added to sys.path by the cell above.
import cuml

from cuml.manifold import TSNE
# NOTE(review): tsne_sklearn is not referenced by any visible cell -- confirm whether it is still needed.
import sklearn.manifold as tsne_sklearn
print('cuML version',cuml.__version__)

In [None]:
# Integer class id taken from the part of each target string after the first
# underscore.
y = train_df['target'].str.split('_').str[1].astype(int).to_numpy()
# Drop the non-feature columns in place. This cell is not re-runnable:
# after it has run, `target` no longer exists in train_df.
train_df.drop(columns=['id', 'target'], inplace=True)

In [None]:
%%time
tsne = TSNE(n_components=2, perplexity=30, random_state=2021)
train_2D_rapids = tsne.fit_transform( train_df.values )

In [None]:
# 2-D embedding, one point per training row, coloured by class id.
fig, ax = plt.subplots()
ax.scatter(train_2D_rapids[:, 0], train_2D_rapids[:, 1], c=y, s=0.5)
# A title lets the figure stand on its own; plt.show() keeps the artist
# repr out of the cell output.
ax.set_title('t-SNE embedding of the training set (perplexity: 30)')
plt.show()

In [None]:
# Repeat the embedding for several perplexity values to see how sensitive
# the picture is to that setting.
for i in range(5, 50, 5):
    # Same seed as the single run above so the sweep is reproducible.
    tsne = TSNE(n_components=2, perplexity=i, random_state=2021)
    # Pass the raw array, as the single run above does.
    train_2D = tsne.fit_transform(train_df.values)

    fig, ax = plt.subplots()
    ax.set_title(f"perplexity: {i}")
    ax.scatter(train_2D[:, 0], train_2D[:, 1], c=y, s=0.5)
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

The class distribution seems very mixed.

# References
- https://www.kaggle.com/subinium/tps-may-categorical-eda
- https://www.kaggle.com/titericz/t-sne-visualization-with-rapids