<h1 id="title" style="color:white;background:black;">
    <br>
    <center>
        [TPS - Mar 2021] Basic EDA
    </center>
</h1>

# Import Libraries 📚

In [None]:
import numpy as np
import pandas as pd
from copy import deepcopy

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
def custom_palette(custom_colors):
    """Activate ``custom_colors`` as the seaborn palette and draw a preview strip.

    Parameters
    ----------
    custom_colors : sequence
        Color specifications accepted by ``sns.color_palette``; the callers in
        this notebook pass lists of hex strings.

    Returns
    -------
    None
        The function works through side effects only: it changes the active
        seaborn palette and renders a swatch figure.
    """
    palette = sns.color_palette(custom_colors)
    # set_palette returns None, so its result is intentionally not bound.
    sns.set_palette(palette)
    sns.palplot(palette, size=0.8)
    # Hide tick marks and labels so only the color swatches remain visible.
    plt.tick_params(axis='both', labelsize=0, length=0)

In [None]:
# Two five-color hex palettes that the plots below index into.
pink = [
    "#861388",
    "#E15A97",
    "#EEABC4",
    "#C799A6",
    "#4B2840",
]
blue = [
    "#C6EBBE",
    "#A9DBB8",
    "#5887FF",
    "#55C1FF",
    "#E9D2F4",
]

# Preview each palette in turn; `blue` is previewed last, so it is the palette
# left active for seaborn afterwards.
for _preview_colors in (pink, blue):
    custom_palette(_preview_colors)

# Reading the CSV Files 📚

In [None]:
# All competition files live in a single Kaggle input directory; naming it
# once means the notebook can be repointed by editing one line.
DATA_DIR = '/kaggle/input/tabular-playground-series-mar-2021'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Preview the first rows of each frame.
display(train_df.head())
display(test_df.head())
display(sample_submission.head())

# General Info 🔎

## Check Train & Test shape

In [None]:
# Report the (rows, columns) tuple of each split.
train_shape = train_df.shape
test_shape = test_df.shape
print('Rows and Columns in train dataset:', train_shape)
print('Rows and Columns in test dataset:', test_shape)

## Missing values

In [None]:
# Show the total number of missing cells in the train split, then the test split.
for _split_df in (train_df, test_df):
    _nulls_per_column = _split_df.isnull().sum()
    display(sum(_nulls_per_column))

# Basic EDA - Continuous Features

In [None]:
# Partition the training columns by substring: names containing 'cont' and
# names containing 'cat'.
_train_columns = list(train_df.columns)
cont_features = [name for name in _train_columns if 'cont' in name]
cat_features = [name for name in _train_columns if 'cat' in name]

## Train

In [None]:
# Per-feature KDE of every continuous column in the train split, with the
# target == 0 and target == 1 rows overlaid in the same panel.
num_cols = 2
# Ceil-divide so the grid always has room for every continuous feature.
num_rows = max(1, -(-len(cont_features) // num_cols))
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                       figsize=(16, 4 * num_rows), squeeze=False)
f.suptitle('Distribution of Continuous Features in Train Dataset', fontsize=16)

# The row masks do not depend on the feature, so build them once.
target_0_rows = train_df['target'] == 0
target_1_rows = train_df['target'] == 1

for index, column in enumerate(cont_features):
    ax = axes[index // num_cols, index % num_cols]
    # `fill=True` is the current spelling of the deprecated `shade=True`.
    sns.kdeplot(train_df.loc[target_0_rows, column], color=pink[2],
                fill=True, label='target: 0', ax=ax)
    sns.kdeplot(train_df.loc[target_1_rows, column], color=blue[3],
                fill=True, label='target: 1', ax=ax)
    # Without a legend the two overlaid densities cannot be told apart.
    ax.legend()

# Drop whichever trailing panels the features did not fill.
for unused_ax in axes.ravel()[len(cont_features):]:
    f.delaxes(unused_ax)
plt.tight_layout()
plt.show()

## Test

In [None]:
# Per-feature KDE of every continuous column in the test split.
num_cols = 2
# Ceil-divide so the grid always has room for every continuous feature.
num_rows = max(1, -(-len(cont_features) // num_cols))
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                       figsize=(16, 4 * num_rows), squeeze=False)
f.suptitle('Distribution of Continuous Features in Test Dataset', fontsize=16)

for index, column in enumerate(cont_features):
    ax = axes[index // num_cols, index % num_cols]
    # `fill=True` is the current spelling of the deprecated `shade=True`.
    sns.kdeplot(test_df[column], color=pink[1], fill=True, ax=ax)

# Drop whichever trailing panels the features did not fill.
for unused_ax in axes.ravel()[len(cont_features):]:
    f.delaxes(unused_ax)
plt.tight_layout()
plt.show()

## Correlation for Continuous Features in Train

In [None]:
# Absolute pairwise correlation between the continuous features, so the color
# scale reflects strength of association regardless of sign.
corr = train_df[cont_features].corr().abs()
# Hide the upper triangle and the diagonal: the matrix is symmetric, so those
# cells only repeat information. The builtin `bool` is used because the
# `np.bool` alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 14))

# Draw on the axes created above instead of relying on the implicit
# "current axes" state.
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1, ax=ax)

# Keep the y tick labels horizontal so feature names stay readable.
plt.yticks(rotation=0)
plt.show()

## Correlation of Continuous Features with Target

In [None]:
# Correlation of each continuous feature with the target, stored as a
# one-column frame named 'corr' and indexed by feature name.
target_corr = train_df[cont_features].corrwith(train_df['target'])
chart_df = target_corr.to_frame(name='corr')

# One bar per feature; the feature names come from the index.
fig = px.bar(
    chart_df['corr'],
    title='Correlation of Continuous Features with Target',
    y="corr",
)

fig.show()

# Basic EDA - Categorical Features

## Train

In [None]:
# Stacked bar chart per categorical feature in the train split: counts of the
# most frequent categories among target == 0 rows and among target == 1 rows.
train_0_df = train_df.loc[train_df['target'] == 0]
train_1_df = train_df.loc[train_df['target'] == 1]

# Cap on how many categories are drawn per feature and per target value.
top_n = 10

num_cols = 2
# Ceil-divide so the grid always has room for every categorical feature.
num_rows = max(1, -(-len(cat_features) // num_cols))
fig = make_subplots(rows=num_rows, cols=num_cols)

for index, column in enumerate(cat_features):
    # Plotly subplot coordinates are 1-based.
    i, j = (index // num_cols) + 1, (index % num_cols) + 1
    for subset, bar_color, trace_name in (
        (train_0_df, pink[1], 'target: 0'),
        (train_1_df, blue[3], 'target: 1'),
    ):
        # The top-N list is computed separately per target value, so the two
        # traces of one panel may not cover exactly the same categories.
        data = (
            subset.groupby(column)[column]
            .count()
            .sort_values(ascending=False)
            .head(top_n)
        )
        fig.add_trace(go.Bar(
            x=data.index,
            y=data.values,
            marker_color=bar_color,
            name=trace_name,
        ), row=i, col=j)

    fig.update_xaxes(title=column, row=i, col=j)

# Figure-wide layout is loop-invariant, so it is applied once here.
fig.update_layout(
    title='Distribution of Categorical Features in Train Dataset',
    barmode='stack',
    autosize=False,
    width=1200,
    height=160 * num_rows,
    showlegend=False,
)
fig.show()

## Test

In [None]:
# Bar chart per categorical feature in the test split: counts of the most
# frequent categories.
# Cap on how many categories are drawn per feature.
top_n = 10

num_cols = 2
# Ceil-divide so the grid always has room for every categorical feature.
num_rows = max(1, -(-len(cat_features) // num_cols))
fig = make_subplots(rows=num_rows, cols=num_cols)

for index, column in enumerate(cat_features):
    # Plotly subplot coordinates are 1-based.
    i, j = (index // num_cols) + 1, (index % num_cols) + 1
    # Count from test_df: this section describes the test split, so the
    # counts must not come from a training subset.
    data = (
        test_df.groupby(column)[column]
        .count()
        .sort_values(ascending=False)
        .head(top_n)
    )
    fig.add_trace(go.Bar(
        x=data.index,
        y=data.values,
        marker_color=blue[2],
    ), row=i, col=j)

    fig.update_xaxes(title=column, row=i, col=j)

# Figure-wide layout is loop-invariant, so it is applied once here.
fig.update_layout(
    title='Distribution of Categorical Features in Test Dataset',
    barmode='stack',
    autosize=False,
    width=1200,
    height=160 * num_rows,
    showlegend=False,
)
fig.show()

# References
- https://www.kaggle.com/ruchi798/tps-march-2021-eda-rapids
- https://www.kaggle.com/andreshg/tps-march-a-complete-study#2.-Feature-Engineering-%F0%9F%94%A7
- https://www.kaggle.com/dwin183287/tps-mar-2021-eda-models