# <font color='	#2E45B8'> 1. Data Preparation</font>
## <font color='	#2E45B8'>  1.1 Import libraries and data</font>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read train.csv from the Tabular Playground Series (Nov 2021) input folder
# and preview the first rows of the resulting frame.
train_csv_path = '../input/tabular-playground-series-nov-2021/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

## <font color='#2E45B8'> 1.2 Data inspection</font>
### <font color='	#2E45B8'> 1.2.1 Data types</font>

In [None]:
# Partition the column names by dtype: 'object' columns are counted as
# categorical, every other dtype as numerical.
cat_cols = list(data.select_dtypes(include='object').columns)
num_cols = list(data.select_dtypes(exclude='object').columns)

# Report how many columns landed in each group.
for n_found, label in ((len(num_cols), 'numerical columns'),
                       (len(cat_cols), 'categorical features')):
    print('There are', n_found, label)

### <font color='	#2E45B8'> 1.2.2 Explore missing values</font>

In [None]:
# Add up the per-column null counts into one dataset-wide total.
missing_total = data.isnull().sum().sum()
print('Number of missing value in this dataset is: ', missing_total)

# <font color='	#2E45B8'> 2. EDA</font>
## <font color='	#2E45B8'>  2.1 Checking target variable</font>

In [None]:
# The target is taken from the right-most column of the frame; show its
# summary statistics.
last_position = data.shape[1] - 1
target = data.iloc[:, last_position]
target.describe()

In [None]:
### Checking the target variable: histogram with KDE (left), box plot (right)
panel_background = '#F2F2F2'
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
hist_ax, box_ax = ax

# Same light-grey background for the figure and for both panels.
fig.patch.set_facecolor(panel_background)
for panel in (hist_ax, box_ax):
    panel.patch.set_facecolor(panel_background)

sns.histplot(target, bins=30, color='#8abbd0', kde=True, ax=hist_ax)
# The first line artist on the histogram axes is the curve added by kde=True;
# recolor it so it stands out from the bars.
hist_ax.lines[0].set_color('#F97A1F')

sns.boxplot(x=target, ax=box_ax, color='#8abbd0')

plt.tight_layout()

**Comment**  
> The distribution of target variable shows:
> - Binary classification (0/1 values)
> - Balanced classes (roughly equal number of 0s and 1s)

## <font color='	#2E45B8'>  2.2 Checking numerical features</font>

In [None]:
# Checking the statistical summary for numerical columns, most dispersed first.

# Drop the ID column only while it is still the leading column. An
# unconditional `data.iloc[:, 1:]` is not idempotent: every re-run of this
# cell would strip one more (real) feature from `data`.
# NOTE(review): assumes the identifier column is named 'id' (case-insensitive)
# -- confirm against train.csv.
if str(data.columns[0]).lower() == 'id':
    data = data.iloc[:, 1:]

# Show every row of the summary for this output only; the global pandas
# display options stay untouched, so later cells do not dump whole frames.
with pd.option_context('display.max_rows', None):
    display(data.describe().T.sort_values(by=['std'], ascending=False))

**Comment**  
> - Features `f2` and `f35` differ in scale from the other features by orders of magnitude, so we need to transform them
> - Some features have very low variance, so we may need to drop them

In [None]:
# Calculate the correlation between every unordered pair of distinct columns.
# Pairs are selected by position (upper triangle of the correlation matrix)
# rather than by de-duplicating on the correlation value: value-based
# de-duplication keeps a self-correlation row (1.0) and silently discards
# unrelated pairs that happen to share the same coefficient.
corr_matrix = data.corr()
corr_columns = corr_matrix.columns
unique_pairs = [
    (left, right)
    for position, left in enumerate(corr_columns)
    for right in corr_columns[position + 1:]
]
correlation_table = (
    corr_matrix.unstack()   # Series indexed by (column, column) pairs
    .loc[unique_pairs]      # keep each unordered pair once, no self-pairs
    .sort_values()          # most negative first, most positive last
    .to_frame('correlation')
)
correlation_table

**Comment**  
> - Correlations are low overall
> - The strongest negative correlation is between `f55` and `target`, at about -11%
> - The strongest positive correlation is between `f34` and `target`, at about 14%

In [None]:
# Plots on numerical features to check data quality and data distribution
# Every non-object column except the last one (the last column is what the
# notebook uses as the target).
num_cols = data.select_dtypes(exclude='object').columns.tolist()[:-1]
color = '#8abbd0'


def plot_feature_distribution(frame, feature, bar_color):
    """Draw three side-by-side views of one column's distribution.

    The panels are, left to right: a 20-bin histogram with a KDE overlay,
    a filled KDE plot, and a box plot.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the column to plot.
    feature : str
        Name of the column in ``frame``; also used as the figure title.
    bar_color : str
        Matplotlib color for the histogram bars and the box.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was created.
    """
    fig, ax = plt.subplots(1, 3, figsize=(15, 3))
    fig.patch.set_facecolor('#F2F2F2')

    sns.histplot(frame[feature], bins=20, ax=ax[0], color=bar_color, kde=True)
    # Recolor the KDE curve, but only if one was actually drawn; indexing an
    # empty `lines` list would raise IndexError and abort the whole loop.
    if ax[0].lines:
        ax[0].lines[0].set_color('#F97A1F')

    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(x=feature, data=frame, ax=ax[1], fill=True, alpha=0.3)
    sns.boxplot(x=feature, data=frame, ax=ax[2], color=bar_color)

    fig.suptitle(feature, fontfamily='serif', fontsize=16, color='#173b56', fontweight='bold')
    fig.tight_layout()
    return fig


# First batch: features at positions 0-19 of num_cols.
for feature in num_cols[:20]:
    plot_feature_distribution(data, feature, color)

**Comment**  
> - For the first 20 features, we can consider removing `f0, f2, f4, f9, f16, f19`, as many of their values are zero

In [None]:
# Distribution plots for the features at positions 20-39 of num_cols:
# histogram with KDE overlay, filled KDE, and box plot, one figure per feature.
for feature in num_cols[20:40]:
    fig, ax = plt.subplots(1, 3, figsize=(15, 3))
    fig.patch.set_facecolor('#F2F2F2')
    sns.histplot(data[feature], bins=20, ax=ax[0], color=color, kde=True)
    # Recolor the KDE curve only if one was drawn; indexing an empty `lines`
    # list would raise IndexError and stop the remaining plots.
    if ax[0].lines:
        ax[0].lines[0].set_color('#F97A1F')
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(x=feature, data=data, ax=ax[1], fill=True, alpha=0.3)
    sns.boxplot(x=feature, data=data, ax=ax[2], color=color)

    plt.suptitle(feature, fontfamily='serif', fontsize=16, color='#173b56', fontweight='bold')
    plt.tight_layout()

**Comment**  
> - For features 20 - 39, we can consider removing `f20, f23, f24, f27, f28, f30-f33`

In [None]:
# Distribution plots for the features at positions 40-59 of num_cols:
# histogram with KDE overlay, filled KDE, and box plot, one figure per feature.
for feature in num_cols[40:60]:
    fig, ax = plt.subplots(1, 3, figsize=(15, 3))
    fig.patch.set_facecolor('#F2F2F2')
    sns.histplot(data[feature], bins=20, ax=ax[0], color=color, kde=True)
    # Recolor the KDE curve only if one was drawn; indexing an empty `lines`
    # list would raise IndexError and stop the remaining plots.
    if ax[0].lines:
        ax[0].lines[0].set_color('#F97A1F')
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(x=feature, data=data, ax=ax[1], fill=True, alpha=0.3)
    sns.boxplot(x=feature, data=data, ax=ax[2], color=color)

    plt.suptitle(feature, fontfamily='serif', fontsize=16, color='#173b56', fontweight='bold')
    plt.tight_layout()

**Comment**  
> - For features 40 - 59, we can consider removing `f42, f44, f46, f48, f49, f51, f52, f53, f56, f58, f59`

In [None]:
# Distribution plots for the features at positions 60-79 of num_cols:
# histogram with KDE overlay, filled KDE, and box plot, one figure per feature.
for feature in num_cols[60:80]:
    fig, ax = plt.subplots(1, 3, figsize=(15, 3))
    fig.patch.set_facecolor('#F2F2F2')
    sns.histplot(data[feature], bins=20, ax=ax[0], color=color, kde=True)
    # Recolor the KDE curve only if one was drawn; indexing an empty `lines`
    # list would raise IndexError and stop the remaining plots.
    if ax[0].lines:
        ax[0].lines[0].set_color('#F97A1F')
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(x=feature, data=data, ax=ax[1], fill=True, alpha=0.3)
    sns.boxplot(x=feature, data=data, ax=ax[2], color=color)

    plt.suptitle(feature, fontfamily='serif', fontsize=16, color='#173b56', fontweight='bold')
    plt.tight_layout()

In [None]:
# Distribution plots for the features at positions 80-99 of num_cols:
# histogram with KDE overlay, filled KDE, and box plot, one figure per feature.
for feature in num_cols[80:100]:
    fig, ax = plt.subplots(1, 3, figsize=(15, 3))
    fig.patch.set_facecolor('#F2F2F2')
    sns.histplot(data[feature], bins=20, ax=ax[0], color=color, kde=True)
    # Recolor the KDE curve only if one was drawn; indexing an empty `lines`
    # list would raise IndexError and stop the remaining plots.
    if ax[0].lines:
        ax[0].lines[0].set_color('#F97A1F')
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(x=feature, data=data, ax=ax[1], fill=True, alpha=0.3)
    sns.boxplot(x=feature, data=data, ax=ax[2], color=color)

    plt.suptitle(feature, fontfamily='serif', fontsize=16, color='#173b56', fontweight='bold')
    plt.tight_layout()