# <p style="background-color:skyblue; font-family:'Times New Roman', serif; font-size:160%; text-align:center">Tabular Playground Series – April 2021</p>
1. [Exploratory Data Analysis 📊](https://www.kaggle.com/zhaodianwen/tps-april-1-eda/)

# Exploratory Data Analysis 🧐

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
## Load the competition files: train set, test set and the sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
        '../input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

### Comparing the train and test data at first glance:

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Preview the first rows of the test set, for comparison with train above.
test.head()

In [None]:
## Stack train on top of test so that both share the same set of features
## when categorical columns are converted later on.
# Remember the original row labels of each part before they are merged.
# NOTE(review): both are taken before the concat, so they both start at 0 and
# overlap once `df` is re-indexed below — confirm how they are used before
# selecting rows of `df` with `test_indexs`.
train_indexs, test_indexs = train.index, test.index

df = (
    pd.concat([train, test], axis=0)
    .reset_index(drop=True)
    .drop('PassengerId', axis=1)
)

In [None]:
# Preview the combined frame (train rows first, 'PassengerId' already dropped).
df.head()

### Summary information for the combined train and test data:

In [None]:
def simple_eda(df):
    
    """
    Print a quick exploratory report for a DataFrame.

    The report covers, in order: the output of ``df.info()``, missing values
    per column, summary statistics of the object (categorical) columns,
    summary statistics of the numeric columns, and duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to the cell output.

    Notes
    -----
    Tables are shown with ``display``, which IPython/Jupyter makes available
    inside notebook cells; outside a notebook it has to be imported from
    ``IPython.display`` first.
    """
    
    # info() writes its report itself and returns None, so wrapping the call
    # in print() would append a stray "None" line to the output.
    print('---')
    print('Common Information')
    print('---')
    df.info()
    
    # missing values (per-column counts are computed once and reused)
    print('---')
    missing_per_column = df.isna().sum()
    if missing_per_column.sum() == 0:
        print('There are no missing values')
    else:
        print('Detected')
        display(missing_per_column)
    
    # describe() for categorical features
    categorical_names = df.select_dtypes(include='object').columns
    print('---')
    print('Categorical Columns')
    print('Total {}'.format(len(categorical_names)))
    print('---')
    # describe(include='object') raises ValueError when the frame has no
    # object columns, so only call it when there is something to describe.
    if len(categorical_names) > 0:
        display(df.describe(include='object'))
    else:
        print('There are no categorical columns')
    
    # same describe() but for continuous features; 'number' counts every
    # numeric dtype, which keeps the total in line with the describe() table
    numeric_names = df.select_dtypes(include='number').columns
    print('---')
    print('Continuous Columns')
    print('Total {}'.format(len(numeric_names)))
    print('---')
    if len(numeric_names) > 0:
        display(df.describe())
    else:
        print('There are no continuous columns')
    
    # checking for duplicated rows (mask computed once, reused for the table)
    duplicate_mask = df.duplicated()
    print('---')
    if not duplicate_mask.any():
        print('There are no duplicates')
        print('---')
    else:
        print('Duplicates found')
        print('---')
        display(df[duplicate_mask])
    
    print('End of the report')

In [None]:
# Run the report on the combined train + test frame.
simple_eda(df)

### Distribution of the target:

In [None]:
# Count the target classes on the training rows, the same data the plots of
# the target use. In the combined `df` the rows coming from test would carry
# no label, which also turns the column into floats (0.0 / 1.0).
# NOTE(review): assumes test.csv has no 'Survived' column — confirm.
train['Survived'].value_counts()

In [None]:
# Two views of the target on the training set: a density histogram with a KDE
# overlay (left) and the raw class counts (right).
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# histplot(kde=True, stat='density') is the replacement for the deprecated distplot.
sns.histplot(x=train['Survived'], kde=True, stat='density', ax=ax[0])
ax[0].set_title('Survived: density')

# Pass the column as `x=`: newer seaborn versions read a positional argument
# as `data`, which no longer produces the per-class counts.
sns.countplot(x=train['Survived'], ax=ax[1])
ax[1].set_title('Survived: class counts')

plt.show()

### Numerical and categorical features, examined separately:

In [None]:
## Every column except the row identifier and the target is a feature.
feature_cols = train.columns.drop(['PassengerId', 'Survived'])

## Split the features by dtype: int64/float64 columns are treated as
## numerical, everything else as categorical.
_numeric_dtypes = ['int64', 'float64']
_train_features = train[feature_cols]
numerical_columns = _train_features.select_dtypes(include=_numeric_dtypes).columns
categorical_columns = _train_features.select_dtypes(exclude=_numeric_dtypes).columns

In [None]:
# KDE plot of every numerical feature of the training set, one per grid cell.
# The number of rows is derived from the number of features so that the grid
# never runs out of axes (a fixed 3x2 grid fails beyond six features).
num_cols = 2
num_rows = max(1, int(np.ceil(len(numerical_columns) / num_cols)))
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                       figsize=(12, 4 * num_rows), squeeze=False)
f.suptitle('Distribution of Features', fontsize=16)

for index, column in enumerate(numerical_columns):
    i, j = divmod(index, num_cols)
    # `fill` replaces the deprecated `shade` argument; the legend entry is the
    # skewness of the feature.
    g = sns.kdeplot(x=train[column], color="m", fill=True,
                    label="%.2f" % (train[column].skew()), ax=axes[i, j])
    g.legend(title='skew', loc="best")

# Hide the grid cells that did not receive a feature.
for unused_ax in axes.flat[len(numerical_columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Absolute pairwise correlations between the numerical features of the
# combined frame.
corr = df[numerical_columns].corr().abs()
# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 14))

# plot heatmap on the axes created above
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1, ax=ax)
ax.set_title('Absolute correlation between numerical features')
# yticks
plt.yticks(rotation=0)
plt.show()

## <p style="background-color:skyblue; font-family:'Times New Roman', serif; font-size:120%; text-align:center">Categorical Variables</p>

In [None]:
# Show the names of the feature columns that are neither int64 nor float64.
categorical_columns

### ProfileReport on the train, test and combined data

In [None]:
from pandas_profiling import ProfileReport

def _full_width_profile(frame, title):
    """Build a ProfileReport for `frame` that uses the full-width HTML style."""
    return ProfileReport(frame, title=title, html={'style': {'full_width': True}})


# One report per data set: train, test, and the combined frame.
ProfileReportTrain = _full_width_profile(train, 'Profile Report on Train data')
ProfileReportTest = _full_width_profile(test, 'Profile Report on Test data')
ProfileReportDF = _full_width_profile(df, 'Profile Report on df')

In [None]:
# Bare expression: the notebook shows the report built for the combined frame.
ProfileReportDF