# <p style="background-color:skyblue; font-family:'Times New Roman', serif; font-size:140%; text-align:center">Google Brain - Ventilator Pressure Prediction</p> Exploratory Data Analysis 🧐

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
# All three competition files live in the same input folder.
DATA_DIR = '../input/ventilator-pressure-prediction'

train_df = pd.read_csv(DATA_DIR + '/train.csv')
test_df = pd.read_csv(DATA_DIR + '/test.csv')
submission = pd.read_csv(DATA_DIR + '/sample_submission.csv')

### Train Dataset Analysis at First Glance:

In [None]:
# Summary statistics for the training data.
train_df.describe()

In [None]:
# Peek at the first rows of the training data.
train_df.head()

In [None]:
# Distribution of the target `pressure`: a density histogram with a KDE overlay
# on the left, and the number of rows per distinct pressure value on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# `distplot` is deprecated; `histplot(kde=True, stat='density')` is its documented
# replacement and draws the same histogram-plus-KDE picture.
sns.histplot(train_df['pressure'], kde=True, stat='density', ax=ax[0])
ax[0].set_title('Pressure: histogram and KDE')

# Pass the Series by keyword: newer seaborn versions bind the first positional
# argument to `data`, not `x`, so a positional Series is no longer counted per value.
sns.countplot(x=train_df['pressure'], ax=ax[1])
ax[1].set_title('Pressure: count per distinct value')

plt.show()

### Test Dataset Analysis at First Glance:

In [None]:
# Summary statistics for the test data.
test_df.describe()

In [None]:
# Peek at the first rows of the test data.
test_df.head()

### Join Train and Test Datasets in order to Obtain An Overview:

In [None]:
# Remember each split's original row index before the frames are stacked.
train_indexs, test_indexs = train_df.index, test_df.index

# Stack train on top of test, renumber the rows from 0, and keep only the
# feature columns: the row identifier `id` and the `pressure` column are dropped.
all_df = (
    pd.concat([train_df, test_df], axis=0)
    .reset_index(drop=True)
    .drop(columns=['id', 'pressure'])
)

In [None]:
# Peek at the first rows of the combined train + test feature frame.
all_df.head()

In [None]:
def simple_eda(df):
    """
    Print a quick exploratory report for a DataFrame.

    The report has four sections, in this order:

    1. the output of ``df.info()``;
    2. missing values (a per-column count is shown only if any exist);
    3. the number of numeric columns followed by ``df.describe()``;
    4. fully duplicated rows (the duplicated rows are shown only if any exist).

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarize. It is not modified.

    Returns
    -------
    None
        Everything is printed, or rendered through ``display``.

    Notes
    -----
    ``display`` is not imported here; the function relies on the name that
    IPython/Jupyter provides in the notebook namespace.
    """

    # General structure of the frame
    print('---')
    print('Common Information')
    print('---')
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() -- that would append a stray "None" line.
    df.info()

    # Missing values (counted once, reused for both the check and the display)
    print('---')
    missing_per_column = df.isna().sum()
    if missing_per_column.sum() == 0:
        print('There are no missing values')
    else:
        print('Detected')
        display(missing_per_column)

    # Numeric features: count every numeric dtype so the total agrees with the
    # columns that describe() summarizes below, whatever their bit width.
    print('---')
    print('Continuous Columns')
    print('Total {}'.format(len(df.select_dtypes(include='number').columns)))
    print('---')
    display(df.describe())

    # Duplicated rows (mask computed once and reused for the selection)
    duplicated_rows = df.duplicated()
    print('---')
    if not duplicated_rows.any():
        print('There are no duplicates')
        print('---')
    else:
        print('Duplicates found')
        print('---')
        display(df[duplicated_rows])

    print('End of the report')

In [None]:
# Run the report on the combined train + test feature frame.
simple_eda(all_df)

In [None]:
# List the columns remaining in the combined frame.
all_df.columns

In [None]:
# One KDE panel per column of `all_df`; each panel's legend shows that column's skewness.
num_cols = 2
# Derive the row count from the data (ceiling division) so the grid always has
# room for every column instead of assuming exactly six of them.
num_rows = max(1, -(-len(all_df.columns) // num_cols))
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols,
                       figsize=(12, 4 * num_rows), squeeze=False)
f.suptitle('Distribution of Features', fontsize=16)

for index, column in enumerate(all_df.columns):
    i, j = divmod(index, num_cols)
    # `fill=True` replaces the deprecated `shade=True`.
    g = sns.kdeplot(all_df[column], color="m", fill=True,
                    label="%.2f"%(all_df[column].skew()), ax=axes[i, j])
    g.legend(loc="best")

# Hide panels left over when the column count does not fill the grid.
for spare_ax in axes.flat[len(all_df.columns):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Absolute pairwise correlations between the columns of `all_df`
# (only the strength of each relationship is shown, not its sign).
corr = all_df.corr().abs()
# Mask the upper triangle, diagonal included: the matrix is symmetric, so those
# cells only repeat the lower triangle. The builtin `bool` is used because the
# `np.bool` alias was removed from NumPy and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 14))

# Draw onto the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1, ax=ax)
# Keep the row labels horizontal so they stay readable.
plt.yticks(rotation=0)
plt.show()

## Please upvote if you like my work. 👍
## Thanks a lot for your encouragement. Have a nice day! 😉