# Porto Seguro’s Safe Driver Prediction EDA

## Import Libraries
- **numpy** 
> for linear algebra
- **pandas** 
> for data processing and working with CSV data

In [None]:
import numpy as np
import pandas as pd

# Load Data: every CSV is read with its 'id' column as the index
data_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'

train, test, submission = (
    pd.read_csv(data_path + file_name, index_col='id')
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Preview the first rows of the test data
test.head()

In [None]:
# Preview the first rows of the sample submission file
submission.head()

In [None]:
# Print a concise summary of the training DataFrame
train.info()

## Check missing values
- **missingno** 
> for visualizing missing values with bar charts, matrices, etc.

In [None]:
import missingno as msno

# Missing values are encoded as -1 in this data set. Convert them to NaN on a
# copy so that `train` itself keeps the original encoding.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
train_copy = train.copy().replace(-1, np.nan)

# Visualize missing values for the columns at positions 1-28
msno.bar(df=train_copy.iloc[:, 1:29], figsize=(13, 6));

In [None]:
# Missing-value bar chart for the remaining columns (position 29 onward)
msno.bar(df=train_copy.iloc[:, 29:], figsize=(13, 6));

In [None]:
# Matrix view of the missing values for the columns at positions 1-28
msno.matrix(df=train_copy.iloc[:, 1:29], figsize=(13, 6));

### Create Feature Summary

In [None]:
# Per-feature overview table: one row for each column of `train`
summary = pd.DataFrame({
    'dtype': train.dtypes,               # Feature dtypes
    'num_missing': (train == -1).sum(),  # Number of missing values (-1 count)
    'num_uniques': train.nunique(),      # Number of unique values per feature
})

summary.head()

In [None]:
# Assign a role to every column of `train`. The column name is checked first
# ('bin' / 'cat' in the name, or the 'target' label); otherwise the dtype decides.
data_forms = []
for col in train.columns:
    if 'bin' in col or col == 'target':
        data_forms.append('binary')
    elif 'cat' in col:
        data_forms.append('nominal')
    # Use the pandas dtype helpers instead of `dtype == float` / `dtype == int`:
    # the builtin `int` maps to the platform default integer, which is not
    # always the width the column was loaded with.
    elif pd.api.types.is_float_dtype(train[col]):
        data_forms.append('continuous')
    elif pd.api.types.is_integer_dtype(train[col]):
        data_forms.append('ordinal')
    else:
        # Always append exactly one entry per column so the list stays aligned
        # with `summary`; a skipped column would make the assignment below fail.
        data_forms.append('unknown')

summary['role'] = data_forms

In [None]:
# Display the complete feature summary table
summary

In [None]:
# Names of the features whose role is 'nominal'
summary[summary['role'] == 'nominal'].index

In [None]:
# Names of the features whose dtype is float64
summary[summary['dtype'] == 'float64'].index

## Data visualization
- **matplotlib**
> for visualization
- **seaborn**
> for statistical data visualization

In [None]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

### Target Value Distribution

In [None]:
mpl.rc('font', size=15)
plt.figure(figsize=(7, 6))

# Count plot: one bar per target value
ax = sns.countplot(x='target', data=train)

total_size = len(train)  # Number of rows in the training data

# Label every bar with the share of the training rows it represents
for patch in ax.patches:
    height = patch.get_height()  # Bar height (data count)
    percent = height / total_size * 100  # Target value ratio

    ax.text(
        x=patch.get_x() + patch.get_width() / 2.0,  # Horizontal center of the bar
        y=height + 3000,  # 3000 counts above the top of the bar
        s='{:1.1f}%'.format(percent),
        ha='center',  # Center the text on x
    )

ax.set_title('Target Distribution');

In [None]:
def get_row_col_idx(idx, num_cols):
    """Map a flat subplot index to its (row, column) position in a grid.

    Subplots are numbered row by row: indices 0..num_cols-1 fill the first
    row, the next num_cols indices fill the second row, and so on.

    Args:
        idx: Zero-based flat index of the subplot.
        num_cols: Number of columns in the subplot grid.

    Returns:
        Tuple ``(row_idx, col_idx)``.
    """
    # divmod yields the quotient (row) and remainder (column) in one step,
    # replacing the loop that searched for the remainder.
    return divmod(idx, num_cols)

In [None]:
def plot_target_ratio_by_features(cols, nrows, ncols):
    """Plot, for each feature, a bar graph of the target value 1 ratio.

    One subplot is drawn per feature on an ``nrows`` x ``ncols`` grid, filled
    row by row. The data is read from the module-level ``train`` DataFrame,
    and the global matplotlib font size is set to 9.

    Args:
        cols: Names of the ``train`` columns to plot, one subplot each.
        nrows: Number of rows in the subplot grid.
        ncols: Number of columns in the subplot grid. ``nrows * ncols`` must
            be at least ``len(cols)``.
    """
    mpl.rc('font', size=9) # Set font size
    # squeeze=False keeps `axes` two-dimensional even when the grid has a
    # single row or column, so axes[row_idx, col_idx] always works.
    figure, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
    figure.set_size_inches(12, 18) # Set Figure size
    plt.subplots_adjust(wspace=0.3, hspace=0.3) # Set margins between subplots

    # Iterate over the column names directly; slicing `train` just to list
    # its columns would copy the selected data for nothing.
    for idx, col in enumerate(cols):
        row_idx, col_idx = get_row_col_idx(idx, ncols) # Subplot row, column location
        ax = axes[row_idx, col_idx] # Set the axis on which to plot the graph
        # Plot bar graph of target value 1 ratio by unique values on the ax axis
        sns.barplot(x=col, y='target', data=train, palette='Set2', ax=ax)

### Binary Features

In [None]:
bin_cols = summary[summary['role'] == 'binary'].index # Binary Features
# NOTE(review): 'target' is itself given the 'binary' role when roles are
# assigned, so it is part of bin_cols and is plotted against itself here -
# confirm this is intended.
# Plot bar graph of the target value 1 ratio by binary feature unique values
# on a 6 x 3 subplot grid
plot_target_ratio_by_features(bin_cols, 6, 3)

### Nominal Features

In [None]:
nom_cols = summary[summary['role'] == 'nominal'].index # Nominal Features

# Plot bar graph of the target value 1 ratio by nominal feature unique values
# on a 7 x 2 subplot grid
plot_target_ratio_by_features(nom_cols, 7, 2)

### Ordinal Features

In [None]:
ord_cols = summary[summary['role'] == 'ordinal'].index # Ordinal Features

# Plot bar graph of the target value 1 ratio by ordinal feature unique values
# on an 8 x 2 subplot grid
plot_target_ratio_by_features(ord_cols, 8, 2)

### Continuous Features

In [None]:
cont_cols = summary[summary['role'] == 'continuous'].index # Continuous Features

# 5 x 2 grid: one subplot per continuous feature
figure, axes = plt.subplots(nrows=5, ncols=2)
figure.set_size_inches(12, 16)
plt.subplots_adjust(wspace=0.2, hspace=0.4)

for idx, col in enumerate(cont_cols):
    # Bin into a scratch frame instead of assigning back to `train`:
    # overwriting the column would replace the continuous values with
    # interval categories and make this cell fail when it is run again.
    binned = pd.DataFrame({
        col: pd.cut(train[col], 5), # Divide the value into 5 bins
        'target': train['target'],
    })
    row_idx, col_idx = get_row_col_idx(idx, 2)
    ax = axes[row_idx, col_idx]
    # Bar graph of the target value 1 ratio within each bin
    sns.barplot(x=col, y='target', data=binned, palette='Set2', ax=ax)
    ax.tick_params(axis='x', labelrotation=10) # Rotate x-axis label

In [None]:
# Drop every row that has a missing value (NaN) in any column; this
# overwrites train_copy with the reduced frame.
# NOTE(review): rows are dropped based on all columns, not only the
# continuous ones correlated below - confirm this is intended.
train_copy = train_copy.dropna()

plt.figure(figsize=(10, 8))
cont_corr = train_copy[cont_cols].corr() # Pairwise correlation between continuous features
sns.heatmap(cont_corr, annot=True, cmap='OrRd'); # Plot the correlation matrix as an annotated heatmap