# Exploratory Data Analysis for the Taiwan Default Credit data set 

## Imports 

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Reading the data 

In [25]:
# Read the raw CSV into a DataFrame (the path is relative to the working directory)
default_credit_df = pd.read_csv(
    filepath_or_buffer='../data/raw/credit_card_default.csv'
)

-----

## Summary of the data set

## Partition the data set into training and test sets

Before proceeding further, we split the data set into a training set and a test set: $80\%$ of the observations go to the training set and $20\%$ to the test set. The `default_of_credit_card_clients` data set has $30{,}000$ observations in total, so the test set is large enough to give a reliable estimate of model performance: the training set will have $24{,}000$ observations and the test set $6{,}000$.

Throughout the analysis, `random_state=123` is used so that the results are reproducible.

In [42]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical from run to run.
train_df, test_df = train_test_split(
    default_credit_df,
    random_state=123,
    test_size=0.2,
)

In [46]:
# Report how many observations ended up on each side of the split,
# measured on the target column of each frame.
target_column = 'default.payment.next.month'

train_row_count = train_df[target_column].shape[0]
print('The number of observations for train set: ', train_row_count)

test_row_count = test_df[target_column].shape[0]
print('The number of observations for test set: ', test_row_count)

The number of observations for train set:  24000
The number of observations for test set:  6000


In [None]:
# Start with an empty dictionary. Nothing in the visible cells fills it;
# presumably later cells collect results here (TODO confirm).
result_dict = dict()

In [50]:
# Percentage of zeros (no default) and ones (default) in the target column
# of the train set
train_percent_defaults = train_df['default.payment.next.month'].value_counts(normalize=True) * 100

# Percentage of zeros and ones in the target column of the test set.
# value_counts(normalize=True) returns fractions, so scale by 100 to keep this
# on the same percent scale as train_percent_defaults.
test_percent_defaults = test_df['default.payment.next.month'].value_counts(normalize=True) * 100

# Count of observations where default is one or zero in the train set
train_yes_default = len(train_df[train_df['default.payment.next.month'] == 1])
train_no_default = len(train_df[train_df['default.payment.next.month'] == 0])

# Count of observations where default is one or zero in the test set
test_yes_default = len(test_df[test_df['default.payment.next.month'] == 1])
test_no_default = len(test_df[test_df['default.payment.next.month'] == 0])

In [55]:
# Wrap the train-set percentages in a one-column DataFrame and blank out the
# index label. rename_axis returns a new object instead of assigning to the
# index in place.
train_percent_df = pd.DataFrame(train_percent_defaults).rename_axis('')

# Display a labelled copy (train_percent_df itself keeps its original column).
# The single column is looked up by position because its name depends on the
# pandas version: value_counts(normalize=True) names the result after the
# source column before pandas 2.0 and 'proportion' from 2.0 on, so a rename
# keyed on a hard-coded name can silently do nothing.
train_percent_df.rename(
    columns={train_percent_df.columns[0]: 'Default Payment Percent'},
    index={0: 'No', 1: 'Yes'},
)

Unnamed: 0,Default Payment Count Percent
No,77.783333
Yes,22.216667


## Exploratory analysis on the training data set