<a href="https://colab.research.google.com/github/Requenamar3/datawrangling/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Structural Analysis

In [None]:
# https://github.com/fenago/datasets/raw/main/default_credit.xls

In [None]:
# Import basic libraries
import numpy as np
import pandas as pd
# import visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study reviewed the literature and used the following 23 variables as explanatory variables:
X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
X2: Gender (1 = male; 2 = female).
X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
X4: Marital status (1 = married; 2 = single; 3 = others).
X5: Age (year).
X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
X12-X17: Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005.
X18-X23: Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005.

In [None]:
# Read the credit-default workbook straight from GitHub into a DataFrame.
df = pd.read_excel(
    io='https://github.com/fenago/datasets/raw/main/default_credit.xls',
)
# Show the first five rows as a quick sanity check on the load.
df.head(n=5)

In [None]:
# Print the DataFrame's metadata summary via `DataFrame.info()`
# (column names, non-null counts and dtypes).
df.info()

In [None]:
# Descriptive statistics, transposed so that each variable is one row.
df.describe().transpose()

In [None]:
# Count the missing values in each column.
df.isna().sum()

# Light Quality Analysis/Investigation

In [None]:
# List the distinct SEX codes present in the data, in ascending order.
print(f"SEX {sorted(df['SEX'].unique())}")

In [None]:
# List the distinct EDUCATION codes; 0, 5 and 6 are outside the documented
# 1-4 range and need treatment.
print(f"EDUCATION {sorted(df['EDUCATION'].unique())}")

In [None]:
# List the distinct MARRIAGE codes; 0 is outside the documented 1-3 range
# and needs treatment.
print(f"MARRIAGE {sorted(df['MARRIAGE'].unique())}")

In [None]:
# List the distinct PAY_0 codes; -2 and 0 are not part of the documented
# repayment-status scale and need treatment.
print(f"PAY_0 {sorted(df['PAY_0'].unique())}")

In [None]:
# List the distinct values of the response column. The printed label now uses
# the column's real name (spaces, not dots) so the output matches the data.
print(
    'default payment next month '
    + str(sorted(df['default payment next month'].unique()))
)

In [None]:
# EDUCATION is documented with codes 1-4 only; fold the undocumented codes
# 0, 5 and 6 into 4 ("others"). A single `isin` membership test replaces the
# chained equality comparisons.
fill = df['EDUCATION'].isin([0, 5, 6])
df.loc[fill, 'EDUCATION'] = 4

In [None]:
# Re-list the EDUCATION codes to verify that 0, 5 and 6 are gone after the recode.
print(f"EDUCATION {sorted(df['EDUCATION'].unique())}")

In [None]:
# Recode the undocumented MARRIAGE value 0 as 2 (single).
# NOTE(review): 0 is mapped to 2 rather than to 3 ("others"), whereas the
# EDUCATION cleanup uses the "others" bucket -- confirm this is intended.
fill = df['MARRIAGE'].eq(0)
df.loc[fill, 'MARRIAGE'] = 2

In [None]:
# Shorten the response column's name to DEFAULT and rename PAY_0 to PAY_1.
df = df.rename(
    columns={
        'default payment next month': 'DEFAULT',
        'PAY_0': 'PAY_1',
    }
)
# Show the first five rows to confirm the new column names.
df.head(n=5)

# EDA: Univariate Analysis / Content Investigation

In [None]:
# Bar chart of the number of rows in each DEFAULT class.
sns.countplot(data=df, x="DEFAULT")

In [None]:
# Row count for each DEFAULT class.
df.DEFAULT.value_counts()

In [None]:
# Share of rows with DEFAULT == 1. Computed from the data instead of from
# hand-copied counts, so it stays correct if the dataset or cleaning changes.
df['DEFAULT'].eq(1).mean()

In [None]:
# Bar chart of the number of rows for each SEX code.
sns.countplot(data=df, x="SEX")

In [None]:
# Row count for each SEX code.
df.SEX.value_counts()

In [None]:
# Bar chart of the number of rows for each EDUCATION code.
sns.countplot(data=df, x="EDUCATION")

In [None]:
# Row count for each EDUCATION code.
df.EDUCATION.value_counts()

In [None]:
# Bar chart of the number of rows for each MARRIAGE code.
sns.countplot(data=df, x="MARRIAGE")

In [None]:
# Row count for each MARRIAGE code.
df.MARRIAGE.value_counts()

# Bivariate Analysis

In [None]:
# Counts of each DEFAULT class, split by SEX.
sns.set(rc={'figure.figsize': (15, 10)})
# Pin the bar order to the SEX codes (1 = male, 2 = female) so that the
# positional tick labels below are guaranteed to sit under the right bars.
# The axes handle is named for this plot (it was previously called `edu`).
sex_ax = sns.countplot(x='SEX', hue='DEFAULT', data=df, order=[1, 2])
sex_ax.set_xticklabels(['Male', 'Female'])
plt.show()

In [None]:
# SEX x DEFAULT table normalised within each row, with margins included.
pd.crosstab(
    index=df['SEX'],
    columns=df['DEFAULT'],
    normalize='index',
    margins=True,
)

In [None]:
# Counts of each DEFAULT class, split by EDUCATION.
sns.set(rc={'figure.figsize': (15, 10)})
# Pin the bar order to the documented EDUCATION codes 1-4 so that the
# positional tick labels below always match their bars. Any code outside 1-4
# is left off the chart instead of silently shifting the labels.
edu = sns.countplot(x='EDUCATION', hue='DEFAULT', data=df, order=[1, 2, 3, 4])
edu.set_xticklabels(['Graduate School', 'University', 'High School', 'Other'])
plt.show()

In [None]:
# EDUCATION x DEFAULT table normalised within each row.
pd.crosstab(
    index=df['EDUCATION'],
    columns=df['DEFAULT'],
    normalize='index',
)

In [None]:
# Counts of each DEFAULT class, split by MARRIAGE.
sns.set(rc={'figure.figsize': (12, 10)})
# Pin the bar order to the documented MARRIAGE codes 1-3 so that the
# positional tick labels below always match their bars. Any code outside 1-3
# is left off the chart instead of silently shifting the labels.
marriage = sns.countplot(x="MARRIAGE", hue='DEFAULT', data=df, order=[1, 2, 3])
marriage.set_xticklabels(['Married', 'Single', 'Other'])
plt.show()

In [None]:
# MARRIAGE x DEFAULT table normalised within each row, with margins included.
pd.crosstab(
    index=df['MARRIAGE'],
    columns=df['DEFAULT'],
    normalize='index',
    margins=True,
)

In [None]:
# Raw PAY_1 x DEFAULT counts, with row and column totals.
pd.crosstab(index=df['PAY_1'], columns=df['DEFAULT'], margins=True)

In [None]:
# Raw AGE x DEFAULT counts.
pd.crosstab(index=df['AGE'], columns=df['DEFAULT'])

In [None]:
# AGE x DEFAULT table normalised within each row, with margins included.
pd.crosstab(
    index=df['AGE'],
    columns=df['DEFAULT'],
    normalize='index',
    margins=True,
)

# Correlation

In [None]:
# Global seaborn settings for the plots that follow: a 30x10 default figure
# size, then the "talk" context with fonts scaled to 0.7.
# NOTE(review): the two calls are kept in this order on purpose -- `sns.set`
# presumably resets the plotting context, so the order may matter; confirm
# before reordering or merging them.
sns.set(rc={'figure.figsize':(30,10)})
sns.set_context("talk", font_scale=0.7)

In [None]:
# Pairwise Spearman correlations for every column except the first
# (presumably an ID column -- confirm), drawn as an annotated heatmap.
spearman_corr = df.iloc[:, 1:].corr(method='spearman')
sns.heatmap(spearman_corr, annot=True, cmap='rainbow_r')

In [None]:
# Spearman correlation of each non-target column with DEFAULT.
# NOTE(review): unlike the heatmap cell, this keeps the first column
# (presumably an ID) -- confirm whether it should be excluded here too.
df.drop(columns='DEFAULT').apply(
    lambda col: col.corr(df['DEFAULT'], method='spearman')
)
