In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import h2o

In [None]:
# Connect to the H2O cluster used for the rest of the notebook.
# strict_version_check=False: presumably tolerates a version mismatch between
# the h2o Python client and the H2O server instead of raising — confirm against the h2o.init docs.
h2o.init(strict_version_check=False)

In [None]:
# Path of the credit-card default CSV inside the Kaggle input directory.
data_csv = "/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv"
# Import the CSV through H2O; `data` is the frame every later cell works on.
data = h2o.import_file(data_csv)

### Basic Information about the Dataset

In [None]:
# Preview the first rows of the imported frame.
data.head()

In [None]:
# Show H2O's description of the whole frame.
data.describe()

In [None]:
# Keep the list of column names for the per-column loops in later cells.
# Only the names are stored (not the types) because the author treats every
# column as integer-typed.
cols_names = data.columns
cols_names

In [None]:
# Overall percentage of defaulting
# sum() / row count is the *fraction* of rows flagged as default (assumes the
# target is coded 0/1 — confirm). It has to be scaled by 100 before being
# printed with a "%" suffix, as the later per-month cells already do.

print(data['default.payment.next.month'].sum() * 100 / len(data['default.payment.next.month']), "%", sep="")

### Analyzing Data Quality

In [None]:
#Let's print out the unique values for each categorical data type - this can tell us if we have any missing data

# Columns holding amounts/identifiers; every other column is treated as categorical.
not_categorical = ['ID',
 'LIMIT_BAL',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']


def _numeric_sort_key(val):
    """Sort key that orders values numerically, with non-numeric values last.

    With use_pandas=False, H2O hands the cell values back as strings, so a plain
    sort() would be lexicographic ('-1' before '-2', '10' before '2'). Values that
    cannot be parsed as a number (e.g. empty strings) are kept and sorted as text
    after all the numeric ones.
    """
    try:
        number = float(val)
    except (TypeError, ValueError):
        return (1, 0.0, str(val))
    if number != number:  # NaN has no defined ordering; group it with the non-numeric values
        return (1, 0.0, str(val))
    return (0, number, "")


for col in cols_names:
    if col not in not_categorical:
        # unique() returns a one-column frame; as_list gives one single-item list per row.
        uniq_vals = h2o.as_list(data[col].unique(), use_pandas=False, header=False)
        uniq_vals = [val for sublist in uniq_vals for val in sublist] #flattening
        uniq_vals.sort(key=_numeric_sort_key)
        print(col + ": ", uniq_vals, "\n")

It looks like no missing values are encoded as '?'. However, compare the values printed above with the documented range of each variable (from the dataset homepage):

SEX: (1=male, 2=female)

EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)

MARRIAGE: (1=married, 2=single, 3=others)

AGE: Age in years

PAY_N: (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)

For Sex and Age, looks like no missing values.

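For Education and Marriage, 0 is not one of the documented codes, so a 0 denotes a missing value. (Education codes 5 and 6 are documented as "unknown", so they could arguably be treated as missing too.)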

For Pay_N, according to https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset/discussion/34608

> -2: Balance paid in full and no transactions this period (we may refer to this credit card account as having been 'inactive' this period)

>  0: Customer paid the minimum due amount, but not the entire balance. I.e., the customer paid enough for their account to remain in good standing, but did revolve a balance.
    
**This is only an assumption based on domain knowledge - no information from source paper or kaggle.**

But this means that these values are valid features.

### Let's count the number of missing values in Education and Marriage.


In [None]:
# Frequency table of EDUCATION: one row per distinct code, with its 'Count'.
ed_counts = data[['EDUCATION']].table(data2=None, dense=True)

# Total number of rows covered by the table. round() turns the summed counts
# into a whole number; the expected value is 30000.
count_column = ed_counts['Count']
total = round(count_column.sum())
print(total)

# Number of rows whose EDUCATION code is 0 (the code treated as "missing").
zero_code_rows = ed_counts[ed_counts['EDUCATION'] == 0]
no_ed_missing = round(zero_code_rows['Count'].sum())

percent_missing = no_ed_missing*100/total
print("Number of missing values:", no_ed_missing)
print("Percent missing: " + str(percent_missing) + "%")

In [None]:
# Frequency table of MARRIAGE: one row per distinct code, with its 'Count'.
ma_counts = data[['MARRIAGE']].table(data2=None, dense=True)

# Total number of rows covered by the table. round() turns the summed counts
# into a whole number; the expected value is 30000.
count_column = ma_counts['Count']
total = round(count_column.sum())
print(total)

# Number of rows whose MARRIAGE code is 0 (the code treated as "missing").
zero_code_rows = ma_counts[ma_counts['MARRIAGE'] == 0]
no_ma_missing = round(zero_code_rows['Count'].sum())

percent_missing = no_ma_missing*100/total
print("Number of missing values:", no_ma_missing)
print("Percent missing: " + str(percent_missing) + "%")

### Let's plot the different features to see if there are any specific patterns in distribution. 

#### Plotting the categorical features' values vs count/frequency

In [None]:
%matplotlib inline

#print(data[['EDUCATION']].table(data2=None,dense=True))

# Draw one histogram per column that is treated as categorical
# (every column not listed in `not_categorical`).
categorical_cols = [name for name in cols_names if name not in not_categorical]
for col in categorical_cols:
    data[col].hist()

Some interesting points:

- For PAY_N: 1 seems to occur the most.
- For SEX: the dataset appears to contain more females than males.
- For AGE: there are more younger people than older people.

To do: Check if these distributions match a normal population's distribution - does that matter?

#### Plotting the 'non-categorical features'' values vs count/frequency

In [None]:
# Draw one histogram per amount/identifier column (those listed in `not_categorical`).
non_categorical_cols = [name for name in cols_names if name in not_categorical]
for col in non_categorical_cols:
    data[col].hist()

We can probably gain some information from calculating percentage of pay/bill and plotting that.

In [None]:
# For each of the six months, keep the rows whose payment/bill ratio is above 1
# and print the percentage of those customers that defaulted.
# NOTE(review): the ratio is not a finite positive number when BILL_AMT is 0 or
# negative, so "ratio > 1" is not exactly the same filter as PAY_AMT > BILL_AMT — confirm
# which one is intended. Whether this is meaningful for every repayment status is also open.

for i in range(1, 7):
    month = str(i)
    pay_col = 'PAY_AMT' + month
    bill_col = 'BILL_AMT' + month
    ratio_col = 'FRACT_PAY' + month

    subset = data[[pay_col, bill_col, 'default.payment.next.month']]
    subset[ratio_col] = subset[pay_col] / subset[bill_col]

    # Row filter on the H2O side, then convert the result with as_data_frame()
    # for the summary arithmetic below.
    overpaid = subset[subset[ratio_col] > 1, :].as_data_frame()

    defaulted = overpaid['default.payment.next.month']
    print("Month", i, ": ", defaulted.sum()*100 / len(defaulted), "%", sep="")

In [None]:
# Percentage of customers with repayment status 0 ("paid the minimum") who
# defaulted, computed separately for each month's status column.

# Rename PAY_0 so the status columns are numbered PAY_1..PAY_6, matching the
# BILL_AMT1..6 / PAY_AMT1..6 numbering.
# NOTE(review): the return value is discarded, so this relies on H2OFrame.rename
# changing `data` in place — confirm for the installed h2o version.
data.rename(columns={"PAY_0": "PAY_1"})

for i in range(1, 7):
    status_col = 'PAY_' + str(i)

    subset = data[[status_col, 'default.payment.next.month']]

    # Keep only the rows where this month's status equals 0, then convert the
    # result with as_data_frame() for the summary arithmetic below.
    status_zero = subset[subset[status_col] == 0, :].as_data_frame()

    defaulted = status_zero['default.payment.next.month']
    print("Month", i, ": ", defaulted.sum()*100 / len(defaulted), "%", sep="")


In [None]:
# Percentage of customers with repayment status -1 who defaulted, computed
# separately for each month's status column (PAY_1..PAY_6).
for i in range(1, 7):
    status_col = 'PAY_' + str(i)

    subset = data[[status_col, 'default.payment.next.month']]

    # Keep only the rows where this month's status equals -1, then convert the
    # result with as_data_frame() for the summary arithmetic below.
    status_minus_one = subset[subset[status_col] == -1, :].as_data_frame()

    defaulted = status_minus_one['default.payment.next.month']
    print("Month", i, ": ", defaulted.sum()*100 / len(defaulted), "%", sep="")
    

In [None]:
# Select the customers whose repayment status is -1 in both PAY_1 and PAY_6.
# NOTE(review): the result is never reported — the summary print at the bottom is
# commented out, so this cell currently only builds `temp`.
status_var1 = 'PAY_1'
status_var6 = 'PAY_6'

temp = data[[status_var1, status_var6, 'default.payment.next.month']]

# First filter: rows where PAY_1 == -1.
mask = temp[status_var1] == -1
temp = temp[mask,:]

# Second filter, applied to the already-filtered rows: PAY_6 == -1.
mask = temp[status_var6] == -1
temp = temp[mask,:]

# Convert the filtered H2O frame with as_data_frame() for the (disabled) summary below.
temp = temp.as_data_frame()

# Disabled: percentage of the selected customers that defaulted.
#print("Month", ": ", temp['default.payment.next.month'].sum()*100 / len(temp['default.payment.next.month']), "%", sep="")
#print("Month", ": ", temp['default.payment.next.month'].sum()*100 / len(temp['default.payment.next.month']), "%", sep="")

In [None]:
# Question being explored: how likely is a customer to default when the repayment
# status is -1 in PAY_1, and when it is -1 in PAY_6?
# NOTE(review): the variable names and the printed labels disagree — PAY_1 is held in
# `first_month` but printed as the "recent month", and PAY_6 is held in `last_month`
# but printed as the "first month". Confirm which naming is intended.

first_month = 'PAY_1'
last_month = 'PAY_6'

# Rows where the PAY_1 status equals -1.
temp1 = data[[first_month, 'default.payment.next.month']]
temp1 = temp1[temp1[first_month] == -1, :]

# Rows where the PAY_6 status equals -1.
temp2 = data[[last_month, 'default.payment.next.month']]
temp2 = temp2[temp2[last_month] == -1, :]

# Percentage of each selection that defaulted.
rate_month1 = temp1['default.payment.next.month'].sum()*100 / len(temp1['default.payment.next.month'])
rate_month6 = temp2['default.payment.next.month'].sum()*100 / len(temp2['default.payment.next.month'])

print("Month1 (recent month): " + str(rate_month1) + "%")
print("Month6 (first month): " + str(rate_month6) + "%")

In [None]:
import matplotlib.pyplot as plt
from numpy import arange

# One line per month: x = repayment status value, y = number of rows with that value.
count = []

for i in range (1, 7):
    month = 'PAY_' + str(i)
    # table() gives the frequency of each status value; convert it for matplotlib.
    count.append(data[month].table().as_data_frame())

    plt.plot(count[i-1][month].tolist(), count[i-1]['Count'].tolist())

# arange() excludes its stop value, so stop=8 ended the ticks at 7. The dataset
# description lists delay codes of 8 and above, so stop at 9 to keep a tick for 8.
plt.xticks(arange(-2, 9, step=1))
plt.xlabel("Repayment status")
plt.ylabel("Count")
plt.legend(["Month_1", "Month_2", "Month_3", "Month_4", "Month_5", "Month_6"])

In [None]:
import matplotlib.pyplot as plt
from numpy import arange

# One line per month: x = repayment status value, y = number of rows with that value.
count = []

for i in range (1, 7):
    status_col = 'PAY_' + str(i)
    # table() gives the frequency of each status value; convert it for matplotlib.
    count.append(data[status_col].table().as_data_frame())

    plt.plot(count[i-1][status_col].tolist(), count[i-1]['Count'].tolist())

# arange() excludes its stop value, so stop=8 ended the ticks at 7. The dataset
# description lists delay codes of 8 and above, so stop at 9 to keep a tick for 8.
plt.xticks(arange(-2, 9, step=1))
plt.xlabel("Repayment status")
plt.ylabel("Count")
plt.legend(["Month_1", "Month_2", "Month_3", "Month_4", "Month_5", "Month_6"])

I admit the line plot isn't really that good looking, but it does give us some useful data.

For example, there are incredibly high negative percentages. **Have to investigate this further.**

There are also values greater than 1 -> **this means that some entries have a higher payment value than the bill statement**. Is this because of bad data? Or is the payment value actually the payment for the previous month's bill? An adjustment would have to be calculated to see whether it evens out. It is also possible to genuinely pay more than the bill statement, but that should only be considered after ruling out the other explanations.

### Correlation matrix

In [None]:
# Correlation matrix over every column of `data`. All columns are selected, so
# the result also covers the ID column and the target column.
corr_matrix = data[data.columns].cor()
corr_matrix

Better visualization:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# seaborn needs a non-H2O table, so convert the correlation frame and label its
# rows with the same column names that label its columns.
corr = corr_matrix.as_data_frame()
corr.index = corr_matrix.columns

# Fixed colour scale of [-1, 1] so the colours map directly to correlation strength.
heatmap_options = dict(annot=True, cmap='RdYlGn', vmin=-1, vmax=1)

plt.figure(figsize=(10, 10))
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation Heatmap", fontsize=16)
plt.show()

In [None]:
#Let's look at the highly-correlated features - if any are highly correlated then it may help to drop one in the feature selection stage (we don't need linearly dependent features)

#Only makes sense for numerical features (and we actually have lots of categorical features)

### Statistical metrics such as mean/median/etc (use column.describe())

In [None]:
# Summary statistics for each amount/identifier column.
# H2OFrame.describe() prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line after every column.
for col in cols_names:
    if col in not_categorical:
        data[col].describe()

### In future projects, some more DE tasks: 

#### Add plots such as certain features vs the target variable
#### Scatter plots like in https://github.com/h2oai/h2o-tutorials/blob/master/training/airlines_demo_solutions.ipynb