# Predicting Credit Card Default

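If you are using Windows, the Graphviz executables must be on your PATH for the decision-tree diagrams in this notebook to render. Don't forget to add:

C:\Users\"user_name"\Anaconda3\"environment_name"\Library\bin\graphviz\

to the PATH environment variable (replace "user_name" and "environment_name" with your own values).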

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

### Back to the credit card default dataset

In [None]:
# Read the credit card default CSV, using its "ID" column as the index
DATA_DIR = '../data'
FILE_NAME = 'credit_card_default.csv'
data_path = os.path.join(DATA_DIR, FILE_NAME)
ccd = (
    pd.read_csv(data_path, index_col="ID")
    # lower-case every column name
    .rename(columns=str.lower)
    # give the target column a short name
    .rename(columns={'default payment next month':'default'})
)

# Column-name groups: six monthly bill amounts, six monthly payment amounts,
# and the full list of numerical columns (credit limit and age come first).
bill_amt_features = ['bill_amt{}'.format(month) for month in range(1, 7)]
pay_amt_features = ['pay_amt{}'.format(month) for month in range(1, 7)]
numerical_features = ['limit_bal', 'age', *bill_amt_features, *pay_amt_features]

# Creating binary (0/1) features from the coded categorical columns.
# Each entry is (new column, source column, code that maps to 1).
# Note: no 'high_school' indicator (education == 3) is created.
indicator_specs = [
    ('male', 'sex', 1),
    ('grad_school', 'education', 1),
    ('university', 'education', 2),
    ('married', 'marriage', 1),
]
for indicator, source, code in indicator_specs:
    ccd[indicator] = (ccd[source] == code).astype('int')

# Simplify the pay_1..pay_6 columns: every value <= 0 is collapsed to 0,
# positive values are kept unchanged.
pay_features = ['pay_{}'.format(month) for month in range(1, 7)]
ccd[pay_features] = ccd[pay_features].clip(lower=0)

# One 0/1 flag per month: 1 when the matching pay_* value is positive
delayed_features = ['delayed_{}'.format(month) for month in range(1, 7)]
for pay_col, flag_col in zip(pay_features, delayed_features):
    ccd[flag_col] = ccd[pay_col].gt(0).astype(int)

# New feature: number of months (out of the six) flagged as delayed
ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)

## Splitting the dataset

In [None]:
# Add the engineered feature to the numerical group. The guard keeps this cell
# idempotent: re-running it must not append 'months_delayed' a second time,
# which would duplicate the column in X. 'months_delayed' stays the LAST
# numerical feature; code that reads the last scaler statistic relies on that.
if 'months_delayed' not in numerical_features:
    numerical_features = numerical_features + ['months_delayed']
binary_features = ['male','married','grad_school','university']

# Design matrix (numerical columns first, then the 0/1 indicators) and target
X = ccd[numerical_features + binary_features]
y = ccd['default'].astype(int)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5/30 of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=5/30,
    random_state=101,
)

In [None]:
# 1. Import the class you will use
from sklearn.preprocessing import StandardScaler
# 2. Create an instance of the class
scaler = StandardScaler()
# 3. Use the fit method of the instance (training rows only, so that no
#    information from the test set leaks into the scaling statistics)
scaler.fit(X_train[numerical_features])
# 4. Use the transform method to perform the transformation.
#    X_train comes out of train_test_split as a selection of X, so work on an
#    explicit copy and replace whole columns instead of writing through .loc:
#    this avoids pandas' SettingWithCopyWarning and lets the columns take the
#    float dtype of the scaled values.
#    Run this cell only once: re-running it would re-fit on scaled data.
X_train = X_train.copy()
X_train[numerical_features] = scaler.transform(X_train[numerical_features])

## Logistic Regression

### A simple Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# One-feature model: default as a function of (scaled) months_delayed only.
# scikit-learn expects a 2-D feature array, hence the single-column selection.
months_delayed_train = X_train[['months_delayed']].values
simple_log_reg = LogisticRegression(C=1e6)
simple_log_reg.fit(months_delayed_train, y_train)

In [None]:
# Intercept (W0) and slope (W1) of the one-feature model
w0 = simple_log_reg.intercept_[0]
w1 = simple_log_reg.coef_[0][0]
print("W0: {}, W1: {}".format(w0, w1))

In [None]:
def get_probs(months_delayed):
    """Return the default probability given by ``simple_log_reg``.

    The input is standardized with the last mean/variance stored in the
    global ``scaler`` and then passed through the logistic function using the
    intercept and single coefficient of the global ``simple_log_reg``.

    Parameters
    ----------
    months_delayed : number or numpy array
        Months delayed on the original (unscaled) scale.

    Returns
    -------
    number or numpy array
        ``1 / (1 + exp(-(w0 + w1 * z)))`` where ``z`` is the standardized
        input; same shape as ``months_delayed``.

    Notes
    -----
    Assumes ``months_delayed`` was the last column the scaler was fitted on.
    """
    center = scaler.mean_[-1]
    spread = scaler.var_[-1] ** 0.5
    z = (months_delayed - center) / spread

    w0 = simple_log_reg.intercept_[0]
    w1 = simple_log_reg.coef_[0][0]
    return 1 / (1 + np.exp(-(w0 + w1 * z)))

In [None]:
# Predicted default probability for 0, 1, ..., 12 months delayed, shown as a table
months = np.arange(0, 13)
pred_probs = get_probs(months)
pd.DataFrame(dict(months=months, pred_probs=pred_probs))

In [None]:
# Line plot of the predicted probability against months delayed
fig, ax = plt.subplots()
ax.plot(months, pred_probs)
ax.set(xlabel='Months delayed', ylabel='Probability of default')
ax.grid()

### A complete Logistic Regression model

In [None]:
# Logistic regression on every feature column of the training set
log_reg = LogisticRegression(C=1e6)
log_reg.fit(X=X_train, y=y_train)

In [None]:
# Per-class predicted probabilities for the training rows; show the first ten
prob_log_reg = log_reg.predict_proba(X_train)
prob_log_reg[0:10]

In [None]:
# Predicted class labels for the training rows; show the first ten
y_pred_log_reg = log_reg.predict(X_train)
y_pred_log_reg[0:10]

In [None]:
# Check that the predicted labels match thresholding the second probability
# column at 0.5
label_from_prob = prob_log_reg[:, 1] > 0.5
np.all(y_pred_log_reg == label_from_prob)

In [None]:
# Fitted coefficients labelled by feature name, largest first, 2 decimals
coef_by_feature = pd.Series(data=log_reg.coef_[0], index=X_train.columns)
coef_by_feature.sort_values(ascending=False).round(2)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the full logistic regression on the training set
accuracy_log_reg = accuracy_score(y_train, y_pred_log_reg)
accuracy_log_reg

## Classification Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A shallow tree (depth limited to 3) so that it can be drawn and read easily
class_tree = DecisionTreeClassifier(max_depth=3)
class_tree.fit(X=X_train, y=y_train)

In [None]:
# Tools for drawing the fitted tree.
# StringIO comes from the standard library: the vendored copy under
# sklearn.externals.six has been removed from scikit-learn, so importing it
# from there fails on current versions.
from io import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

In [None]:
# Export the fitted tree to Graphviz DOT text, then render it inline as a PNG
graphviz_options = dict(
    filled=True,
    rounded=True,
    feature_names=X_train.columns,
    class_names=['pay','default'],
    special_characters=True,
)
dot_data = StringIO()
export_graphviz(decision_tree=class_tree, out_file=dot_data, **graphviz_options)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# Same diagram as before, but with proportion=True passed to export_graphviz
graphviz_options = dict(
    filled=True,
    rounded=True,
    proportion=True,
    feature_names=X_train.columns,
    class_names=['pay','default'],
    special_characters=True,
)
dot_data = StringIO()
export_graphviz(decision_tree=class_tree, out_file=dot_data, **graphviz_options)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

### How trees work

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# Toy 2-D data set with two clusters (A: coordinates, b: cluster labels),
# plotted with one colour per label
blob_centers = [[-0.5,-1],[0.5,0.5]]
A, b = make_blobs(n_samples=200, n_features=2, centers=blob_centers,
                  cluster_std=0.6, shuffle=False, random_state=42)
plt.scatter(A[:, 0], A[:, 1], c=b)
plt.xlabel('X1', fontsize=15)
plt.ylabel('X2', fontsize=15);

In [None]:
# First split: a horizontal line at X2 = -0.6
x1, x2 = A[:, 0], A[:, 1]
plt.scatter(x1, x2, c=b)
plt.axhline(y=-0.6, c='red')
plt.xlabel('X1', fontsize=15)
plt.ylabel('X2', fontsize=15);

In [None]:
# Second split: a vertical line at X1 = -0.1, drawn only from 34% of the axes
# height upwards (ymin is a fraction of the axes height, not a data value)
x1, x2 = A[:, 0], A[:, 1]
plt.scatter(x1, x2, c=b)
plt.axhline(y=-0.6, c='red')
plt.axvline(x=-0.1, ymin=0.34, c='red')
plt.xlabel('X1', fontsize=15)
plt.ylabel('X2', fontsize=15);

In [None]:
# Third split: a vertical line at X1 = 0.7, drawn only up to 34% of the axes
# height (ymax is a fraction of the axes height, not a data value)
x1, x2 = A[:, 0], A[:, 1]
plt.scatter(x1, x2, c=b)
plt.axhline(y=-0.6, c='red')
plt.axvline(x=-0.1, ymin=0.34, c='red')
plt.axvline(x=0.7, ymax=0.34, c='red')
plt.xlabel('X1', fontsize=15)
plt.ylabel('X2', fontsize=15);

### Training a larger classification tree

In [None]:
# A deeper tree (max depth 6, min_samples_split=50); this replaces the
# shallow tree bound to class_tree
tree_params = dict(max_depth=6, min_samples_split=50)
class_tree = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)
y_pred_class_tree = class_tree.predict(X_train)

In [None]:
# Accuracy of the larger tree on the training set
accuracy_class_tree = accuracy_score(y_train, y_pred_class_tree)
accuracy_class_tree

In [None]:
# Feature importances of the tree, labelled by feature, largest first, 3 decimals
tree_importances = pd.Series(data=class_tree.feature_importances_, index=X_train.columns)
tree_importances.sort_values(ascending=False).round(3)

In [None]:
# The same importances as a bar chart, largest first
tree_importances = pd.Series(data=class_tree.feature_importances_, index=X_train.columns)
tree_importances.sort_values(ascending=False).plot.bar();

## Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed; the hyper-parameters are collected in one place
rf_params = dict(
    n_estimators=99,
    max_features=6,
    max_depth=6,
    min_samples_split=100,
    random_state=85,
)
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)
y_pred_rf = rf.predict(X_train)

In [None]:
# Accuracy of the random forest on the training set
accuracy_rf = accuracy_score(y_train, y_pred_rf)
accuracy_rf

In [None]:
# Feature importances of the forest, labelled by feature, largest first, 3 decimals
rf_importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
rf_importances.sort_values(ascending=False).round(3)

## Training vs Testing Error

In [None]:
# Baseline "null" model: predict class 0 for every test row and measure its accuracy
y_pred_null = np.zeros_like(y_test)
accuracy_score(y_test, y_pred_null)

In [None]:
## Remember to also standardize the numerical features in the testing set,
## using the scaler that was fitted on the training set (transform only, no re-fit).
## X_test comes out of train_test_split as a selection of X, so work on an
## explicit copy and replace whole columns instead of writing through .loc:
## this avoids pandas' SettingWithCopyWarning and lets the columns take the
## float dtype of the scaled values.
## Run this cell only once: re-running it would scale already-scaled values.
X_test = X_test.copy()
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [None]:
## Calculating accuracy of every fitted model on the training and testing sets
model_dict = {'LogisticReg': log_reg, 'ClassTree': class_tree, 'RF': rf}

# dtype=float makes this a numeric table from the start; without it the empty
# frame is object-dtype and the accuracies would be stored as Python objects,
# which gets in the way of rounding, arithmetic and plotting.
accuracies = pd.DataFrame(columns=['train', 'test'],
                          index=['LogisticReg','ClassTree','RF'],
                          dtype=float)
for name, model in model_dict.items():
    accuracies.loc[name, 'train'] = accuracy_score(y_true=y_train, y_pred=model.predict(X_train))
    accuracies.loc[name, 'test'] = accuracy_score(y_true=y_test, y_pred=model.predict(X_test))

accuracies

In [None]:
# Horizontal bars of train/test accuracy per model, ordered by test accuracy;
# the bars get a higher zorder than the grid so the grid is drawn behind them
fig, ax = plt.subplots()
ranked_accuracies = accuracies.sort_values(by='test', ascending=False)
ranked_accuracies.plot.barh(ax=ax, zorder=3)
ax.grid(zorder=0)

## Multiclass classification

In [None]:
# Load the iris dataset
from sklearn.datasets import load_iris

iris = load_iris()
iris_features, iris_labels = iris.data, iris.target

# Fit a logistic regression on the whole data set, then get the per-class
# probabilities and the predicted class for the same rows
iris_log_reg = LogisticRegression(C=1e5).fit(iris_features, iris_labels)
iris_probs = iris_log_reg.predict_proba(iris_features)
iris_pred = iris_log_reg.predict(iris_features)

In [None]:
# One column of predicted probability per class (4 decimals) plus the name of
# the predicted class; display 12 randomly sampled rows
iris_pred_df = pd.DataFrame(data=iris_probs, columns=iris.target_names)
iris_pred_df = iris_pred_df.round(4)
predicted_names = iris.target_names[iris_pred]
iris_pred_df['predicted_class'] = predicted_names
iris_pred_df.sample(n=12)