In [3]:
import pandas as pd  # to load and manipulate data for One-Hot-Encoding
import numpy as np  # to calculate mean and standard deviation
%matplotlib widget
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample  # downsample dataset
from sklearn.preprocessing import scale  # scale and center data
from sklearn.preprocessing import StandardScaler  # fit scaling on the training set, reuse it on the test set
from sklearn.svm import SVC  # svm for classification
from sklearn.model_selection import train_test_split  # to split data in training and testing sets
from sklearn.model_selection import GridSearchCV  # this will do cross validation
from sklearn.metrics import confusion_matrix  # to create a confusion matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import PCA  # do PCA and plot data

## import data

In [4]:
# Load the tab-separated export; header=1 takes the column names from the
# second line of the file.
df = (
    pd.read_csv("default of credit card clients.tsv", header=1, sep="\t")
    # the original label column name is too long to work with comfortably
    .rename(columns={"default payment next month": "DEFAULT"})
    # the ID column carries no information for the ML model
    .drop(columns="ID")
)

# or download the original spreadsheet instead (it is an .xls file, so it
# needs read_excel rather than a separator-based reader):
# df = pd.read_excel("https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls",
#                    header=1)

df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


## identifying missing data

In [5]:
# inspect the column types: a non-numeric (object) column would hint at
# text placeholders for missing values
df.dtypes

LIMIT_BAL    int64
SEX          int64
EDUCATION    int64
MARRIAGE     int64
AGE          int64
PAY_0        int64
PAY_2        int64
PAY_3        int64
PAY_4        int64
PAY_5        int64
PAY_6        int64
BILL_AMT1    int64
BILL_AMT2    int64
BILL_AMT3    int64
BILL_AMT4    int64
BILL_AMT5    int64
BILL_AMT6    int64
PAY_AMT1     int64
PAY_AMT2     int64
PAY_AMT3     int64
PAY_AMT4     int64
PAY_AMT5     int64
PAY_AMT6     int64
DEFAULT      int64
dtype: object

In [6]:
# EDUCATION should only contain the values 1, 2, 3 and 4
df["EDUCATION"].unique()

array([2, 1, 3, 5, 4, 6, 0], dtype=int64)

In [7]:
# MARRIAGE should only contain the values 1, 2 and 3
df["MARRIAGE"].unique()

array([1, 2, 3, 0], dtype=int64)

## dealing with missing data
* ### only the rows containing a 0 are removed. The other unexpected values in EDUCATION (5 and 6) are left as they are
* ### relatively few rows are affected, so simply deleting them is fine

In [8]:
# A row counts as "missing" when EDUCATION or MARRIAGE holds the value 0.
missing_mask = df["EDUCATION"].eq(0) | df["MARRIAGE"].eq(0)
n_missing = int(missing_mask.sum())

# absolute number of affected rows, then their share of the whole data set
print("missing data: ", n_missing)
print("missing data: %", n_missing / len(df) * 100)

# keep only the rows where neither of the two columns is 0
df_no_missing = df.loc[~missing_mask]

missing data:  68
missing data: % 0.002266666666666667


In [9]:
# the 0 entries are gone now; the other values of EDUCATION are untouched
df_no_missing["EDUCATION"].unique()

array([2, 1, 3, 5, 4, 6], dtype=int64)

In [10]:
# check which codes occur in the SEX column of the cleaned frame
df_no_missing["SEX"].unique()

array([2, 1], dtype=int64)

## downsampling
* ### our data set is too big & svm would take too much time
* ### Note: use random_state to recreate the same model

In [11]:
# split the cleaned frame by class label so each class can be sampled on its own
default_flag = df_no_missing["DEFAULT"]
df_no_default = df_no_missing.loc[default_flag.eq(0)]
df_default = df_no_missing.loc[default_flag.eq(1)]

In [12]:
# draw 1000 rows from each class without replacement; the fixed seed makes
# the draw repeatable
sampling_options = dict(replace=False, n_samples=1000, random_state=42)
df_no_default_downsampled = resample(df_no_default, **sampling_options)
df_default_downsampled = resample(df_default, **sampling_options)

# stack the two samples: non-defaulters first, then defaulters
df_downsample = pd.concat([df_no_default_downsampled, df_default_downsampled])
df_downsample.shape

(2000, 24)

## split data

In [13]:
# feature matrix: every column except the DEFAULT label, as an independent copy
X = df_downsample.drop(columns="DEFAULT").copy()
X

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
641,130000,2,3,1,28,0,0,0,0,-2,...,50000,0,0,0,2500,1000,0,0,0,0
4678,170000,1,3,1,29,0,0,0,0,0,...,172307,35234,32869,33862,7200,7500,1200,1200,1500,1300
16004,180000,2,2,1,29,0,0,0,0,0,...,26310,26662,26166,26176,1800,1800,1500,1056,950,1000
22974,210000,2,2,2,32,-2,-2,-2,-2,-2,...,410,0,0,0,979,412,0,0,0,0
17535,190000,2,3,1,45,0,0,0,0,0,...,80548,81778,83082,84811,3300,3331,3359,2663,2751,3000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6154,30000,2,1,2,34,1,2,2,2,2,...,24053,25624,25977,25413,1396,0,1967,903,0,1100
14072,320000,1,1,1,43,-1,0,0,0,0,...,32159,31851,3105,-45,10000,5000,0,0,0,0
5387,50000,1,2,2,23,-1,-1,0,0,-1,...,20816,11804,1420,0,15000,20300,9500,1420,0,0
23192,150000,2,3,1,43,0,0,0,0,0,...,21000,22013,22449,22922,1973,1353,1366,802,833,837


In [14]:
# target vector: the DEFAULT label column, copied so that it is independent
# of df_downsample
y = df_downsample["DEFAULT"].copy()
y

641      0
4678     0
16004    0
22974    0
17535    0
        ..
6154     1
14072    1
5387     1
23192    1
8180     1
Name: DEFAULT, Length: 2000, dtype: int64

## one-hot encoding

In [15]:
# columns that hold category codes rather than quantities; each of them is
# expanded into one indicator column per observed value
categorical_columns = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
    "PAY_0",
    "PAY_2",
    "PAY_3",
    "PAY_4",
    "PAY_5",
    "PAY_6",
]
X_encoded = pd.get_dummies(X, columns=categorical_columns)
X_encoded.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_5_7,PAY_6_-2,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7
641,130000,28,100143,50456,50000,0,0,0,2500,1000,...,0,1,0,0,0,0,0,0,0,0
4678,170000,29,165027,168990,172307,35234,32869,33862,7200,7500,...,0,0,0,1,0,0,0,0,0,0
16004,180000,29,25781,26000,26310,26662,26166,26176,1800,1800,...,0,0,0,1,0,0,0,0,0,0
22974,210000,32,355,975,410,0,0,0,979,412,...,0,1,0,0,0,0,0,0,0,0
17535,190000,45,76433,78472,80548,81778,83082,84811,3300,3331,...,0,0,0,1,0,0,0,0,0,0


## centering and scaling data
* ### the input data X should always be centered and scaled before training an svm (and many other ml models)

In [16]:
# train_test_split defaults: test_size=0.25 & shuffle=True
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

# Learn the centering/scaling statistics from the training rows only and apply
# those same statistics to the test rows. Standardizing each split with its
# own mean and standard deviation would put the two splits into different
# feature spaces and let test-set statistics influence the evaluation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## preliminary svm

In [17]:
# baseline model: apart from the seed, every SVC hyperparameter is left at
# its default value
clf_svm = SVC(random_state=42)
clf_svm.fit(X_train_scaled, y_train)

SVC(random_state=42)

In [18]:
# confusion matrix of the preliminary model on the held-out test split
class_names = ["Did not default", "Defaulted"]
fig, ax = plt.subplots(figsize=(7, 4))
plot_confusion_matrix(
    clf_svm,
    X_test_scaled,
    y_test,
    display_labels=class_names,
    values_format="d",  # show the counts as plain integers
    ax=ax,
)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## optimize hyperparameters with grid search
* ### Note: svm works well out of the box, so optimizing does not have a huge impact
* ### specify a set of values & GridSearchCV tests all possible combinations of the parameters
* ### using radial basis function as kernel

In [19]:
# Candidate values for the RBF kernel (SVC defaults: C=1 & gamma="scale").
rbf_grid = {
    "C": [0.5, 1, 10, 100],  # C must be > 0
    "gamma": ["scale", 1, 0.1, 0.01, 0.001, 0.0001],
    "kernel": ["rbf"],
}
param_grid = [rbf_grid]

# every C/gamma combination is scored by cross validation
optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,  # number of folds in cross validation
    scoring="accuracy",  # many other scoring methods are available
    verbose=0,
)
optimal_params.fit(X_train_scaled, y_train)

# recorded run: C = 100 and gamma = 0.001
optimal_params.best_params_

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

## final svm
* ### with optimal hyperparameters: C=100 & gamma=0.001

In [20]:
# Train the final model with the hyperparameters selected by the grid search
# above. They are read from the search object instead of being copied by hand,
# so this cell cannot drift out of sync when the search is re-run on changed
# data (recorded run: C=100, gamma=0.001, kernel="rbf").
# NOTE: relies on the cells being run top to bottom, i.e. `optimal_params`
# must still hold the search fitted on X_train_scaled.
clf_svm = SVC(random_state=42, **optimal_params.best_params_)
clf_svm.fit(X_train_scaled, y_train)

SVC(C=100, gamma=0.001, random_state=42)

In [21]:
# confusion matrix of the tuned model on the held-out test split
class_names = ["Did not default", "Defaulted"]
fig, ax = plt.subplots(figsize=(7, 4))
plot_confusion_matrix(
    clf_svm,
    X_test_scaled,
    y_test,
    display_labels=class_names,
    values_format="d",  # show the counts as plain integers
    ax=ax,
)
fig.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## pca
* ### pca performs poorly with this data set. Not recommended 

In [22]:
# PCA() centers the data by default but does not scale it, which is why the
# already standardized X_train_scaled is passed in. fit_transform fits the
# components and returns the training data projected onto them in one step.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)

# share of the total variance explained by each component, in percent
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
component_numbers = range(1, len(per_var) + 1)
# component numbers as strings (not used by the plot below, because the
# x tick labels are switched off)
labels = [str(x) for x in component_numbers]

fig, ax = plt.subplots(figsize=(7, 4))

# one bar per principal component, bar height = explained variance in percent
ax.bar(x=component_numbers, height=per_var)
ax.tick_params(
    axis="x",  # changes apply to the x-axis
    which="both",  # both major and minor ticks are affected
    bottom=False,  # ticks along the bottom edge are off
    top=False,  # ticks along the top edge are off
    labelbottom=False)  # labels along the bottom edge are off
# the components run along the x-axis and the percentages along the y-axis
ax.set_xlabel("Principal Component")
ax.set_ylabel("Percentage of Explained Variance")
ax.set_title("Scree Plot")
fig.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## grid search with pc1 & pc2
* ### PCA returns the columns of X_train_pca sorted by the explained variance of their principal component.<br>You can think of pc1 as just another column in the data set

In [23]:
# Coordinates of the training rows along the first two principal components.
# PCA orders its components by explained variance, so columns 0 and 1 of the
# projection are pc1 and pc2; each can be thought of as a column of a data set.
train_pc1_cords, train_pc2_cords = X_train_pca[:, 0], X_train_pca[:, 1]

# center and scale the two retained components
two_component_matrix = np.column_stack((train_pc1_cords, train_pc2_cords))
pca_train_scaled = scale(two_component_matrix)

# grid search for an svm trained on the two-dimensional data
# NOTE(review): 0.01 is absent from the gamma candidates - confirm that this
# is intentional.
param_grid = [
    {
        "C": [1, 10, 100, 1000],
        "gamma": [1, 0.1, 0.001, 0.0001],
        "kernel": ["rbf"],
    },
]

optimal_params = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    scoring="accuracy",
    verbose=0,
)
optimal_params.fit(pca_train_scaled, y_train)

# recorded run: C=1000 & gamma=0.001
optimal_params.best_params_

{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

## prediction & plot pca
* ### remember: pca does a poor job on this data set
* ### the code below is difficult
* ### testing data points that fall into the pink part of the graph are predicted to have not defaulted,<br> those in the yellow part to have defaulted
* ### the red & green data points shown are from the training data set 



In [27]:
# Train the two-dimensional model on the standardized pc1/pc2 training
# coordinates, using the hyperparameters selected by the PCA grid search above
# (recorded run: C=1000, gamma=0.001). Reading them from the search object
# keeps this cell in sync when the search is re-run.
clf_svm = SVC(random_state=42, **optimal_params.best_params_)
clf_svm.fit(pca_train_scaled, y_train)

# The classifier was fitted on pca_train_scaled, so the plotted points and the
# decision-surface grid have to use exactly those standardized coordinates.
# Projecting X_train_scaled again with pca.transform would yield the unscaled
# coordinates, a space the classifier was never trained on.
# The points drawn are the training rows, coloured by y_train.
plot_pc1_coords = pca_train_scaled[:, 0]
plot_pc2_coords = pca_train_scaled[:, 1]

# matrix of points to show the decision regions; it is a little larger than
# the range of the plotted points so that none of them sits on the edge
x_min = plot_pc1_coords.min() - 1
x_max = plot_pc1_coords.max() + 1

y_min = plot_pc2_coords.min() - 1
y_max = plot_pc2_coords.max() + 1

# meshgrid: return coordinate matrices from coordinate vectors (grid like)
xx, yy = np.meshgrid(
    np.arange(start=x_min, stop=x_max, step=0.01),
    np.arange(start=y_min, stop=y_max, step=0.01),
)

# classify every grid point with the svm: points on one side of the decision
# boundary get 0, points on the other side get 1. predict returns one flat
# array, so it is reshaped back to the grid layout.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf_svm.predict(grid_points).reshape(xx.shape)

fig, ax = plt.subplots(figsize=(10, 10))

# filled contour plot: the regions are filled according to the predicted
# class (0 or 1) stored in Z
ax.contourf(xx, yy, Z, alpha=0.1)

# custom colors for the actual data points
cmap = colors.ListedColormap(["#e41a1c", "#4daf4a"])
# draw the data points, colored by their known (not predicted) class;
# alpha=0.7 makes overlapping points visible
scatter = ax.scatter(plot_pc1_coords, plot_pc2_coords, c=y_train,
                     cmap=cmap, s=100, edgecolors="k", alpha=0.7)

# one legend entry per class value (0, then 1), with readable names
legend_handles, _ = scatter.legend_elements()
legend = ax.legend(legend_handles, ["No Default", "Yes Default"], loc="upper right")

ax.set_ylabel("PC2")
ax.set_xlabel("PC1")
ax.set_title("Decision surface using PCA transformed features")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …