In [308]:
import sys                             # Read system parameters.
import warnings 
warnings.filterwarnings("ignore")
import numpy as np                     # Work with multi-dimensional arrays and matrices.
import pandas as pd                    # Manipulate and analyze data.
import matplotlib as mpl               # Create 2D charts.
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns                   # Perform data visualization.
import sklearn                         # Perform data mining and analysis.
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, recall_score, f1_score)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from sklearn.tree import export_graphviz
from IPython.display import Image

from ucimlrepo import fetch_ucirepo 

## 1. Credit card applications
<p>Commercial banks receive <em>a lot</em> of applications for credit cards. Many of them get rejected for many reasons, like high loan balances, low income levels, or too many inquiries on an individual's credit report, for example. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. In this notebook, we will build an automatic credit card approval predictor using machine learning techniques, just like the real banks do.</p>
<p><img src="https://assets.datacamp.com/production/project_558/img/credit_card.jpg" alt="Credit card being held in hand"></p>
<p>We'll use the <a href="http://archive.ics.uci.edu/ml/datasets/credit+approval">Credit Card Approval dataset</a> from the UCI Machine Learning Repository.

## 2. Import Pandas

1. Import pandas and alias it as pd
2. Load the dataset cc_approvals.data into a cc_apps dataframe.
    - Set the header argument to None.
3. Print the first five rows.
4. Drop the columns 11 and 13.

In [322]:
df = pd.read_csv("datasets/cc_approvals.data",header=None)
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [323]:
df.drop(columns=[11, 13], inplace=True)

In [324]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,12,14,15
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,g,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,g,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,g,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,g,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,s,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,g,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,g,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,g,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,g,750,-


## 3. Explore the dataset

1. Print the basic statistics.
2. Print the information of the dataset.
3. Print the last 17 rows.

In [325]:
df.describe

<bound method NDFrame.describe of     0      1       2  3  4   5   6     7  8  9   10 12   14 15
0    b  30.83   0.000  u  g   w   v  1.25  t  t   1  g    0  +
1    a  58.67   4.460  u  g   q   h  3.04  t  t   6  g  560  +
2    a  24.50   0.500  u  g   q   h  1.50  t  f   0  g  824  +
3    b  27.83   1.540  u  g   w   v  3.75  t  t   5  g    3  +
4    b  20.17   5.625  u  g   w   v  1.71  t  f   0  s    0  +
..  ..    ...     ... .. ..  ..  ..   ... .. ..  .. ..  ... ..
685  b  21.08  10.085  y  p   e   h  1.25  f  f   0  g    0  -
686  a  22.67   0.750  u  g   c   v  2.00  f  t   2  g  394  -
687  a  25.25  13.500  y  p  ff  ff  2.00  f  t   1  g    1  -
688  b  17.92   0.205  u  g  aa   v  0.04  f  f   0  g  750  -
689  b  35.00   3.375  u  g   c   h  8.29  f  f   0  g    0  -

[690 rows x 14 columns]>

In [326]:
df.info

<bound method DataFrame.info of     0      1       2  3  4   5   6     7  8  9   10 12   14 15
0    b  30.83   0.000  u  g   w   v  1.25  t  t   1  g    0  +
1    a  58.67   4.460  u  g   q   h  3.04  t  t   6  g  560  +
2    a  24.50   0.500  u  g   q   h  1.50  t  f   0  g  824  +
3    b  27.83   1.540  u  g   w   v  3.75  t  t   5  g    3  +
4    b  20.17   5.625  u  g   w   v  1.71  t  f   0  s    0  +
..  ..    ...     ... .. ..  ..  ..   ... .. ..  .. ..  ... ..
685  b  21.08  10.085  y  p   e   h  1.25  f  f   0  g    0  -
686  a  22.67   0.750  u  g   c   v  2.00  f  t   2  g  394  -
687  a  25.25  13.500  y  p  ff  ff  2.00  f  t   1  g    1  -
688  b  17.92   0.205  u  g  aa   v  0.04  f  f   0  g  750  -
689  b  35.00   3.375  u  g   c   h  8.29  f  f   0  g    0  -

[690 rows x 14 columns]>

In [327]:
df.dtypes

0      object
1      object
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
12     object
14      int64
15     object
dtype: object

In [328]:
df.count()

0     690
1     690
2     690
3     690
4     690
5     690
6     690
7     690
8     690
9     690
10    690
12    690
14    690
15    690
dtype: int64

In [329]:
df.value_counts()

0  1      2       3  4  5   6  7      8  9  10  12  14    15
?  20.08  0.125   u  g  q   v  1.000  f  t  1   g   768   +     1
b  30.17  6.500   u  g  cc  v  3.125  t  t  8   g   1200  +     1
   29.67  1.415   u  g  w   h  0.750  t  t  1   g   100   +     1
   29.83  1.250   y  p  k   v  0.250  f  f  0   g   0     -     1
          2.040   y  p  x   h  0.040  f  f  0   g   1     -     1
                                                               ..
   16.50  0.125   u  g  c   v  0.165  f  f  0   g   0     -     1
   16.92  0.335   y  p  k   v  0.290  f  f  0   s   0     -     1
   17.08  0.085   y  p  c   v  0.040  f  f  0   g   722   -     1
          0.250   u  g  q   v  0.335  f  t  4   g   8     -     1
   ?      10.500  u  g  x   v  6.500  t  f  0   g   0     +     1
Name: count, Length: 690, dtype: int64

## 4. Train Test Split

Do not split the dataset into X and y, just split the original dataset.

random_state=42

test_size=0.33

## 5. Handling Missing Values

Convert any '?' to a NaN value from both training and testing sets.

In [330]:
df = df.replace('?', np.nan)
df.tail(17)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,12,14,15
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,g,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,g,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,g,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,g,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,g,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,g,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,g,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,g,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,g,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,g,2,-


## 6. Handling Missing Values

Impute the numerical data for both training and testing sets with mean value.

In [331]:
num_col=[1, 2, 7, 10, 14]
df[num_col].isnull().sum()

1     12
2      0
7      0
10     0
14     0
dtype: int64

In [332]:
df[1] = df[1].astype(float)
for i in num_col:
    df[i].fillna(df[i].mean(), inplace=True)

df[num_col].isnull().sum()

1     0
2     0
7     0
10    0
14    0
dtype: int64

## 7. Handling Missing Values

Impute the categorical data for both training and testing sets with mode value.

In [333]:
cat_col=[0, 3, 4, 5, 6, 8, 9,12]
df[cat_col].isnull().sum()

0     12
3      6
4      6
5      9
6      9
8      0
9      0
12     0
dtype: int64

In [334]:
for i in cat_col:
    df[i].fillna(df[i].mode().index[0], inplace=True)

print(df.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
12    0
14    0
15    0
dtype: int64


## 8. Encoding

The columns 0, 3, 4, 5, 6, 8, 9, and 12 are categorical, there are several methods we can use to encode the categorical columns. One of the method called get_dummies().

Use get_dummies() function to convert the categorical columns to a numerical columns (for training the machine learning algorithms).

Do not forget to convert both training and testing sets.

In [335]:
cat_col=[0, 3, 4, 5, 6, 8, 9,12]

df = pd.get_dummies(df, columns=cat_col, dtype=int)
df

Unnamed: 0,1,2,7,10,14,15,0_0,0_a,0_b,3_0,...,6_o,6_v,6_z,8_f,8_t,9_f,9_t,12_g,12_p,12_s
0,30.83,0.000,1.25,1,0,+,0,0,1,0,...,0,1,0,0,1,0,1,1,0,0
1,58.67,4.460,3.04,6,560,+,0,1,0,0,...,0,0,0,0,1,0,1,1,0,0
2,24.50,0.500,1.50,0,824,+,0,1,0,0,...,0,0,0,0,1,1,0,1,0,0
3,27.83,1.540,3.75,5,3,+,0,0,1,0,...,0,1,0,0,1,0,1,1,0,0
4,20.17,5.625,1.71,0,0,+,0,0,1,0,...,0,1,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,21.08,10.085,1.25,0,0,-,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
686,22.67,0.750,2.00,2,394,-,0,1,0,0,...,0,1,0,1,0,0,1,1,0,0
687,25.25,13.500,2.00,1,1,-,0,1,0,0,...,0,0,0,1,0,0,1,1,0,0
688,17.92,0.205,0.04,0,750,-,0,0,1,0,...,0,1,0,1,0,1,0,1,0,0


## 9. Split into features and target

X_train and y_train will take 462 rows.
X_test and y_test will take 228 rows.

In [336]:
df.columns = df.columns.astype(str)

In [337]:
X = df.drop(['15'], axis=1)
y = df[['15']]

X_train, X_test, y_train, y_test = train_test_split(X,y , random_state=42,test_size=0.33, shuffle=True)
print('X_train : ')
print(X_train.shape)
 
print('X_test : ')
print(X_test.shape)
 
print('y_train : ')
print(y_train.shape)
 
print('y_test : ')
print(y_test.shape)

X_train : 
(462, 48)
X_test : 
(228, 48)
y_train : 
(462, 1)
y_test : 
(228, 1)


## 10. Normalization

In [338]:
scaler = MinMaxScaler()
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

## 11. Train a Logistic Regression

In [339]:
model1= LogisticRegression()
model1.fit(rescaledX_train, y_train)
y_pred= model1.predict(rescaledX_test)

## 12. Make predictions and evaluate the Logistic Regression Model

In [340]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1score = f1_score(y_test, y_pred, average='micro')

print('precision', precision)
print('Recall', recall)
print("F1 Score", f1score)
print(f"Accuracy", accuracy)

precision 0.8421052631578947
Recall 0.8421052631578947
F1 Score 0.8421052631578947
Accuracy 0.8421052631578947


## 13. Repeat the steps 11 and 12 for SVM, DT, and RF

## SVM

In [341]:
model2 = svm.SVC(kernel='linear')
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

In [342]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1score = f1_score(y_test, y_pred, average='micro')

print('precision', precision)
print('Recall', recall)
print("F1 Score", f1score)
print(f"Accuracy", accuracy)

precision 0.8333333333333334
Recall 0.8333333333333334
F1 Score 0.8333333333333334
Accuracy 0.8333333333333334


## DT

In [343]:
model3 = DecisionTreeClassifier()
model3 = model3.fit(X_train,y_train)
y_pred = model3.predict(X_test)

In [344]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1score = f1_score(y_test, y_pred, average='micro')

print('precision', precision)
print('Recall', recall)
print("F1 Score", f1score)
print(f"Accuracy", accuracy)

precision 0.8157894736842105
Recall 0.8157894736842105
F1 Score 0.8157894736842104
Accuracy 0.8157894736842105


## RF

In [345]:
model4 = RandomForestClassifier()
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)

In [346]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1score = f1_score(y_test, y_pred, average='micro')

print('precision', precision)
print('Recall', recall)
print("F1 Score", f1score)
print(f"Accuracy", accuracy)

precision 0.8728070175438597
Recall 0.8728070175438597
F1 Score 0.8728070175438597
Accuracy 0.8728070175438597
