In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split

## 1. Credit card applications
<p>Commercial banks receive <em>a lot</em> of applications for credit cards. Many of them get rejected for many reasons, like high loan balances, low income levels, or too many inquiries on an individual's credit report, for example. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. In this notebook, we will build an automatic credit card approval predictor using machine learning techniques, just like the real banks do.</p>
<p><img src="https://assets.datacamp.com/production/project_558/img/credit_card.jpg" alt="Credit card being held in hand"></p>
<p>We'll use the <a href="http://archive.ics.uci.edu/ml/datasets/credit+approval">Credit Card Approval dataset</a> from the UCI Machine Learning Repository.

## 2. Import Pandas

1. Import pandas and alias it as pd
2. Load the dataset cc_approvals.data into a cc_apps dataframe.
    - Set the header argument to None.
3. Print the first five rows.
4. Drop the columns 11 and 13.

In [2]:
import pandas as pd 
df = pd.read_csv("cc_approvals.data")
df.columns = ["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10","A11","A12","A13","A14","A15","A16"]
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [3]:
df = df.drop(columns=["A12","A14"])
df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A13,A15,A16
0,a,58.67,4.460,u,g,q,h,3.04,t,t,6,g,560,+
1,a,24.50,0.500,u,g,q,h,1.50,t,f,0,g,824,+
2,b,27.83,1.540,u,g,w,v,3.75,t,t,5,g,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,s,0,+
4,b,32.08,4.000,u,g,m,v,2.50,t,f,0,g,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,b,21.08,10.085,y,p,e,h,1.25,f,f,0,g,0,-
685,a,22.67,0.750,u,g,c,v,2.00,f,t,2,g,394,-
686,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,g,1,-
687,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,g,750,-


## 3. Explore the dataset

1. Print the basic statistics.
2. Print the information of the dataset.
3. Print the last 17 rows.

In [4]:
df.describe(include="all").round()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A13,A15,A16
count,689,689,689.0,689,689,689,689,689.0,689,689,689.0,689,689.0,689
unique,3,349,,4,4,15,10,,2,2,,3,,2
top,b,?,,u,g,c,v,,t,f,,g,,-
freq,467,12,,518,518,137,398,,360,395,,624,,383
mean,,,5.0,,,,,2.0,,,2.0,,1019.0,
std,,,5.0,,,,,3.0,,,5.0,,5214.0,
min,,,0.0,,,,,0.0,,,0.0,,0.0,
25%,,,1.0,,,,,0.0,,,0.0,,0.0,
50%,,,3.0,,,,,1.0,,,0.0,,5.0,
75%,,,7.0,,,,,3.0,,,3.0,,396.0,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      689 non-null    object 
 1   A2      689 non-null    object 
 2   A3      689 non-null    float64
 3   A4      689 non-null    object 
 4   A5      689 non-null    object 
 5   A6      689 non-null    object 
 6   A7      689 non-null    object 
 7   A8      689 non-null    float64
 8   A9      689 non-null    object 
 9   A10     689 non-null    object 
 10  A11     689 non-null    int64  
 11  A13     689 non-null    object 
 12  A15     689 non-null    int64  
 13  A16     689 non-null    object 
dtypes: float64(2), int64(2), object(10)
memory usage: 75.5+ KB


In [6]:
df.tail(17)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A13,A15,A16
672,?,29.5,2.0,y,p,e,h,2.0,f,f,0,g,17,-
673,a,37.33,2.5,u,g,i,h,0.21,f,f,0,g,246,-
674,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,g,237,-
675,a,30.58,10.665,u,g,q,h,0.085,f,t,12,g,3,-
676,b,19.42,7.25,u,g,m,v,0.04,f,t,1,g,1,-
677,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,g,50,-
678,a,20.08,1.25,u,g,c,v,0.0,f,f,0,g,0,-
679,b,19.5,0.29,u,g,k,v,0.29,f,f,0,g,364,-
680,b,27.83,1.0,y,p,d,h,3.0,f,f,0,g,537,-
681,b,17.08,3.29,u,g,i,v,0.335,f,f,0,g,2,-


## 4. Train Test Split

Do not split the dataset into X and y, just split the original dataset.

random_state=42

test_size=0.33

In [7]:
from sklearn.model_selection import train_test_split
X= df.loc[:,:'A15']
y = df.loc[:,["A16"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## 5. Handling Missing Values

Convert any '?' to a NaN value from both training and testing sets.

In [8]:
import numpy as np
x =  ["A1","A2","A4","A5","A6","A7"]
for i in x:
    X_train[i].loc[(X_train[i]=="?")]=np.NaN
X_train.isna().sum()

A1     10
A2      8
A3      0
A4      6
A5      6
A6      8
A7      8
A8      0
A9      0
A10     0
A11     0
A13     0
A15     0
dtype: int64

In [9]:
for i in x:
    X_test[i].loc[(X_test[i]=="?")]=np.NaN
X_test.isna().sum()

A1     2
A2     4
A3     0
A4     0
A5     0
A6     1
A7     1
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
dtype: int64

## 6. Handling Missing Values
Impute the numerical data for both training and testing sets with mean value.

In [10]:
X_test["A2"].fillna(X_test["A2"].mean,inplace=True)
X_test.isna().sum()

A1     2
A2     0
A3     0
A4     0
A5     0
A6     1
A7     1
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
dtype: int64

In [11]:
X_train["A2"].fillna(X_train["A2"].mean,inplace=True)
X_train.isna().sum()

A1     10
A2      0
A3      0
A4      6
A5      6
A6      8
A7      8
A8      0
A9      0
A10     0
A11     0
A13     0
A15     0
dtype: int64

## 7. Handling Missing Values

Impute the categorical data for both training and testing sets with mode value.

In [12]:
x =  ["A1","A4","A5","A6","A7"]
for i in x:
    X_train[i].fillna(X_train[i].mod,inplace=True)
X_train.isna().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
dtype: int64

In [13]:
x =  ["A1","A4","A5","A6","A7"]
for i in x:
    X_test[i].fillna(X_test[i].mod,inplace=True)
X_test.isna().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
dtype: int64

## 8. Encoding

The columns 0, 3, 4, 5, 6, 8, 9, and 12 are categorical, there are several methods we can use to encode the categorical columns. One of the method called get_dummies().

Use get_dummies() function to convert the categorical columns to a numerical columns (for training the machine learning algorithms).

Do not forget to convert both training and testing sets.

In [14]:
x =  ["A1","A2","A4","A5","A6","A7"]
for i in x:
    df[i].loc[(df[i]=="?")]=np.NaN
def convert_to_float(value):
    try:
        return float(value)
    except:
        return float(np.NaN) 

df["A2"] = df["A2"].apply(convert_to_float)
df.isna().sum()

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A13     0
A15     0
A16     0
dtype: int64

In [15]:
mean_v =df["A2"].mean()
print(mean_v)
df["A2"].fillna(value =mean_v,inplace=True)
df.isna().sum()

31.56926144756278


A1     12
A2      0
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A13     0
A15     0
A16     0
dtype: int64

In [16]:
x =  ["A1","A4","A5","A6","A7"]
for i in x:
    mode_v =df[i].mode()
    print(mode_v[0])
    df[i].fillna(value=mode_v[0],inplace=True)
df.isna().sum()

b
u
g
c
v


A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
A16    0
dtype: int64

In [17]:
s = df.drop(columns=["A16","A2","A3","A8","A11","A15"])
encoded = pd.get_dummies(s,dtype=int)
prefix_list = ["A1", "A4", "A5", "A6", "A7", "A9", "A10", "A13"]
q = df.drop(columns=prefix_list)
q["A1_b"]=encoded["A1_b"]
dff = pd.merge(encoded,q)
dff

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A10_t,A13_g,A13_p,A13_s,A2,A3,A8,A11,A15,A16
0,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,58.67,4.460,3.040,6,560,+
1,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,24.50,0.500,1.500,0,824,+
2,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,22.92,11.585,0.040,0,1349,+
3,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,38.25,6.000,1.000,0,0,+
4,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,45.83,10.500,5.000,7,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273536,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,36.42,0.750,0.585,0,3,-
273537,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,40.58,3.290,3.500,0,0,-
273538,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,21.08,10.085,1.250,0,0,-
273539,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,17.92,0.205,0.040,0,750,-


In [18]:
from sklearn.model_selection import train_test_split
X= dff.loc[:,:'A15']
y = dff.loc[:,["A16"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## 9. Split into features and target

X_train and y_train will take 462 rows.
X_test and y_test will take 228 rows.

In [19]:
X_train = X_train.head(462)
y_train= y_train.head(462)
X_test = X_test.head(228)
y_test = y_test.head(228)

## 10. Normalization

In [20]:
from sklearn.preprocessing import MinMaxScaler
scaler  = MinMaxScaler().fit(X_test)
scaled_data = scaler.transform(X_test)
c =dff.columns.tolist()
c.pop()
X_test_n = pd.DataFrame(scaled_data,columns=c)
X_test_n

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A10_f,A10_t,A13_g,A13_p,A13_s,A2,A3,A8,A11,A15
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.546103,0.163750,0.035088,0.000000,0.00000
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.248826,0.125000,0.122807,0.044776,0.00000
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.250329,0.464286,0.017544,0.000000,0.00000
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.000000,0.321429,0.140351,0.014925,0.00006
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.134460,0.089286,0.002982,0.000000,0.04208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.126761,0.375000,0.046842,0.000000,0.00000
224,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.233052,0.178571,0.385965,0.000000,0.00000
225,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.120376,0.392857,0.070175,0.014925,0.00278
226,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.402066,0.232143,0.149123,0.179104,0.00000


In [21]:
scaler1  = MinMaxScaler().fit(X_train)
scaled_data = scaler1.transform(X_train)
c =dff.columns.tolist()
c.pop()
X_train_n = pd.DataFrame(scaled_data,columns=c)
X_train_n

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A10_f,A10_t,A13_g,A13_p,A13_s,A2,A3,A8,A11,A15
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.179841,0.076716,0.08325,0.014925,0.000639
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.235397,0.158816,0.02500,0.000000,0.000000
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.294921,0.112158,0.06250,0.000000,0.000000
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.282845,0.001795,0.21250,0.000000,0.000000
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.206349,0.050471,0.06250,0.000000,0.169346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.064762,0.493495,0.05000,0.164179,0.095893
458,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.461587,0.056079,0.69375,0.014925,0.003580
459,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.969524,0.852400,0.00200,0.029851,0.011219
460,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.197143,0.011216,0.00000,0.000000,0.000000


In [22]:
y_train = y_train.replace({'-':0, '+':1})
y_test= y_test.replace({'-':0, '+':1})
y_test

Unnamed: 0,A16
16707,0
185484,0
229437,0
266077,0
116674,0
...,...
81100,1
261729,1
96410,1
270291,1


## 11. Train a Logistic Regression

In [56]:
logmodel = LogisticRegression()
logmodel

In [57]:
logmodel.fit(X_train_n,y_train)

## 12. Make predictions and evaluate the Logistic Regression Model

In [58]:
y_p=logmodel.predict(X_test_n)

In [59]:
from sklearn.metrics import \
     classification_report, confusion_matrix,\
     accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [60]:
print("score of the training data :",logmodel.score(X_train_n,y_train))
print("score of the testing data :",logmodel.score(X_test_n,y_test))
print(f'The accuracy of the model : {accuracy_score(y_test, y_p)}')
print(f'The Sensitivity of the model : {recall_score(y_test, y_p)}')
Specificity= 19/(19+2)
print(f'The Specificity of the model : {Specificity}')
print(f'The AUC-ROC of the model : {roc_auc_score(y_test, y_p)}')

score of the training data : 0.7813852813852814
score of the testing data : 0.7280701754385965
The accuracy of the model : 0.7280701754385965
The Sensitivity of the model : 0.44565217391304346
The Specificity of the model : 0.9047619047619048
The AUC-ROC of the model : 0.6823849104859335


## 13. Repeat the steps 11 and 12 for SVM, DT, and RF

# SVM

In [61]:
from sklearn import svm
clf=svm.SVC(kernel='linear', C=100)

In [62]:
clf.fit(X_train_n, y_train)

In [63]:
y_p=clf.predict(X_test_n)

In [64]:
print("score of the training data :",clf.score(X_train_n,y_train))
print("score of the testing data :",clf.score(X_test_n,y_test))
print(f'The accuracy of the model : {accuracy_score(y_test, y_p)}')
print(f'The Sensitivity of the model : {recall_score(y_test, y_p)}')
Specificity= 19/(19+2)
print(f'The Specificity of the model : {Specificity}')
print(f'The AUC-ROC of the model : {roc_auc_score(y_test, y_p)}')

score of the training data : 0.8095238095238095
score of the testing data : 0.7894736842105263
The accuracy of the model : 0.7894736842105263
The Sensitivity of the model : 0.5869565217391305
The Specificity of the model : 0.9047619047619048
The AUC-ROC of the model : 0.7567135549872123


# DT

In [65]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train_n, y_train)
y_p = dt.predict(X_test_n)

In [66]:
print("score of the training data :",dt.score(X_train_n,y_train))
print("score of the testing data :",dt.score(X_test_n,y_test))
print(f'The accuracy of the model : {accuracy_score(y_test, y_p)}')
print(f'The Sensitivity of the model : {recall_score(y_test, y_p)}')
Specificity= 19/(19+2)
print(f'The Specificity of the model : {Specificity}')
print(f'The AUC-ROC of the model : {roc_auc_score(y_test, y_p)}')

score of the training data : 1.0
score of the testing data : 0.7149122807017544
The accuracy of the model : 0.7149122807017544
The Sensitivity of the model : 0.6413043478260869
The Specificity of the model : 0.9047619047619048
The AUC-ROC of the model : 0.703005115089514


# RF

In [67]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train_n, y_train)
y_p = clf.predict(X_test_n)

In [68]:
print("score of the training data :",rf.score(X_train_n,y_train))
print("score of the testing data :",rf.score(X_test_n,y_test))
print(f'The accuracy of the model : {accuracy_score(y_test, y_p)}')
print(f'The Sensitivity of the model : {recall_score(y_test, y_p)}')
Specificity= 19/(19+2)
print(f'The Specificity of the model : {Specificity}')
print(f'The AUC-ROC of the model : {roc_auc_score(y_test, y_p)}')

score of the training data : 1.0
score of the testing data : 0.8026315789473685
The accuracy of the model : 0.7894736842105263
The Sensitivity of the model : 0.5869565217391305
The Specificity of the model : 0.9047619047619048
The AUC-ROC of the model : 0.7567135549872123
