## 1. Credit card applications
<p>Commercial banks receive <em>a lot</em> of applications for credit cards. Many of them get rejected for many reasons, like high loan balances, low income levels, or too many inquiries on an individual's credit report, for example. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. In this notebook, we will build an automatic credit card approval predictor using machine learning techniques, just like the real banks do.</p>
<p><img src="https://assets.datacamp.com/production/project_558/img/credit_card.jpg" alt="Credit card being held in hand"></p>
<p>We'll use the <a href="http://archive.ics.uci.edu/ml/datasets/credit+approval">Credit Card Approval dataset</a> from the UCI Machine Learning Repository.

## 2. Import Pandas

1. Import pandas and alias it as pd
2. Load the dataset cc_approvals.data into a cc_apps dataframe.
    - Set the header argument to None.
3. Print the first five rows.
4. Drop the columns 11 and 13.

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [2]:
cc_apps=pd.read_csv('datasets\cc_approvals.data')
cc_apps.head(20)

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+
5,b,33.17,1.04,u,g,r,h,6.5,t,f,0,t,g,164,31285,+
6,a,22.92,11.585,u,g,cc,v,0.04,t,f,0,f,g,80,1349,+
7,b,54.42,0.5,y,p,k,h,3.96,t,f,0,f,g,180,314,+
8,b,42.5,4.915,y,p,w,v,3.165,t,f,0,t,g,52,1442,+
9,b,22.08,0.83,u,g,c,h,2.165,f,f,0,t,g,128,0,+


In [3]:
cc_apps.columns=[ 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
cc_apps

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,00043,560,+
1,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,00280,824,+
2,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,00100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
4,b,32.08,4.000,u,g,m,v,2.50,t,f,0,t,g,00360,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,00260,0,-
685,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,00200,394,-
686,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,00200,1,-
687,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,00280,750,-


# I Started with A1
#### [ 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']

In [4]:
cc_apps=cc_apps.drop(columns=['A12' ,'A14'])
cc_apps

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A13,A15,A16
0,a,58.67,4.460,u,g,q,h,3.04,t,t,6,g,560,+
1,a,24.50,0.500,u,g,q,h,1.50,t,f,0,g,824,+
2,b,27.83,1.540,u,g,w,v,3.75,t,t,5,g,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,s,0,+
4,b,32.08,4.000,u,g,m,v,2.50,t,f,0,g,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,b,21.08,10.085,y,p,e,h,1.25,f,f,0,g,0,-
685,a,22.67,0.750,u,g,c,v,2.00,f,t,2,g,394,-
686,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,g,1,-
687,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,g,750,-


## 3. Explore the dataset

1. Print the basic statistics.
2. Print the information of the dataset.
3. Print the last 17 rows.

In [5]:
cc_apps.describe().round()

Unnamed: 0,A3,A8,A11,A15
count,689.0,689.0,689.0,689.0
mean,5.0,2.0,2.0,1019.0
std,5.0,3.0,5.0,5214.0
min,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0
50%,3.0,1.0,0.0,5.0
75%,7.0,3.0,3.0,396.0
max,28.0,28.0,67.0,100000.0


In [6]:
cc_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      689 non-null    object 
 1   A2      689 non-null    object 
 2   A3      689 non-null    float64
 3   A4      689 non-null    object 
 4   A5      689 non-null    object 
 5   A6      689 non-null    object 
 6   A7      689 non-null    object 
 7   A8      689 non-null    float64
 8   A9      689 non-null    object 
 9   A10     689 non-null    object 
 10  A11     689 non-null    int64  
 11  A13     689 non-null    object 
 12  A15     689 non-null    int64  
 13  A16     689 non-null    object 
dtypes: float64(2), int64(2), object(10)
memory usage: 75.5+ KB


In [7]:
cc_apps.tail(17)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A13,A15,A16
672,?,29.5,2.0,y,p,e,h,2.0,f,f,0,g,17,-
673,a,37.33,2.5,u,g,i,h,0.21,f,f,0,g,246,-
674,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,g,237,-
675,a,30.58,10.665,u,g,q,h,0.085,f,t,12,g,3,-
676,b,19.42,7.25,u,g,m,v,0.04,f,t,1,g,1,-
677,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,g,50,-
678,a,20.08,1.25,u,g,c,v,0.0,f,f,0,g,0,-
679,b,19.5,0.29,u,g,k,v,0.29,f,f,0,g,364,-
680,b,27.83,1.0,y,p,d,h,3.0,f,f,0,g,537,-
681,b,17.08,3.29,u,g,i,v,0.335,f,f,0,g,2,-


## 4. Train Test Split

Do not split the dataset into X and y, just split the original dataset.

random_state=42

test_size=0.33

In [8]:
X= cc_apps.drop(columns=['A16'])
y = cc_apps.loc[:,["A16"]]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## 5. Handling Missing Values

Convert any '?' to a NaN value from both training and testing sets.

In [9]:
import numpy as np

In [10]:
cols=['A1', 'A2', 'A4', 'A5', 'A6', 'A7']
for i in cols:
    X_train[i].loc[(X_train[i]=='?')]=np.NaN
X_train.isna().sum()

A1     10
A2      8
A3      0
A4      6
A5      6
A6      8
A7      8
A8      0
A9      0
A10     0
A11     0
A13     0
A15     0
dtype: int64

In [11]:
for i in cols:
    X_test[i].loc[(X_test[i]=='?')]=np.NaN
X_test.isna().sum()

A1     2
A2     4
A3     0
A4     0
A5     0
A6     1
A7     1
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
dtype: int64

## 6. Handling Missing Values

Impute the numerical data for both training and testing sets with mean value.

In [12]:
X_train['A2'].fillna(X_train['A2'].mean,inplace=True)
X_train.isna().sum()

A1     10
A2      0
A3      0
A4      6
A5      6
A6      8
A7      8
A8      0
A9      0
A10     0
A11     0
A13     0
A15     0
dtype: int64

In [13]:
X_test['A2'].fillna(X_test['A2'].mean,inplace=True)
X_test.isna().sum()

A1     2
A2     0
A3     0
A4     0
A5     0
A6     1
A7     1
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
dtype: int64

## 7. Handling Missing Values

Impute the categorical data for both training and testing sets with mode value.

In [14]:
cols=['A1','A4', 'A5', 'A6', 'A7']
for i in cols:
    X_train[i].fillna(X_train[i].mode,inplace=True)
X_train.isna().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
dtype: int64

In [15]:
for i in cols:
    X_test[i].fillna(X_test[i].mode,inplace=True)
X_test.isna().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
dtype: int64

## 8. Encoding

The columns 0, 3, 4, 5, 6, 8, 9, and 12 are categorical, there are several methods we can use to encode the categorical columns. One of the method called get_dummies().

Use get_dummies() function to convert the categorical columns to a numerical columns (for training the machine learning algorithms).

Do not forget to convert both training and testing sets.

In [16]:
def convert_to_float(value):
    try:
        return float(value)
    except:
        return float(np.NaN) 


In [17]:
cols =  ["A1","A2","A4","A5","A6","A7"]
for i in cols:
    cc_apps[i].loc[(cc_apps[i]=="?")]=np.NaN

cc_apps["A2"] = cc_apps["A2"].apply(convert_to_float)
cc_apps.isna().sum()

A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A13     0
A15     0
A16     0
dtype: int64

In [18]:
mean_v =cc_apps["A2"].mean()
print(mean_v)
cc_apps["A2"].fillna(value =mean_v,inplace=True)
cc_apps.isna().sum()

31.56926144756278


A1     12
A2      0
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A13     0
A15     0
A16     0
dtype: int64

In [19]:
x =  ["A1","A4","A5","A6","A7"]
for i in x:
    mode_v =cc_apps[i].mode()
    print(mode_v[0])
    cc_apps[i].fillna(value=mode_v[0],inplace=True)
cc_apps.isna().sum()

b
u
g
c
v


A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A13    0
A15    0
A16    0
dtype: int64

In [20]:
ccat=cc_apps.drop(columns=['A2','A3','A8','A11','A15','A16'])
encoded = pd.get_dummies(ccat,dtype=int)
prefix_list = ["A1", "A4", "A5", "A6", "A7", "A9", "A10", "A13"]
q = cc_apps.drop(columns=prefix_list)
q["A1_b"]=encoded["A1_b"]
dff = pd.merge(encoded,q)
dff

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A10_t,A13_g,A13_p,A13_s,A2,A3,A8,A11,A15,A16
0,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,58.67,4.460,3.040,6,560,+
1,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,24.50,0.500,1.500,0,824,+
2,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,22.92,11.585,0.040,0,1349,+
3,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,38.25,6.000,1.000,0,0,+
4,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,45.83,10.500,5.000,7,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273536,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,36.42,0.750,0.585,0,3,-
273537,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,40.58,3.290,3.500,0,0,-
273538,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,21.08,10.085,1.250,0,0,-
273539,0,1,0,1,0,1,0,0,0,1,...,0,1,0,0,17.92,0.205,0.040,0,750,-


In [21]:
dff['A16'] = np.where(dff['A16'] == '+', 1, 0).astype('int64')
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273541 entries, 0 to 273540
Data columns (total 44 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   A1_a    273541 non-null  int32  
 1   A1_b    273541 non-null  int32  
 2   A4_l    273541 non-null  int32  
 3   A4_u    273541 non-null  int32  
 4   A4_y    273541 non-null  int32  
 5   A5_g    273541 non-null  int32  
 6   A5_gg   273541 non-null  int32  
 7   A5_p    273541 non-null  int32  
 8   A6_aa   273541 non-null  int32  
 9   A6_c    273541 non-null  int32  
 10  A6_cc   273541 non-null  int32  
 11  A6_d    273541 non-null  int32  
 12  A6_e    273541 non-null  int32  
 13  A6_ff   273541 non-null  int32  
 14  A6_i    273541 non-null  int32  
 15  A6_j    273541 non-null  int32  
 16  A6_k    273541 non-null  int32  
 17  A6_m    273541 non-null  int32  
 18  A6_q    273541 non-null  int32  
 19  A6_r    273541 non-null  int32  
 20  A6_w    273541 non-null  int32  
 21  A6_x    27

In [22]:
# for i in range(len(dff['A16'])):
#     if dff['A16'][i] == '+':
#         dff['A16'][i] = '1'
#     else:
#         dff['A16'][i] = '0'
# dff['A16'] = dff['A16'].astype('int64')
# dff.info()


In [23]:
X= dff.drop(columns=['A16'])
y = dff.loc[:,["A16"]]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## 9. Split into features and target

X_train and y_train will take 462 rows.
X_test and y_test will take 228 rows.

In [24]:
X_train = X_train.head(462)
y_train = y_train.head(462)
X_test = X_test.head(228)
y_test = y_test.head(228)

## 10. Normalization

In [34]:
from sklearn.preprocessing import MinMaxScaler
scaler  = MinMaxScaler().fit(X_train)
scaled_data = scaler.transform(X_train)
c =dff.columns.tolist()
c.pop()
scaled_X_train = pd.DataFrame(scaled_data,columns=c)
scaled_X_train

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A10_f,A10_t,A13_g,A13_p,A13_s,A2,A3,A8,A11,A15
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.179841,0.076716,0.08325,0.014925,0.000639
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.235397,0.158816,0.02500,0.000000,0.000000
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.294921,0.112158,0.06250,0.000000,0.000000
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.282845,0.001795,0.21250,0.000000,0.000000
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.206349,0.050471,0.06250,0.000000,0.169346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.064762,0.493495,0.05000,0.164179,0.095893
458,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.461587,0.056079,0.69375,0.014925,0.003580
459,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.969524,0.852400,0.00200,0.029851,0.011219
460,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.197143,0.011216,0.00000,0.000000,0.000000


In [35]:
scaler  = MinMaxScaler().fit(X_test)
scaled_data = scaler.transform(X_test)
c =dff.columns.tolist()
c.pop()
scaled_X_test = pd.DataFrame(scaled_data,columns=c)
scaled_X_test

Unnamed: 0,A1_a,A1_b,A4_l,A4_u,A4_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A10_f,A10_t,A13_g,A13_p,A13_s,A2,A3,A8,A11,A15
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.546103,0.163750,0.035088,0.000000,0.00000
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.248826,0.125000,0.122807,0.044776,0.00000
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.250329,0.464286,0.017544,0.000000,0.00000
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.000000,0.321429,0.140351,0.014925,0.00006
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.134460,0.089286,0.002982,0.000000,0.04208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.126761,0.375000,0.046842,0.000000,0.00000
224,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.233052,0.178571,0.385965,0.000000,0.00000
225,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.120376,0.392857,0.070175,0.014925,0.00278
226,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.402066,0.232143,0.149123,0.179104,0.00000


## 11. Train a Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model

In [37]:
model.fit(scaled_X_train, y_train)

## 12. Make predictions and evaluate the Logistic Regression Model

In [38]:
from sklearn.metrics import \
     accuracy_score, precision_score, recall_score, f1_score,roc_auc_score

In [39]:
pred = model.predict(scaled_X_test)

In [40]:
print('accuracy: ', accuracy_score(y_test,pred))
print('Recall: ', recall_score(y_test,pred))
print('Precision: ', precision_score(y_test,pred))
print('F1 score: ', f1_score(y_test,pred))
print('AUC-ROC: ', roc_auc_score(y_test,pred))

accuracy:  0.7280701754385965
Recall:  0.44565217391304346
Precision:  0.7884615384615384
F1 score:  0.5694444444444444
AUC-ROC:  0.6823849104859335


## 13. Repeat the steps 11 and 12 for SVM, DT, and RF

### SVM
___

In [41]:
from sklearn.svm import SVC
svm=SVC(kernel='linear',C=100)

In [42]:
svm.fit(scaled_X_train, y_train)

In [43]:
pred = svm.predict(scaled_X_test)

In [44]:
print('accuracy: ', accuracy_score(y_test,pred))
print('Recall: ', recall_score(y_test,pred))
print('Precision: ', precision_score(y_test,pred))
print('F1 score: ', f1_score(y_test,pred))
print('AUC-ROC: ', roc_auc_score(y_test,pred))

accuracy:  0.7894736842105263
Recall:  0.5869565217391305
Precision:  0.84375
F1 score:  0.6923076923076924
AUC-ROC:  0.7567135549872123


___
# DT
___

In [45]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(scaled_X_train, y_train)


In [46]:
pred = DT.predict(scaled_X_test)

In [47]:
print('accuracy: ', accuracy_score(y_test,pred))
print('Recall: ', recall_score(y_test,pred))
print('Precision: ', precision_score(y_test,pred))
print('F1 score: ', f1_score(y_test,pred))
print('AUC-ROC: ', roc_auc_score(y_test,pred))

accuracy:  0.7280701754385965
Recall:  0.6739130434782609
Precision:  0.6595744680851063
F1 score:  0.6666666666666667
AUC-ROC:  0.7193094629156009


___
# RF
___

In [48]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(scaled_X_train, y_train)

In [49]:
pred = RF.predict(scaled_X_test)

In [50]:
print('accuracy: ', accuracy_score(y_test,pred))
print('Recall: ', recall_score(y_test,pred))
print('Precision: ', precision_score(y_test,pred))
print('F1 score: ', f1_score(y_test,pred))
print('AUC-ROC: ', roc_auc_score(y_test,pred))

accuracy:  0.7982456140350878
Recall:  0.6739130434782609
Precision:  0.7948717948717948
F1 score:  0.7294117647058824
AUC-ROC:  0.7781329923273657
