In [0]:
import pandas as pd

cc_apps = pd.read_csv("/content/cc_approvals.data")

cc_apps.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


**Statistical summary of the data**

In [0]:
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

print('\n')

#printing dataframe information
cc_apps_info = cc_apps.info()

print(cc_apps_info)

print('\n')

#inspecting missing values in the dataset
print(cc_apps.tail(18))

                0        1.25          01            0.1
count  689.000000  689.000000  689.000000     689.000000
mean     4.765631    2.224819    2.402032    1018.862119
std      4.978470    3.348739    4.866180    5213.743149
min      0.000000    0.000000    0.000000       0.000000
25%      1.000000    0.165000    0.000000       0.000000
50%      2.750000    1.000000    0.000000       5.000000
75%      7.250000    2.625000    3.000000     396.000000
max     28.000000   28.500000   67.000000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
b        689 non-null object
30.83    689 non-null object
0        689 non-null float64
u        689 non-null object
g        689 non-null object
w        689 non-null object
v        689 non-null object
1.25     689 non-null float64
t        689 non-null object
t.1      689 non-null object
01       689 non-null int64
f        689 non-null object
g.1      689 non-null object
00

**Preprocessing - part 1**

Handling missing values

In [0]:
print(cc_apps.isnull().values.sum())

#replacing '?' with np.NaN for future modification
#importing numpy library
import numpy as np

cc_apps = cc_apps.replace('?',np.NaN)

#verifying replace operation with NaN

cc_apps.tail(18)

0


Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
671,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,t,g,240,117,-
672,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
673,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
674,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
675,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
676,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
677,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
678,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
679,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
680,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-


**Preprocessing part 2: Imputing missing values with mean imputation**




In [0]:
cc_apps.fillna(cc_apps.mean(), inplace=True)

#counting the number of NaN values in the dataset

print("there are "+ str(cc_apps.isnull().values.sum())+" NaN values in the dataset")

there are 67 NaN values in the dataset


**Filling missing values with recent values by each column by iteration method**

In [0]:
for col in cc_apps.columns:
  if cc_apps[col].dtype == 'object':
    cc_apps[col] = cc_apps[col].fillna(cc_apps[col].value_counts().index[0])

#inspecting missing values after filling operation

print("there are "+ str(cc_apps.isnull().values.sum())+" missing values in the dataset")


there are 0 missing values in the dataset


**Preprocessing part-3: Label encoding the non-numeric data i.e. converting non-numeric data into numeric form**

In [0]:
#importing LabelEncoder from sklearn

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

for col in cc_apps.columns:
  if cc_apps[col].dtype == 'object':
    cc_apps[col] = encoder.fit_transform(cc_apps[col])




**Splitting the data into Training set and Testing set**

In [0]:
from sklearn.model_selection import train_test_split
X = cc_apps[:,0:13]
y = cc_apps[:,13]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.33, random_state = 42)

**Importing Logistic Regression model for classification**

In [0]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

#fitting training data in logistic regression model

model.fit(X_train, y_train)

# testing the accurary of the model with score() method

score = model.score(X_test, y_test)

print("accuracy of model is "+ str(score))


accuracy of model is 0.8552631578947368




**Making prediction with test data and plotting confusion_matrix**

In [0]:
#making prediction with tesing data and plotting confusion matrix

from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

conf_mat = confusion_matrix(y_test, y_pred)

print(conf_mat)

[[ 95   5]
 [ 28 100]]
