# DATA DESCRIPTION

In [None]:

1. Title: Credit Approval

2. Sources: 
    (confidential)
    Submitted by quinlan@cs.su.oz.au

3.  Past Usage:

    See Quinlan,
    * "Simplifying decision trees", Int J Man-Machine Studies 27,
      Dec 1987, pp. 221-234.
    * "C4.5: Programs for Machine Learning", Morgan Kaufmann, Oct 1992
  
4.  Relevant Information:

    This file concerns credit card applications.  All attribute names
    and values have been changed to meaningless symbols to protect
    confidentiality of the data.
  
    This dataset is interesting because there is a good mix of
    attributes -- continuous, nominal with small numbers of
    values, and nominal with larger numbers of values.  There
    are also a few missing values.
  
5.  Number of Instances: 690

6.  Number of Attributes: 15 + class attribute

7.  Attribute Information:

    A1:	b, a.
    A2:	continuous.
    A3:	continuous.
    A4:	u, y, l, t.
    A5:	g, p, gg.
    A6:	c, d, cc, i, j, k, m, r, q, w, x, e, aa, ff.
    A7:	v, h, bb, j, n, z, dd, ff, o.
    A8:	continuous.
    A9:	t, f.
    A10:	t, f.
    A11:	continuous.
    A12:	t, f.
    A13:	g, p, s.
    A14:	continuous.
    A15:	continuous.
    A16: +,-         (class attribute)

8.  Missing Attribute Values:
    37 cases (5%) have one or more missing values.  The missing
    values from particular attributes are:

    A1:  12
    A2:  12
    A4:   6
    A5:   6
    A6:   9
    A7:   9
    A14: 13

9.  Class Distribution
  
    +: 307 (44.5%)
    -: 383 (55.5%)



# IMPORTING BASIC LIBRARIES

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# READ DATA SET

In [48]:
data = pd.read_table(r"C:\Users\Pragya\Downloads\credit+approval\crx.data",delimiter=",")
data

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
0,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,00043,560,+
1,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,00280,824,+
2,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,00100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
4,b,32.08,4.000,u,g,m,v,2.50,t,f,0,t,g,00360,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,00260,0,-
685,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,00200,394,-
686,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,00200,1,-
687,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,00280,750,-


# EDA

**SHAPE**

In [6]:
data.shape

(689, 16)

**COLUMNS**

In [8]:
data.columns

Index(['b', '30.83', '0', 'u', 'g', 'w', 'v', '1.25', 't', 't.1', '01', 'f',
       'g.1', '00202', '0.1', '+'],
      dtype='object')

**describe**

In [10]:
data.describe()

Unnamed: 0,0,1.25,01,0.1
count,689.0,689.0,689.0,689.0
mean,4.765631,2.224819,2.402032,1018.862119
std,4.97847,3.348739,4.86618,5213.743149
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.25,2.625,3.0,396.0
max,28.0,28.5,67.0,100000.0


**INFO**

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   b       689 non-null    object 
 1   30.83   689 non-null    object 
 2   0       689 non-null    float64
 3   u       689 non-null    object 
 4   g       689 non-null    object 
 5   w       689 non-null    object 
 6   v       689 non-null    object 
 7   1.25    689 non-null    float64
 8   t       689 non-null    object 
 9   t.1     689 non-null    object 
 10  01      689 non-null    int64  
 11  f       689 non-null    object 
 12  g.1     689 non-null    object 
 13  00202   689 non-null    object 
 14  0.1     689 non-null    int64  
 15  +       689 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.3+ KB


**datatypes**

In [13]:
data.dtypes

b         object
30.83     object
0        float64
u         object
g         object
w         object
v         object
1.25     float64
t         object
t.1       object
01         int64
f         object
g.1       object
00202     object
0.1        int64
+         object
dtype: object

**ANAMOLY DETECTION**

In [16]:
for i in data.columns:
    print({i:data[i].unique()})


{'b': array(['a', 'b', '?'], dtype=object)}
{'30.83': array(['58.67', '24.50', '27.83', '20.17', '32.08', '33.17', '22.92',
       '54.42', '42.50', '22.08', '29.92', '38.25', '48.08', '45.83',
       '36.67', '28.25', '23.25', '21.83', '19.17', '25.00', '47.75',
       '27.42', '41.17', '15.83', '47.00', '56.58', '57.42', '42.08',
       '29.25', '42.00', '49.50', '36.75', '22.58', '27.25', '23.00',
       '27.75', '54.58', '34.17', '28.92', '29.67', '39.58', '56.42',
       '54.33', '41.00', '31.92', '41.50', '23.92', '25.75', '26.00',
       '37.42', '34.92', '34.25', '23.33', '23.17', '44.33', '35.17',
       '43.25', '56.75', '31.67', '23.42', '20.42', '26.67', '36.00',
       '25.50', '19.42', '32.33', '34.83', '38.58', '44.25', '44.83',
       '20.67', '34.08', '21.67', '21.50', '49.58', '27.67', '39.83', '?',
       '37.17', '25.67', '34.00', '49.00', '62.50', '31.42', '52.33',
       '28.75', '28.58', '22.50', '28.50', '37.50', '35.25', '18.67',
       '54.83', '40.92', '19.75

* ? is detected as anamoly  
* I am replacing ? with np.nan

# PREPROCESSING

**handling anamoly**

In [49]:
# replacing ? by nan
data.replace("?",np.nan,inplace=True)
data.isnull().sum()

b        12
30.83    12
0         0
u         6
g         6
w         9
v         9
1.25      0
t         0
t.1       0
01        0
f         0
g.1       0
00202    13
0.1       0
+         0
dtype: int64

**MISSING VALUES**

In [18]:
data.isnull().sum()

b        12
30.83    12
0         0
u         6
g         6
w         9
v         9
1.25      0
t         0
t.1       0
01        0
f         0
g.1       0
00202    13
0.1       0
+         0
dtype: int64

**HANDLING MISSING VALUES**

In [50]:
for x in data.columns:
    if data[x].dtype=='object' or data[x].dtype=='bool':      # for categorical
        data[x].fillna(data[x].mode()[0],inplace=True)
    elif data[x].dtype=='int64' or data[x].dtype=='float64':    # for numerical
        data[x].fillna(round(data[x].mean()),inplace=True)
 

In [20]:
data.isnull().sum()

b        0
30.83    0
0        0
u        0
g        0
w        0
v        0
1.25     0
t        0
t.1      0
01       0
f        0
g.1      0
00202    0
0.1      0
+        0
dtype: int64

**DISTRIBUTION OF TARGET VARIABLE**

In [51]:
data["+"].value_counts()

+
-    383
+    306
Name: count, dtype: int64

**CONVERTING CATEGORICAL DATA TO NUMERICAL DATA**

In [52]:
colname=[]
for x in data.columns:
    if data[x].dtype=='object':
        colname.append(x)
print(colname)



# appliying fit transform method to all the columns (transform_method) i.e categorical to numerical
# For preprocessing the data
from sklearn.preprocessing import LabelEncoder
 
le=LabelEncoder()
 
for x in colname:
    data[x]=le.fit_transform(data[x])



['b', '30.83', 'u', 'g', 'w', 'v', 't', 't.1', 'f', 'g.1', '00202', '+']


**CREATING X AND Y**

In [53]:
X = data.values[:,0:-1]
Y = data.values[:,-1]

In [23]:
X.shape

(689, 15)

In [25]:
Y.shape

(689,)

**SCALING**

In [54]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()     

scaler.fit(X)                
scaler.transform(X)       

array([[-1.51028222,  1.96283334, -0.06143521, ..., -0.31742431,
        -0.97490469, -0.08807405],
       [-1.51028222, -0.60757687, -0.85743812, ..., -0.31742431,
         0.85192895, -0.03740186],
       [ 0.6621279 , -0.21877533, -0.64838685, ..., -0.31742431,
        -0.5399443 , -0.1949847 ],
       ...,
       [-1.51028222, -0.52117653,  1.75570273, ..., -0.31742431,
         0.2429844 , -0.19536858],
       [ 0.6621279 , -1.35277983, -0.91673631, ..., -0.31742431,
         0.85192895, -0.05160543],
       [ 0.6621279 ,  0.54802772, -0.27953197, ..., -0.31742431,
        -1.2141329 , -0.19556052]])

**splitting data into train and test**

In [55]:
from sklearn.model_selection import train_test_split

# split the data into test and train
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state = 10)  


In [56]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(482, 15)
(207, 15)
(482,)
(207,)


# MODEL BUILDING

# LOGISTIC REGRESSION

**TRAINING PHASE**

In [57]:
from sklearn.linear_model import LogisticRegression
# create a model
classifier=LogisticRegression(random_state=10)

# fitting training data to the model...........(input is X and Y)
classifier.fit(X_train,Y_train)

# output is equation of liner regression 
print(classifier.intercept_)   
print(classifier.coef_)     

[0.26322805]
[[ 1.78322699e-01  2.53976522e-03  7.03504877e-02  5.60897571e-01
   5.72653126e-01  5.90069143e-03 -9.15478376e-02 -3.38244553e-01
  -1.09080247e+00 -1.65387866e-01 -2.09670011e-01  8.82597039e-02
  -4.55034016e-02  9.09430731e-03 -4.32511405e-04]]


In [58]:
# zipping to see kon se variable ka kya value aaya h
print(list(zip(data.columns[0:-1],classifier.coef_.ravel())))

[('b', 0.17832269860632252), ('30.83', 0.00253976522409084), ('0', 0.07035048771654451), ('u', 0.5608975713931001), ('g', 0.5726531261833715), ('w', 0.005900691434739444), ('v', -0.09154783763897341), ('1.25', -0.33824455335918646), ('t', -1.0908024674151906), ('t.1', -0.16538786614218232), ('01', -0.20967001103170924), ('f', 0.08825970393983461), ('g.1', -0.04550340156312907), ('00202', 0.009094307309444176), ('0.1', -0.00043251140473100956)]


**TESTING PHASE**

In [59]:
Y_pred=classifier.predict(X_test)
print(Y_pred)

[1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0.
 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0.
 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1.
 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1.
 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0.
 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1.
 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1.]


**PROBABILITY MATRIX**

In [60]:
Y_pred_prob = classifier.predict_proba(X_test)
#print(Y_pred_prob)

**COMPARING ACTUAL WITH PREDICTED**

In [None]:
#print(list(zip(Y_test,Y_pred)))

**EVALUATION PHASE**

In [61]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
# confusion matrix
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
print(classification_report(Y_test,Y_pred))

# accuracy_score
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[70 25]
 [13 99]]
Classification report: 
              precision    recall  f1-score   support

         0.0       0.84      0.74      0.79        95
         1.0       0.80      0.88      0.84       112

    accuracy                           0.82       207
   macro avg       0.82      0.81      0.81       207
weighted avg       0.82      0.82      0.81       207

Accuracy of the model:  0.8164251207729468


**TUNNING**

In [74]:
y_pred_prob = classifier.predict_proba(X_test)
#print(y_pred_prob)


y_pred_class = []
for value in y_pred_prob[:,1]:
    if value >0.4:                         # agar value 0.5 se jayada h to consider as 1
        y_pred_class.append(1)
    else:
        y_pred_class.append(0)              # agar value 0.5 se kum h to consider as 0


from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
# confusion matrix
cfm=confusion_matrix(Y_test,y_pred_class)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,y_pred_class))


# accuracy_score
acc=accuracy_score(Y_test, y_pred_class)
print("Accuracy of the model: ",acc)

[[ 61  34]
 [  7 105]]
Classification report: 
              precision    recall  f1-score   support

         0.0       0.90      0.64      0.75        95
         1.0       0.76      0.94      0.84       112

    accuracy                           0.80       207
   macro avg       0.83      0.79      0.79       207
weighted avg       0.82      0.80      0.80       207

Accuracy of the model:  0.8019323671497585


# DECISION TREE

In [75]:
#predicting using the Decision_Tree_Classifier
from sklearn.tree import DecisionTreeClassifier

model_DT=DecisionTreeClassifier(random_state=10, 
                                         criterion="gini")
#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

#fit the model on the data and predict the values
model_DT.fit(X_train,Y_train)
Y_pred=model_DT.predict(X_test)
#print(Y_pred)
#print(list(zip(Y_test,Y_pred)))

from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

model_DT.score(X_train,Y_train)

[[72 23]
 [18 94]]
0.8019323671497585
              precision    recall  f1-score   support

         0.0       0.80      0.76      0.78        95
         1.0       0.80      0.84      0.82       112

    accuracy                           0.80       207
   macro avg       0.80      0.80      0.80       207
weighted avg       0.80      0.80      0.80       207



1.0

**PRUNNING DECISION TREE**

In [89]:
#predicting using the Decision_Tree_Classifier
from sklearn.tree import DecisionTreeClassifier

model_DT=DecisionTreeClassifier(random_state=10, 
                                         criterion="gini",
                                         splitter="best", 
                                         min_samples_leaf=6,
                                         min_samples_split=5,
                                         max_depth=15, 
                                        #max_leaf_nodes=100,
                                         # max_features=0.6
                                         )
#min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes

#fit the model on the data and predict the values
model_DT.fit(X_train,Y_train)
Y_pred=model_DT.predict(X_test)


from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[78 17]
 [17 95]]
0.8357487922705314
              precision    recall  f1-score   support

         0.0       0.82      0.82      0.82        95
         1.0       0.85      0.85      0.85       112

    accuracy                           0.84       207
   macro avg       0.83      0.83      0.83       207
weighted avg       0.84      0.84      0.84       207



# RANDOM FOREST

In [90]:
#predicting using the Random_Forest_Classifier
from sklearn.ensemble import RandomForestClassifier

model_RandomForest=RandomForestClassifier(n_estimators=100,
                                          random_state=10, bootstrap=True,
                                         n_jobs=-1)

#fit the model on the data and predict the values
model_RandomForest.fit(X_train,Y_train)

Y_pred=model_RandomForest.predict(X_test)


from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[85 10]
 [13 99]]
0.8888888888888888
              precision    recall  f1-score   support

         0.0       0.87      0.89      0.88        95
         1.0       0.91      0.88      0.90       112

    accuracy                           0.89       207
   macro avg       0.89      0.89      0.89       207
weighted avg       0.89      0.89      0.89       207



# EXTRA TREES

In [91]:
#predicting using the Extra_Trees_Classifier
from sklearn.ensemble import ExtraTreesClassifier

model_EXT=ExtraTreesClassifier(n_estimators=300, random_state=10, bootstrap=True)

#fit the model on the data and predict the values
model_EXT.fit(X_train,Y_train)

Y_pred=model_EXT.predict(X_test)



from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
#confusion matrix
print(confusion_matrix(Y_test,Y_pred))
print(accuracy_score(Y_test,Y_pred))
print(classification_report(Y_test,Y_pred))

[[85 10]
 [13 99]]
0.8888888888888888
              precision    recall  f1-score   support

         0.0       0.87      0.89      0.88        95
         1.0       0.91      0.88      0.90       112

    accuracy                           0.89       207
   macro avg       0.89      0.89      0.89       207
weighted avg       0.89      0.89      0.89       207



CONCLUSION:   
Out of all these Tuned logistic regression models give the hisggest accuaracy at required class
