In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



In [21]:
#Loading the dataset
credit_data = pd.read_csv('C:\credit-approval_csv.csv')
#Looking at the dataset
credit_data.head()
#
credit_description = credit_data.describe()
print(credit_description)

              Age        Debt  YearsEmployed  CreditScore      ZipCode  \
count  678.000000  690.000000     690.000000    690.00000   677.000000   
mean    31.568171    4.758725       2.223406      2.40000   184.014771   
std     11.957862    4.978163       3.346513      4.86294   173.806768   
min     13.750000    0.000000       0.000000      0.00000     0.000000   
25%     22.602500    1.000000       0.165000      0.00000    75.000000   
50%     28.460000    2.750000       1.000000      0.00000   160.000000   
75%     38.230000    7.207500       2.625000      3.00000   276.000000   
max     80.250000   28.000000      28.500000     67.00000  2000.000000   

              Income  
count     690.000000  
mean     1017.385507  
std      5210.102598  
min         0.000000  
25%         0.000000  
50%         5.000000  
75%       395.500000  
max    100000.000000  


In [22]:
credit_info = credit_data.info()
print(credit_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           678 non-null    object 
 1   Age              678 non-null    float64
 2   Debt             690 non-null    float64
 3   Married          684 non-null    object 
 4   BankCustomer     684 non-null    object 
 5   EducationLevel   681 non-null    object 
 6   Ethnicity        681 non-null    object 
 7   YearsEmployed    690 non-null    float64
 8   PriorDefault     690 non-null    object 
 9   Employed         690 non-null    object 
 10  CreditScore      690 non-null    int64  
 11  DriversLicense   690 non-null    object 
 12  Citizen          690 non-null    object 
 13  ZipCode          677 non-null    float64
 14  Income           690 non-null    int64  
 15  Approved         690 non-null    object 
 16  Approved_Status  690 non-null    object 
dtypes: float64(4), i

In [23]:
# Replace "?" with NaN
credit_data.replace('?', np.NaN, inplace = True)
#Replacing the misssing value by mean value
credit_data.fillna(credit_data.mean(), inplace=True)
# Convert Age to numeric
credit_data["Age"] = pd.to_numeric(credit_data["Age"])
print(credit_data.isnull().sum())

Gender             12
Age                 0
Debt                0
Married             6
BankCustomer        6
EducationLevel      9
Ethnicity           9
YearsEmployed       0
PriorDefault        0
Employed            0
CreditScore         0
DriversLicense      0
Citizen             0
ZipCode             0
Income              0
Approved            0
Approved_Status     0
dtype: int64


In [6]:
for col in credit_data.columns:
    # Check if the column is of object type
    if credit_data[col].dtypes == 'object':
        # Impute with the most frequent value
        credit_data = credit_data.fillna(credit_data[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
print(credit_data.isnull().sum())

Gender             0
Age                0
Debt               0
Married            0
BankCustomer       0
EducationLevel     0
Ethnicity          0
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode            0
Income             0
Approved           0
Approved_Status    0
dtype: int64


In [7]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Instantiate LabelEncoder
le=LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in credit_data.columns.values:
    # Compare if the dtype is object
    if credit_data[col].dtypes=='object':
    # Use LabelEncoder to do the numeric transformation
        credit_data[col]=le.fit_transform(credit_data[col])

In [14]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Drop the features 11 and 13 and convert the DataFrame to a NumPy array
credit_data = credit_data.drop(['DriversLicense', 'ZipCode'], axis=1)
credit_data = credit_data.values


# Segregate features and labels into separate variables
X,y = credit_data[:,0:13] , credit_data[:,13]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X
                                                    ,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

In [15]:
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

In [16]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledX_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test,y_test))

# Print the confusion matrix of the logreg model
confusion_matrix(y_test,y_pred)

Accuracy of logistic regression classifier:  0.8421052631578947


array([[94,  9],
       [27, 98]], dtype=int64)

In [18]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001 ,0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are the corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

In [19]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit grid_model to the data
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.850725 using {'max_iter': 100, 'tol': 0.01}
