In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv("credit_customers (1).csv")

In [3]:
data.head(5)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [4]:
data.drop('checking_status',axis =1, inplace = True)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   duration                1000 non-null   float64
 1   credit_history          1000 non-null   object 
 2   purpose                 1000 non-null   object 
 3   credit_amount           1000 non-null   float64
 4   savings_status          1000 non-null   object 
 5   employment              1000 non-null   object 
 6   installment_commitment  1000 non-null   float64
 7   personal_status         1000 non-null   object 
 8   other_parties           1000 non-null   object 
 9   residence_since         1000 non-null   float64
 10  property_magnitude      1000 non-null   object 
 11  age                     1000 non-null   float64
 12  other_payment_plans     1000 non-null   object 
 13  housing                 1000 non-null   object 
 14  existing_credits        1000 non-null   f

In [6]:
set(data['employment'])

{'1<=X<4', '4<=X<7', '<1', '>=7', 'unemployed'}

In [7]:
remap_dct = {'1<=X<4':2.5, '4<=X<7':5.5, '<1':0.5, '>=7':8, 'unemployed':0}

In [8]:
data['employment'] = data['employment'].replace(remap_dct)

In [9]:
data['employment'].dtype

dtype('float64')

In [10]:
data['class']

0      good
1       bad
2      good
3      good
4       bad
       ... 
995    good
996    good
997    good
998     bad
999    good
Name: class, Length: 1000, dtype: object

In [11]:
set(data['class'])

{'bad', 'good'}

In [12]:
le_class = LabelEncoder()
le_class.fit(['bad', 'good'])
data['class'] = le_class.transform(data['class'])

In [13]:
data

Unnamed: 0,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,8.0,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,1
1,48.0,existing paid,radio/tv,5951.0,<100,2.5,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,0
2,12.0,critical/other existing credit,education,2096.0,<100,5.5,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,1
3,42.0,existing paid,furniture/equipment,7882.0,<100,5.5,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,1
4,24.0,delayed previously,new car,4870.0,<100,2.5,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12.0,existing paid,furniture/equipment,1736.0,<100,5.5,3.0,female div/dep/mar,none,4.0,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes,1
996,30.0,existing paid,used car,3857.0,<100,2.5,4.0,male div/sep,none,4.0,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes,1
997,12.0,existing paid,radio/tv,804.0,<100,8.0,4.0,male single,none,4.0,car,38.0,none,own,1.0,skilled,1.0,none,yes,1
998,45.0,existing paid,radio/tv,1845.0,<100,2.5,4.0,male single,none,4.0,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes,0


### Label Encoding the all object type columns 

In [14]:
object_data = data.select_dtypes(include = 'object')  # selecting object type columns

In [15]:
object_data.head()

Unnamed: 0,credit_history,purpose,savings_status,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker
0,critical/other existing credit,radio/tv,no known savings,male single,none,real estate,none,own,skilled,yes,yes
1,existing paid,radio/tv,<100,female div/dep/mar,none,real estate,none,own,skilled,none,yes
2,critical/other existing credit,education,<100,male single,none,real estate,none,own,unskilled resident,none,yes
3,existing paid,furniture/equipment,<100,male single,guarantor,life insurance,none,for free,skilled,none,yes
4,delayed previously,new car,<100,male single,none,no known property,none,for free,skilled,none,yes


In [16]:
object_array = object_data.values  # converting into array

In [17]:
object_array.shape[1]

11

In [18]:
for i in range(object_array.shape[1]): #16 -> 0,1,2,3,4,5.....11
    le = LabelEncoder()
    Unique_labels = set(object_array[:,i])
    Unique_labels = list(Unique_labels)   # to convert set into array we have to list the set and then convert into array
    Unique_labels = np.array(Unique_labels)
    le.fit(Unique_labels)
    object_array[:,i] = le.transform(object_array[:,i])

In [19]:
object_array

array([[1, 6, 4, ..., 1, 1, 1],
       [3, 6, 2, ..., 1, 0, 1],
       [1, 2, 2, ..., 3, 0, 1],
       ...,
       [3, 6, 2, ..., 1, 0, 1],
       [3, 6, 2, ..., 1, 1, 1],
       [1, 9, 0, ..., 1, 0, 1]], dtype=object)

In [20]:
object_data

Unnamed: 0,credit_history,purpose,savings_status,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker
0,1,6,4,3,2,3,1,1,1,1,1
1,3,6,2,0,2,3,1,1,1,0,1
2,1,2,2,3,2,3,1,1,3,0,1
3,3,3,2,3,1,1,1,0,1,0,1
4,2,4,2,3,2,2,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
995,3,3,2,0,2,3,1,1,3,0,1
996,3,9,2,1,2,1,1,1,0,1,1
997,3,6,2,3,2,0,1,1,1,0,1
998,3,6,2,3,2,2,1,0,1,1,1


In [21]:
# converting the 11 object type columns to integer type 

for colname in object_data.columns:
    print(colname)
    object_data[colname] = object_data[colname].astype(int)

credit_history
purpose
savings_status
personal_status
other_parties
property_magnitude
other_payment_plans
housing
job
own_telephone
foreign_worker


### Getting all columns together

In [22]:
data1 = object_data  

In [23]:
data2 = data.select_dtypes(include=['int','float'])  # getting all the columns with float and integer type columns

In [24]:
data2

Unnamed: 0,duration,credit_amount,employment,installment_commitment,residence_since,age,existing_credits,num_dependents,class
0,6.0,1169.0,8.0,4.0,4.0,67.0,2.0,1.0,1
1,48.0,5951.0,2.5,2.0,2.0,22.0,1.0,1.0,0
2,12.0,2096.0,5.5,2.0,3.0,49.0,1.0,2.0,1
3,42.0,7882.0,5.5,2.0,4.0,45.0,1.0,2.0,1
4,24.0,4870.0,2.5,3.0,4.0,53.0,2.0,2.0,0
...,...,...,...,...,...,...,...,...,...
995,12.0,1736.0,5.5,3.0,4.0,31.0,1.0,1.0,1
996,30.0,3857.0,2.5,4.0,4.0,40.0,1.0,1.0,1
997,12.0,804.0,8.0,4.0,4.0,38.0,1.0,1.0,1
998,45.0,1845.0,2.5,4.0,4.0,23.0,1.0,1.0,0


In [25]:
data = pd.concat([data1,data2],axis = 1)   # getting all columns under one variable

data.columns

### Splitting data into Depedent and Independent variables 

In [26]:
X = data.drop('class', axis=1) 

In [27]:
X

Unnamed: 0,credit_history,purpose,savings_status,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker,duration,credit_amount,employment,installment_commitment,residence_since,age,existing_credits,num_dependents
0,1,6,4,3,2,3,1,1,1,1,1,6.0,1169.0,8.0,4.0,4.0,67.0,2.0,1.0
1,3,6,2,0,2,3,1,1,1,0,1,48.0,5951.0,2.5,2.0,2.0,22.0,1.0,1.0
2,1,2,2,3,2,3,1,1,3,0,1,12.0,2096.0,5.5,2.0,3.0,49.0,1.0,2.0
3,3,3,2,3,1,1,1,0,1,0,1,42.0,7882.0,5.5,2.0,4.0,45.0,1.0,2.0
4,2,4,2,3,2,2,1,0,1,0,1,24.0,4870.0,2.5,3.0,4.0,53.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3,3,2,0,2,3,1,1,3,0,1,12.0,1736.0,5.5,3.0,4.0,31.0,1.0,1.0
996,3,9,2,1,2,1,1,1,0,1,1,30.0,3857.0,2.5,4.0,4.0,40.0,1.0,1.0
997,3,6,2,3,2,0,1,1,1,0,1,12.0,804.0,8.0,4.0,4.0,38.0,1.0,1.0
998,3,6,2,3,2,2,1,0,1,1,1,45.0,1845.0,2.5,4.0,4.0,23.0,1.0,1.0


In [28]:
y = data['class']

In [29]:
y

0      1
1      0
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: class, Length: 1000, dtype: int32

In [30]:
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.2)

In [31]:
x_train.shape

(800, 19)

In [32]:
x_test.shape

(200, 19)

### Training the model 

In [33]:
log_r = LogisticRegression()

log_r.fit(x_train,y_train)

y_pred = log_r.predict(x_test)

accuracy_score(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.72

### Trying with different types of Solvers 

In [34]:
log_r = LogisticRegression(solver = "liblinear")
log_r.fit(x_train,y_train)
y_pred = log_r.predict(x_test)
accuracy_score(y_test,y_pred)

0.71

In [35]:
log_r = LogisticRegression(solver = "newton-cg")
log_r.fit(x_train,y_train)
y_pred = log_r.predict(x_test)
accuracy_score(y_test,y_pred)

0.705

In [36]:
log_r = LogisticRegression(solver = "saga")
log_r.fit(x_train,y_train)
y_pred = log_r.predict(x_test)
accuracy_score(y_test,y_pred)



0.685

In [37]:
log_r = LogisticRegression(solver = "lbfgs")
log_r.fit(x_train,y_train)
y_pred = log_r.predict(x_test)
accuracy_score(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.72

In [38]:
log_r = LogisticRegression(solver = "sag")
log_r.fit(x_train,y_train)
y_pred = log_r.predict(x_test)
accuracy_score(y_test,y_pred)



0.685

### Fine tuning the model to get the best parameters 

In [39]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
# Defining hyperparameters to tune
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

In [41]:
# Creating logistic regression model
logistic = LogisticRegression()

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(logistic, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [42]:
# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Create a new model with the best hyperparameters
best_model = LogisticRegression(**best_params)
best_model.fit(X_train, y_train)

# Evaluate the best model
accuracy = best_model.score(X_test, y_test)
print(f"Accuracy of the best model on test set: {accuracy}")

Best Parameters: {'C': 0.01, 'solver': 'newton-cg'}
Accuracy of the best model on test set: 0.725


### As we are getting highest accuracy with newton-cg solver we will use that for further working