### Importing Required Libraries

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Data Loading and Preprocessing


In [50]:
# Read the credit-customer records from the CSV file and preview the first rows
DATA_PATH = 'credit_customers (DS).csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [51]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(1000, 21)

### Exploratory Data Analysis (EDA)

In [52]:
# 1) check for null values: count the missing entries in each column
df.isna().sum()

checking_status           0
duration                  0
credit_history            0
purpose                   0
credit_amount             0
savings_status            0
employment                0
installment_commitment    0
personal_status           0
other_parties             0
residence_since           0
property_magnitude        0
age                       0
other_payment_plans       0
housing                   0
existing_credits          0
job                       0
num_dependents            0
own_telephone             0
foreign_worker            0
class                     0
dtype: int64

In [53]:
# 2) check for duplicates: count the rows that exactly repeat an earlier row
df.duplicated().sum()

0

In [54]:
# 3) check the data type of every column
df.dtypes

checking_status            object
duration                  float64
credit_history             object
purpose                    object
credit_amount             float64
savings_status             object
employment                 object
installment_commitment    float64
personal_status            object
other_parties              object
residence_since           float64
property_magnitude         object
age                       float64
other_payment_plans        object
housing                    object
existing_credits          float64
job                        object
num_dependents            float64
own_telephone              object
foreign_worker             object
class                      object
dtype: object

In [55]:
# 4) check the target variable: number of rows per class label
df['class'].value_counts()

class
good    700
bad     300
Name: count, dtype: int64

In [56]:
from sklearn.preprocessing import LabelEncoder

# Categorical feature columns that receive integer codes
# ('purpose' is handled separately with one-hot encoding below)
categorical_cols = ['checking_status', 'credit_history', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']

# Fit a separate encoder for each column and keep it. Refitting one shared
# encoder for every column would leave only the last column's mapping available,
# so the codes could not be decoded or re-applied to new data later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # replace the category strings with integer codes
    label_encoders[col] = le  # label_encoders[col].classes_ holds the original categories in code order

# One-hot encode 'purpose': one indicator column per category
df = pd.get_dummies(df, columns=['purpose'])


### Defining Features and Target Variable

In [57]:
# Separate the label column from the predictors
y = df['class']  # target: the 'class' column (good / bad)
X = df.drop(columns='class')  # features: every remaining column

# Report the dimensions of both pieces
print(X.shape)  # (rows, feature columns)
print(y.shape)  # (rows,)

# 25% of 1000 -- presumably the expected number of rows in a 25% hold-out set
print(1000*0.25)


(1000, 29)
(1000,)
250.0


### Splitting the Dataset into Training and Test Sets

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation (test_size=0.25); the fixed seed
# (random_state=42) makes the shuffle, and therefore the split, reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)

# Unpack in the order returned: train features, test features, train target, test target
X_train, X_test, y_train, y_test = split_parts


In [23]:
# Show the dimensions of each split: train/test features, then train/test targets
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(750, 29)
(250, 29)
(750,)
(250,)


### Training and Evaluating the Logistic Regression Model

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Create the classifier with the iteration cap raised to 1000 and fit it on the training split
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out split and report how the predictions compare with the true labels
y_pred = lr.predict(X_test)
print("Logistic Regression")
print(confusion_matrix(y_test, y_pred))  # rows: true class, columns: predicted class
print(classification_report(y_test, y_pred))  # precision, recall, f1-score and support per class


Logistic Regression
[[ 17  55]
 [ 21 157]]
              precision    recall  f1-score   support

         bad       0.45      0.24      0.31        72
        good       0.74      0.88      0.81       178

    accuracy                           0.70       250
   macro avg       0.59      0.56      0.56       250
weighted avg       0.66      0.70      0.66       250



 ###  Converting Data to NumPy Arrays

In [60]:
import numpy as np

# Convert the feature splits to plain float64 NumPy arrays.
# np.array(DataFrame) on a frame that mixes boolean one-hot columns (the
# pandas >= 2.0 default for get_dummies) with numeric columns produces an
# object-dtype array; requesting float64 explicitly yields a proper numeric
# matrix. np.asarray also skips the copy when the input is already a float64
# array, e.g. when this cell is run a second time.
X_train = np.asarray(X_train, dtype=np.float64)
X_test = np.asarray(X_test, dtype=np.float64)


 ###  Training and Evaluating K-Nearest Neighbors (KNN) Classifier

In [61]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with the library's default settings
# NOTE(review): no feature scaling is applied before this distance-based model -- consider standardizing
knn = KNeighborsClassifier().fit(X_train, y_train)

# Predict the held-out split and report the results
y_pred = knn.predict(X_test)
print("KNN Classification")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


KNN Classification
[[ 15  57]
 [ 26 152]]
              precision    recall  f1-score   support

         bad       0.37      0.21      0.27        72
        good       0.73      0.85      0.79       178

    accuracy                           0.67       250
   macro avg       0.55      0.53      0.53       250
weighted avg       0.62      0.67      0.64       250



### Training and Evaluating SVM Classifiers

In [36]:
from sklearn.svm import SVC

# One support-vector classifier per kernel; both names stay available to later cells
svm_linear = SVC(kernel='linear')
svm_rbf = SVC(kernel='rbf')

# Fit each model on the training split, then report its performance on the test split.
# The linear kernel is processed first, so y_pred ends up holding the RBF predictions.
for title, model in (
    ("SVM Classifier with Linear Kernel", svm_linear),
    ("SVM Classifier with RBF Kernel", svm_rbf),
):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(title)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


SVM Classifier with Linear Kernel
[[ 25  47]
 [ 19 159]]
              precision    recall  f1-score   support

         bad       0.57      0.35      0.43        72
        good       0.77      0.89      0.83       178

    accuracy                           0.74       250
   macro avg       0.67      0.62      0.63       250
weighted avg       0.71      0.74      0.71       250

SVM Classifier with RBF Kernel
[[  4  68]
 [  1 177]]
              precision    recall  f1-score   support

         bad       0.80      0.06      0.10        72
        good       0.72      0.99      0.84       178

    accuracy                           0.72       250
   macro avg       0.76      0.52      0.47       250
weighted avg       0.74      0.72      0.63       250



### Report the model with the best accuracy.

In [66]:
import joblib

# Persist the linear-kernel SVM, the model with the highest test accuracy above (0.74)
BEST_MODEL_PATH = 'SVM_LINEAR_BEST_MODEL.pkl'
joblib.dump(svm_linear, BEST_MODEL_PATH)


['SVM_LINEAR_BEST_MODEL.pkl']