# Lab 4: Supervised Learning – Binary Classification

### Imports

In [78]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Dataset

In [2]:
df = pd.read_csv('datasets/UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


## Exercise 1: Data Understanding and Preprocessing

### 1. Load the dataset and display its shape, info, and summary statistics.

In [3]:
df.shape

(30000, 25)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13  BILL_AMT2                   300

In [5]:
df.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


### 2. Identify and convert categorical columns (e.g., SEX, EDUCATION, MARRIAGE) using Label Encoding or One-Hot Encoding.

In [6]:
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE']

In [9]:
le = LabelEncoder()

In [10]:
df['SEX_LE'] = le.fit_transform(df[cat_cols[0]])
df['SEX_LE'].head()

0    1
1    1
2    1
3    1
4    0
Name: SEX_LE, dtype: int64

In [11]:
df1 = pd.get_dummies(df, columns=['EDUCATION'], prefix='EDU', dtype='int64')

In [12]:
df1 = pd.get_dummies(df1, columns=['MARRIAGE'], prefix='MAR', dtype='int64')

In [13]:
df1.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
       'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default.payment.next.month',
       'SEX_LE', 'EDU_0', 'EDU_1', 'EDU_2', 'EDU_3', 'EDU_4', 'EDU_5', 'EDU_6',
       'MAR_0', 'MAR_1', 'MAR_2', 'MAR_3'],
      dtype='object')

### 3. Check for missing values or duplicates and handle them appropriately.

In [14]:
df1.isnull().sum().sum()

np.int64(0)

In [15]:
df1.duplicated().sum()

np.int64(0)

### 4. Normalize/standardize numerical features (LIMIT_BAL, AGE, BILL_AMT*, PAY_AMT*).

In [31]:
sc = StandardScaler()
minMax = MinMaxScaler()

In [21]:
df1['AGE_SC'] = sc.fit_transform(df1[['AGE']])
df1['AGE_SC']

0       -1.246020
1       -1.029047
2       -0.161156
3        0.164303
4        2.334029
           ...   
29995    0.381275
29996    0.815221
29997    0.164303
29998    0.598248
29999    1.140680
Name: AGE_SC, Length: 30000, dtype: float64

In [22]:
df1['LIMIT_BAL_SC'] = sc.fit_transform(df1[['LIMIT_BAL']])
df1['LIMIT_BAL_SC']

0       -1.136720
1       -0.365981
2       -0.597202
3       -0.905498
4       -0.905498
           ...   
29995    0.404759
29996   -0.134759
29997   -1.059646
29998   -0.674276
29999   -0.905498
Name: LIMIT_BAL_SC, Length: 30000, dtype: float64

In [32]:
col_to_Min_Max = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

In [34]:
df1[col_to_Min_Max] = minMax.fit_transform(df1[col_to_Min_Max])

In [35]:
df1[col_to_Min_Max].min()

BILL_AMT1    0.0
BILL_AMT2    0.0
BILL_AMT3    0.0
BILL_AMT4    0.0
BILL_AMT5    0.0
BILL_AMT6    0.0
PAY_AMT1     0.0
PAY_AMT2     0.0
PAY_AMT3     0.0
PAY_AMT4     0.0
PAY_AMT5     0.0
PAY_AMT6     0.0
dtype: float64

In [36]:
df1[col_to_Min_Max].max()

BILL_AMT1    1.0
BILL_AMT2    1.0
BILL_AMT3    1.0
BILL_AMT4    1.0
BILL_AMT5    1.0
BILL_AMT6    1.0
PAY_AMT1     1.0
PAY_AMT2     1.0
PAY_AMT3     1.0
PAY_AMT4     1.0
PAY_AMT5     1.0
PAY_AMT6     1.0
dtype: float64

### 5. Split the data into training and testing sets (e.g., 80:20).

In [24]:
df1.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
       'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default.payment.next.month',
       'SEX_LE', 'EDU_0', 'EDU_1', 'EDU_2', 'EDU_3', 'EDU_4', 'EDU_5', 'EDU_6',
       'MAR_0', 'MAR_1', 'MAR_2', 'MAR_3', 'AGE_SC', 'LIMIT_BAL_SC'],
      dtype='object')

In [42]:
X = df1.drop(columns=['default.payment.next.month'])
y = df1['default.payment.next.month']

In [43]:
X.shape, y.shape

((30000, 36), (30000,))

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [50]:
X_train.shape, X_test.shape

((22500, 36), (7500, 36))

In [51]:
y_train.shape, y_test.shape

((22500,), (7500,))

In [88]:
lr_model = LogisticRegression(max_iter=10000)

In [89]:
lr_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [90]:
y_pred = lr_model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [91]:
def print_metrics(yt, yp):
    print("classification_report")
    print(classification_report(yt, yp))
    print("Pricition")
    print(precision_score(yt, yp))
    print("Confusion Matix")
    print(confusion_matrix(yt, yp))

In [92]:
print_metrics(y_test, y_pred)

classification_report
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      5844
           1       0.72      0.24      0.36      1656

    accuracy                           0.81      7500
   macro avg       0.77      0.61      0.62      7500
weighted avg       0.80      0.81      0.77      7500

Pricition
0.7242647058823529
Confusion Matix
[[5694  150]
 [1262  394]]


In [74]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [75]:
print_metrics(y_test, y_pred)

classification_report
              precision    recall  f1-score   support

           0       0.79      0.92      0.85      5844
           1       0.28      0.12      0.17      1656

    accuracy                           0.74      7500
   macro avg       0.53      0.52      0.51      7500
weighted avg       0.67      0.74      0.70      7500

Pricition
0.2832116788321168
Confusion Matix
[[5353  491]
 [1462  194]]


In [76]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [77]:
print_metrics(y_test, y_pred)

classification_report
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      5844
           1       0.39      0.42      0.40      1656

    accuracy                           0.73      7500
   macro avg       0.61      0.62      0.61      7500
weighted avg       0.73      0.73      0.73      7500

Pricition
0.38774373259052924
Confusion Matix
[[4745 1099]
 [ 960  696]]


In [79]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [80]:
print_metrics(y_test, y_pred)

classification_report
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      5844
           1       0.68      0.38      0.48      1656

    accuracy                           0.82      7500
   macro avg       0.76      0.66      0.69      7500
weighted avg       0.81      0.82      0.80      7500

Pricition
0.6827661909989023
Confusion Matix
[[5555  289]
 [1034  622]]
