In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import os

In [4]:
# Path to the semicolon-delimited bank.csv. The default is the original author's
# local copy; set the BANK_CSV environment variable to point at the file on any
# other machine instead of editing this cell.
BANK_CSV = os.environ.get(
    "BANK_CSV",
    r"/home/sarthakredasani/Documents/CDAC_ML/Cases/Cases/bank/bank.csv",
)
df = pd.read_csv(BANK_CSV, sep=";")
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [5]:
# Target is the 'y' column; every other column is a predictor.
y = df['y']
X = df.drop(columns='y')

In [6]:
# Hold out 30% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=25,
)

In [7]:
# Column selectors: object-dtype columns are one-hot encoded, all remaining
# columns are passed through unchanged.
object_cols = make_column_selector(dtype_include=object)
non_object_cols = make_column_selector(dtype_exclude=object)

# Dense one-hot output, dropping the first category of each feature.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ct = make_column_transformer(
    (ohe, object_cols),
    ("passthrough", non_object_cols),
    verbose_feature_names_out=False,
)

# Shared preprocessing / model objects reused by the pipelines below.
scaler = StandardScaler()
lr = LogisticRegression()

In [8]:
# Sweep the fraction of variance PCA must retain and record the test accuracy
# of logistic regression for each setting.
props = [0.65, 0.7, 0.75, 0.8, 0.85, 0.9]
score = []
for p in props:
    # A float n_components makes PCA keep just enough components to explain
    # that fraction of the variance.
    prcomp = PCA(n_components=p).set_output(transform='pandas')
    pipe = Pipeline(steps=[
        ('CT', ct),
        ('SCL', scaler),
        ('PCA', prcomp),
        ('MODEL', lr),
    ])
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    score.append([p, accuracy_score(y_test, y_pred)])

# One row per variance target, best accuracy first.
df_score = pd.DataFrame(score, columns=['Prop', 'score'])
df_score.sort_values(by='score', ascending=False)

Unnamed: 0,Prop,score
4,0.85,0.899779
1,0.7,0.899042
2,0.75,0.898305
0,0.65,0.897568
5,0.9,0.897568
3,0.8,0.896831


In [9]:
# Candidate classifiers, each with its default hyperparameters.
lr, knn, nb = LogisticRegression(), KNeighborsClassifier(), GaussianNB()
models = [lr, knn, nb]

In [10]:
# Compare every candidate model over a wider grid of retained-variance targets.
props = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
score = []
for p in props:
    for m in models:
        # A fresh PCA per (p, model) pair; the preprocessing objects and the
        # model instance are shared and simply refit on each pass.
        prcomp = PCA(n_components=p).set_output(transform='pandas')
        pipe = Pipeline(steps=[
            ('CT', ct),
            ('SCL', scaler),
            ('PCA', prcomp),
            ('MODEL', m),
        ])
        y_pred = pipe.fit(X_train, y_train).predict(X_test)
        score.append([p, m, accuracy_score(y_test, y_pred)])

# One row per (variance target, model) pair, best accuracy first.
df_score = pd.DataFrame(score, columns=['Prop', 'Model', 'score'])
df_score.sort_values(by='score', ascending=False)

Unnamed: 0,Prop,Model,score
33,0.95,LogisticRegression(),0.901253
27,0.85,LogisticRegression(),0.899779
18,0.7,LogisticRegression(),0.899042
12,0.6,LogisticRegression(),0.899042
6,0.5,LogisticRegression(),0.898305
21,0.75,LogisticRegression(),0.898305
15,0.65,LogisticRegression(),0.897568
30,0.9,LogisticRegression(),0.897568
9,0.55,LogisticRegression(),0.897568
24,0.8,LogisticRegression(),0.896831


In [11]:
# `prcomp` is whatever PCA the last loop iteration above left behind (the final
# entry of `props`, fitted inside the last model's pipeline). Show the running
# total of explained variance across its components.
prcomp.explained_variance_ratio_.cumsum()

array([0.07423688, 0.13836251, 0.19456779, 0.24088794, 0.28075778,
       0.31539875, 0.34917203, 0.38018999, 0.41019453, 0.43911258,
       0.46744757, 0.49511557, 0.52238359, 0.54863761, 0.5745841 ,
       0.59940797, 0.62415872, 0.64869834, 0.67307145, 0.69701277,
       0.71982811, 0.74253845, 0.76459597, 0.78626953, 0.80707239,
       0.82777882, 0.84762387, 0.86665075, 0.88495863, 0.90283749,
       0.91754104, 0.93198362, 0.94506519, 0.95680031])

In [12]:
# Number of components kept by `prcomp`, the PCA left over from the last
# iteration of the preceding loop.
prcomp.n_components_

34