In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load the CSV file into a DataFrame
df = pd.read_csv('bank-full.csv', delimiter=';')

# Display the first few rows of the DataFrame
print(df.head())
print(df['housing'].unique())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
['yes' 'no']


In [2]:
from sklearn.metrics import confusion_matrix, accuracy_score

def calculate_metrics(y_true, y_pred):
    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)

    # Calculate True Positive Rate (Sensitivity/Recall)
    tpr = tp / (tp + fn) if (tp + fn) != 0 else 0

    # Calculate True Negative Rate (Specificity)
    tnr = tn / (tn + fp) if (tn + fp) != 0 else 0

    return accuracy, tpr, tnr

In [3]:
# Mapping for month conversion
month_mapping = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
}
df['month'] = df['month'].map(month_mapping)
# Convert 'housing', 'default', and 'loan' to boolean
df['housing'] = df['housing'].map({'yes': True, 'no': False})
df['default'] = df['default'].map({'yes': True, 'no': False})
df['loan'] = df['loan'].map({'yes': True, 'no': False})

# Separate the dataset into features (X) and target (y)
X = df.drop('y', axis=1)
y = df['y']

# Create dummy variables for the remaining categorical features in X
X_dummies = pd.get_dummies(X)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, test_size=0.2, random_state=42)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36168 entries, 3344 to 15795
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  36168 non-null  int64
 1   default              36168 non-null  bool 
 2   balance              36168 non-null  int64
 3   housing              36168 non-null  bool 
 4   loan                 36168 non-null  bool 
 5   day                  36168 non-null  int64
 6   month                36168 non-null  int64
 7   duration             36168 non-null  int64
 8   campaign             36168 non-null  int64
 9   pdays                36168 non-null  int64
 10  previous             36168 non-null  int64
 11  job_admin.           36168 non-null  bool 
 12  job_blue-collar      36168 non-null  bool 
 13  job_entrepreneur     36168 non-null  bool 
 14  job_housemaid        36168 non-null  bool 
 15  job_management       36168 non-null  bool 
 16  job_retired          361

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import time
X_train_KNN = np.array(X_train)
X_test_KNN = np.array(X_test)
# K-Nearest Neighbors pipeline
knn_pipeline = Pipeline([
    ('knn', KNeighborsClassifier(n_neighbors=5))
])

# Start timing
start_time = time.time()

# Fit the model
knn_pipeline.fit(X_train_KNN, y_train)

# Calculate runtime
knn_train_time = time.time() - start_time

# Score on train and test sets
knn_train_score = np.mean(knn_pipeline.predict(X_train_KNN)== y_train)
knn_test_score = np.mean(knn_pipeline.predict(X_test_KNN)== y_test)
knn_predictions = knn_pipeline.predict(X_test_KNN)
knn_accuracy, knn_tpr, knn_tnr = calculate_metrics(y_test, knn_predictions)
knn_train_time, knn_train_score, knn_accuracy, knn_tpr, knn_tnr

(0.06169271469116211,
 0.9118004866180048,
 0.875704965166427,
 0.27956003666361134,
 0.9574949698189135)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import time

# Logistic Regression pipeline
logreg_pipeline = Pipeline([
    ('logreg', LogisticRegression(max_iter=10000))
])

# Start timing
start_time = time.time()

# Fit the model
logreg_pipeline.fit(X_train, y_train)

# Calculate runtime
logreg_train_time = time.time() - start_time

# Score on train and test sets
logreg_train_score = logreg_pipeline.score(X_train, y_train)
logreg_test_score = logreg_pipeline.score(X_test, y_test)
logreg_predictions = logreg_pipeline.predict(X_test)
logreg_accuracy, logreg_tpr, logreg_tnr = calculate_metrics(y_test, logreg_predictions)
logreg_train_time, logreg_train_score, logreg_accuracy, logreg_tpr, logreg_tnr

(2.164151191711426,
 0.9010451227604512,
 0.8979321021784806,
 0.32997250229147573,
 0.9758551307847082)

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree pipeline
dtree_pipeline = Pipeline([
    ('dtree', DecisionTreeClassifier())
])

# Start timing
start_time = time.time()

# Fit the model
dtree_pipeline.fit(X_train, y_train)

# Calculate runtime
dtree_train_time = time.time() - start_time

# Score on train and test sets
dtree_predictions = dtree_pipeline.predict(X_test)
dtree_accuracy, dtree_tpr, dtree_tnr = calculate_metrics(y_test, dtree_predictions)
dtree_train_time, dtree_accuracy, dtree_tpr, dtree_tnr

(0.24161672592163086,
 0.873382726971138,
 0.4940421631530706,
 0.9254275653923542)

In [7]:
from sklearn.svm import SVC

# Support Vector Machine pipeline
svm_pipeline = Pipeline([
    ('svm', SVC())
])

# Start timing
start_time = time.time()

# Fit the model
svm_pipeline.fit(X_train, y_train)

# Calculate runtime
svm_train_time = time.time() - start_time

# Score on train and test sets
svm_predictions = svm_pipeline.predict(X_test)
svm_accuracy, svm_tpr, svm_tnr = calculate_metrics(y_test, svm_predictions)
svm_train_time,svm_accuracy, svm_tpr, svm_tnr


(12.310591220855713,
 0.8794647793873714,
 0.006416131989000917,
 0.9992454728370221)

In [8]:
results_df = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Decision Tree', 'SVM'],
    'Training Time (seconds)': [knn_train_time, logreg_train_time, dtree_train_time, svm_train_time],
    'Test Accuracy': [knn_accuracy, logreg_accuracy, dtree_accuracy, svm_accuracy],
    'True Positive Rate': [knn_tpr, logreg_tpr, dtree_tpr, svm_tpr],
    'True Negative Rate': [knn_tnr, logreg_tnr, dtree_tnr, svm_tnr]
})
results_df

Unnamed: 0,Model,Training Time (seconds),Test Accuracy,True Positive Rate,True Negative Rate
0,KNN,0.061693,0.875705,0.27956,0.957495
1,Logistic Regression,2.164151,0.897932,0.329973,0.975855
2,Decision Tree,0.241617,0.873383,0.494042,0.925428
3,SVM,12.310591,0.879465,0.006416,0.999245


In [9]:

# K-Nearest Neighbors pipeline
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])

# Start timing
start_time = time.time()

# Fit the model
knn_pipeline.fit(X_train_KNN, y_train)

# Calculate runtime
knn_train_time = time.time() - start_time

# Score on train and test sets
knn_predictions = knn_pipeline.predict(X_test_KNN)
knn_accuracy, knn_tpr, knn_tnr = calculate_metrics(y_test, knn_predictions)
knn_train_time, knn_train_score, knn_accuracy, knn_tpr, knn_tnr

(0.07643747329711914,
 0.9118004866180048,
 0.8909653875926131,
 0.31897341888175984,
 0.9694416498993964)

In [10]:
# Logistic Regression pipeline
logreg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=10000))
])

# Start timing
start_time = time.time()

# Fit the model
logreg_pipeline.fit(X_train, y_train)

# Calculate runtime
logreg_train_time = time.time() - start_time

# Score on train and test sets
logreg_predictions = logreg_pipeline.predict(X_test)
logreg_accuracy, logreg_tpr, logreg_tnr = calculate_metrics(y_test, logreg_predictions)
logreg_train_time, logreg_train_score, logreg_accuracy, logreg_tpr, logreg_tnr

(0.0880274772644043,
 0.9010451227604512,
 0.8977109366360722,
 0.3290559120073327,
 0.9757293762575453)

In [11]:

# Support Vector Machine pipeline
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Start timing
start_time = time.time()

# Fit the model
svm_pipeline.fit(X_train, y_train)

# Calculate runtime
svm_train_time = time.time() - start_time

# Score on train and test sets
svm_predictions = svm_pipeline.predict(X_test)
svm_accuracy, svm_tpr, svm_tnr = calculate_metrics(y_test, svm_predictions)
svm_train_time,svm_accuracy, svm_tpr, svm_tnr


(14.3332200050354, 0.8967156916952339, 0.27956003666361134, 0.9813883299798792)

In [12]:
results_df = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Decision Tree', 'SVM'],
    'Training Time (seconds)': [knn_train_time, logreg_train_time, dtree_train_time, svm_train_time],
    'Test Accuracy': [knn_accuracy, logreg_accuracy, dtree_accuracy, svm_accuracy],
    'True Positive Rate': [knn_tpr, logreg_tpr, dtree_tpr, svm_tpr],
    'True Negative Rate': [knn_tnr, logreg_tnr, dtree_tnr, svm_tnr]
})

In [13]:
y_test.value_counts()[0]/len(y_test)

  y_test.value_counts()[0]/len(y_test)


0.8793541966161672

In [14]:
y_test.value_counts()

y
no     7952
yes    1091
Name: count, dtype: int64

In [17]:

# K-Nearest Neighbors pipeline
# Custom weight function

    
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])

# Start timing
start_time = time.time()

# Fit the model
knn_pipeline.fit(X_train_KNN, y_train)

# Calculate runtime
knn_train_time = time.time() - start_time

# Score on train and test sets
knn_predictions = knn_pipeline.predict(X_test_KNN)

X_test_scaled = knn_pipeline.named_steps['scaler'].transform(X_test)

# Use the KNN model to find the neighbors
knn_model = knn_pipeline.named_steps['knn']
neighbors = knn_model.kneighbors(X_test_scaled, return_distance=False)

threshold = 0.3  # 30%
custom_predictions = []
for neighbor_indices in neighbors:
    # Count 'yes' neighbors
    yes_votes = sum(y_train.iloc[neighbor_indices] == 'yes')
    # Apply the custom decision rule
    if yes_votes / len(neighbor_indices) >= threshold:
        custom_predictions.append('yes')
    else:
        custom_predictions.append('no')

# Convert to a NumPy array for performance
custom_predictions = np.array(custom_predictions)



knn_accuracy, knn_tpr, knn_tnr = calculate_metrics(y_test, custom_predictions)
knn_train_time, knn_train_score, knn_accuracy, knn_tpr, knn_tnr



(0.08244132995605469,
 0.9118004866180048,
 0.8719451509454826,
 0.5041246562786434,
 0.9224094567404426)

In [15]:
results_df

Unnamed: 0,Model,Training Time (seconds),Test Accuracy,True Positive Rate,True Negative Rate
0,KNN,0.076437,0.890965,0.318973,0.969442
1,Logistic Regression,0.088027,0.897711,0.329056,0.975729
2,Decision Tree,0.241617,0.873383,0.494042,0.925428
3,SVM,14.33322,0.896716,0.27956,0.981388


(0.24678349494934082,
 0.877031958420878,
 0.4995417048579285,
 0.9288229376257545)