<a href="https://colab.research.google.com/github/PallaviVangari/DataMiningAssignment3/blob/main/SEMMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import make_pipeline   # Chains StandardScaler with LogisticRegression below
from imblearn.over_sampling import SMOTE


# Read the semicolon-separated bank dataset into a DataFrame
url = "/content/bank.csv"  # Local path; point this at a URL instead if the file is hosted online
data = pd.read_csv(url, sep=';')

# Expand every categorical column into indicator columns. With drop_first=True
# the first level of each column is dropped; the code below relies on the
# target ending up as the single column 'y_yes'.
data_encoded = pd.get_dummies(data, drop_first=True)

# Separate the target from the features, then hold out 20% of rows for testing
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# class ratio should be preserved in both splits (this would change results).
y = data_encoded['y_yes']
X = data_encoded.drop(columns='y_yes')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Balance the classes with SMOTE oversampling. Only the training split is
# resampled; X_test / y_test are left exactly as produced by the split.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(
    X=X_train,
    y=y_train,
)

# Train three classifiers on the SMOTE-resampled training data. fit() returns
# the fitted estimator, so each model is built and trained in one expression.

# Logistic Regression (features are standardised inside the pipeline first)
logreg = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=42),
).fit(X_train_smote, y_train_smote)

# Decision Tree
dtree = DecisionTreeClassifier(random_state=42).fit(X_train_smote, y_train_smote)

# Random Forest
rf = RandomForestClassifier(random_state=42).fit(X_train_smote, y_train_smote)

# Predict on the held-out test split with each fitted model
y_pred_logreg = logreg.predict(X_test)
y_pred_dtree = dtree.predict(X_test)
y_pred_rf = rf.predict(X_test)

# Evaluation
# Print accuracy, precision, recall, F1 and the confusion matrix of each
# model's predictions against the held-out test labels.
models = ['Logistic Regression', 'Decision Tree', 'Random Forest']
predictions = [y_pred_logreg, y_pred_dtree, y_pred_rf]

# Walk the two parallel lists together with zip() rather than indexing
# `predictions` by position; each display name stays paired with its own
# prediction array.
for model, y_pred in zip(models, predictions):
    print(f"Performance Metrics for {model}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"F1-Score: {f1_score(y_test, y_pred)}")
    # scikit-learn lays the matrix out with true classes as rows and
    # predicted classes as columns.
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}\n")


Performance Metrics for Logistic Regression:
Accuracy: 0.8784530386740331
Precision: 0.4318181818181818
Recall: 0.3877551020408163
F1-Score: 0.4086021505376344
Confusion Matrix:
[[757  50]
 [ 60  38]]

Performance Metrics for Decision Tree:
Accuracy: 0.8530386740331491
Precision: 0.37037037037037035
Recall: 0.5102040816326531
F1-Score: 0.42918454935622313
Confusion Matrix:
[[722  85]
 [ 48  50]]

Performance Metrics for Random Forest:
Accuracy: 0.8950276243093923
Precision: 0.5223880597014925
Recall: 0.35714285714285715
F1-Score: 0.4242424242424242
Confusion Matrix:
[[775  32]
 [ 63  35]]

