In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [24]:
# Step 1: Load the dataset
# Use semicolon as the delimiter
data = pd.read_csv("/content/drive/MyDrive/datasetsss/bank-full.csv", delimiter=";")

In [25]:
# Step 2: Understanding target variable
print("Target Variable Distribution:\n", data['loan'].value_counts())

# Step 3: Check for duplicates
data.drop_duplicates(inplace=True)

# Step 4: Check for missing values
print("Missing Values:\n", data.isnull().sum())

# Step 5: Handle missing values (if any)
data.fillna(method='ffill', inplace=True)

Target Variable Distribution:
 loan
no     37967
yes     7244
Name: count, dtype: int64
Missing Values:
 age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


  data.fillna(method='ffill', inplace=True)


In [26]:
# Step 6: Encoding categorical variables
categorical_columns = data.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

In [27]:
# Step 7: Feature scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop('loan', axis=1))

In [28]:
# Step 8: Splitting data
X = scaled_features
y = data['loan']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [29]:
# Step 9: Apply multiple machine learning models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC()
}

In [30]:
# Step 10: Train and evaluate models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    print(classification_report(y_test, predictions))

Logistic Regression Accuracy: 84.02%
              precision    recall  f1-score   support

           0       0.84      1.00      0.91     11398
           1       0.40      0.00      0.00      2166

    accuracy                           0.84     13564
   macro avg       0.62      0.50      0.46     13564
weighted avg       0.77      0.84      0.77     13564

Decision Tree Accuracy: 74.87%
              precision    recall  f1-score   support

           0       0.86      0.84      0.85     11398
           1       0.24      0.27      0.26      2166

    accuracy                           0.75     13564
   macro avg       0.55      0.56      0.55     13564
weighted avg       0.76      0.75      0.75     13564

Random Forest Accuracy: 84.08%
              precision    recall  f1-score   support

           0       0.85      0.99      0.91     11398
           1       0.51      0.07      0.12      2166

    accuracy                           0.84     13564
   macro avg       0.68      