In [8]:
!pip install xgboost




In [9]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = r"C:\Users\samia\OneDrive\Documents\GitHub\springboard\Project Proposal\bank+marketing\bank-additional\bank-additional\bank-additional-full.csv"

# Check if the file exists and load the data
if os.path.exists(file_path):
    bank_additional_full = pd.read_csv(file_path, sep=';')  # Ensure correct separator
else:
    print(f"File not found: {file_path}")
    exit()

In [10]:
# Handle missing values
bank_additional_full.fillna(bank_additional_full.median(numeric_only=True), inplace=True)

# Identify feature columns
target_column = "y"  # Update if needed
if target_column not in bank_additional_full.columns:
    print("Error: Target column not found.")
    exit()

X = bank_additional_full.drop(columns=[target_column])
y = bank_additional_full[target_column].map({'yes': 1, 'no': 0})  # Convert target to binary

In [12]:
print(X_train.shape)
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

(32950, 20)


In [13]:
# Convert categorical columns to dummies
categorical_columns = X_train.select_dtypes(include=['object']).columns
X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)


In [14]:
# Ensure train & test have the same columns
missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0  # Add missing columns in test set
X_test = X_test[X_train.columns]  # Reorder columns to match training set


In [15]:
# Standardize Numeric Features
numeric_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
X_train.loc[:, numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test.loc[:, numeric_columns] = scaler.transform(X_test[numeric_columns])

In [16]:
# Train and evaluate Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.915877640203933
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      7310
           1       0.67      0.49      0.57       928

    accuracy                           0.92      8238
   macro avg       0.81      0.73      0.76      8238
weighted avg       0.91      0.92      0.91      8238



In [17]:
# Train and evaluate XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.9174556931294003
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      7310
           1       0.66      0.55      0.60       928

    accuracy                           0.92      8238
   macro avg       0.80      0.76      0.78      8238
weighted avg       0.91      0.92      0.91      8238



In [18]:
# Train and evaluate Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.9159990288905074
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95      7310
           1       0.71      0.43      0.54       928

    accuracy                           0.92      8238
   macro avg       0.82      0.70      0.75      8238
weighted avg       0.91      0.92      0.91      8238

