In [1]:
import pandas as pd
import numpy as np

# Load the semicolon-delimited CSV; fields may be wrapped in double quotes.
df = pd.read_csv(
    "bank+marketing/bank-additional/bank-additional-full.csv",
    sep=";",
    quotechar='"',
)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [2]:
# Show how often each distinct value of the cons.conf.idx column occurs.
print(df.loc[:, "cons.conf.idx"].value_counts())

cons.conf.idx
-36.4    7763
-42.7    6685
-46.2    5794
-36.1    5175
-41.8    4374
-42.0    3616
-47.1    2458
-31.4     770
-40.8     715
-26.9     447
-30.1     357
-40.3     311
-37.5     303
-50.0     282
-29.8     267
-34.8     264
-38.3     233
-39.8     229
-40.0     212
-49.5     204
-33.6     178
-34.6     174
-33.0     172
-50.8     128
-40.4      67
-45.9      10
Name: count, dtype: int64


In [3]:
# Data cleaning.
# Drop the two columns excluded from modelling in one non-in-place call.
# errors="ignore" keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the columns are already gone.
# NOTE(review): "duration" is presumably removed to avoid target leakage and
# the reason for removing "cons.price.idx" is not visible here -- confirm both.
df = df.drop(columns=["cons.price.idx", "duration"], errors="ignore")

# Binary label encoding of the target ('no' -> 0, 'yes' -> 1). This is a plain
# value mapping, not one-hot encoding. The mapping is applied only while the
# column still holds the raw labels; re-mapping an already numeric column
# would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df["y"] = df["y"].map({"no": 0, "yes": 1})
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0


In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups that drive the preprocessing below.
numerical_cols = [
    "age", "campaign", "pdays", "previous",
    "emp.var.rate", "cons.conf.idx", "euribor3m", "nr.employed",
]
categorical_cols = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "day_of_week", "poutcome",
]

# Separate the target vector from the feature matrix.
Y = df["y"]
X = df.drop(columns="y")

# Standardise the numerical columns and one-hot encode the categorical ones
# (the first level of each categorical is dropped).
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), categorical_cols),
    ]
)

# One entry per evaluated split: (train fraction, test fraction, display label).
partitions = [
    (0.2, 0.8, "20/80"),
    (0.5, 0.5, "50/50"),
    (0.8, 0.2, "80/20"),
]

In [5]:
#XG-boost classification code.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import accuracy_score
import warnings
# Silence warnings so the experiment log below stays readable.
# NOTE(review): this is a blanket, process-wide filter; it also hides
# deprecation warnings in every cell executed afterwards.
warnings.filterwarnings('ignore')

# Experiment settings, named once instead of being repeated as literals.
BASE_SEED = 42  # trial k (0-based) uses BASE_SEED + k for both the split and the model
N_TRIALS = 3    # number of re-split trials per partition
CV_FOLDS = 5    # cross-validation folds used by the grid search

# Hyperparameter grid for XGBoost. It does not depend on the partition or the
# trial, so it is built once here instead of on every loop iteration.
# Six parameters with three values each give 3**6 = 729 candidates, i.e.
# 729 * CV_FOLDS fits per trial before the final refit.
parameter_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.8, 0.9, 1.0],
    'classifier__min_child_weight': [1, 5, 10]  # Helps with imbalanced data
}

# One summary dict per partition; each summary also carries its per-trial records.
results = []
# train_size is unpacked but not passed to train_test_split: the training share
# is whatever test_size leaves over.
for train_size, test_size, partition_name in partitions:
    print(f"\n{'='*50}")
    print(f"PARTITION: {partition_name} (Train/Test)")
    print(f"{'='*50}")

    current_partition_results = []
    for trial_number in range(N_TRIALS):
        print(f"Trial number: {trial_number + 1}")
        seed = BASE_SEED + trial_number

        # Stratified split so both parts keep the class proportions of Y.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, random_state=seed, stratify=Y)

        # Negative-to-positive count ratio in the training split, passed to
        # XGBoost as scale_pos_weight to counter the class imbalance.
        negative_count = (Y_train == 0).sum()
        positive_count = (Y_train == 1).sum()
        ratio = float(negative_count / positive_count)

        xgb_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', xgb.XGBClassifier(
                random_state=seed, eval_metric='logloss', scale_pos_weight=ratio)),
        ])

        # Exhaustive hyperparameter search on the training split only.
        # NOTE(review): candidates are ranked by plain accuracy even though the
        # classifier is re-weighted for class imbalance -- confirm accuracy is
        # the intended selection metric.
        grid_search = GridSearchCV(
            xgb_pipeline, param_grid=parameter_grid,
            n_jobs=-1, verbose=0, cv=CV_FOLDS, scoring='accuracy')
        grid_search.fit(X_train, Y_train)
        best_model = grid_search.best_estimator_

        # Predictions of the best pipeline on both splits.
        y_training_prediction = best_model.predict(X_train)
        y_testing_prediction = best_model.predict(X_test)

        # Accuracy on the data the model was fitted on and on the held-out data.
        training_accuracy = accuracy_score(Y_train, y_training_prediction)
        testing_accuracy = accuracy_score(Y_test, y_testing_prediction)

        print(f"  Best params: {grid_search.best_params_}")
        print(f"  Train Accuracy: {training_accuracy:.4f}")
        print(f"  Test Accuracy: {testing_accuracy:.4f}")
        print(f"  CV Score: {grid_search.best_score_:.4f}")

        # Record this trial.
        result = {
            'partition': partition_name,
            'trial': trial_number + 1,
            'training_accuracy': training_accuracy,
            'testing_accuracy': testing_accuracy,
            'cv_score': grid_search.best_score_,
            'best_parameters': grid_search.best_params_,
        }
        current_partition_results.append(result)

    # Average the three scores over this partition's trials.
    avg_train = np.mean([r["training_accuracy"] for r in current_partition_results])
    avg_test = np.mean([r["testing_accuracy"] for r in current_partition_results])
    avg_cv = np.mean([r["cv_score"] for r in current_partition_results])

    avg_score_summary = {
        'partition': partition_name,
        'avg_train_accuracy': avg_train,
        'avg_test_accuracy': avg_test,
        'avg_cv_score': avg_cv,
        'trials': current_partition_results,
    }
    results.append(avg_score_summary)


PARTITION: 20/80 (Train/Test)
Trial number: 1
  Best params: {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 300, 'classifier__subsample': 0.9}
  Train Accuracy: 0.9817
  Test Accuracy: 0.8637
  CV Score: 0.8689
Trial number: 2
  Best params: {'classifier__colsample_bytree': 0.8, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 300, 'classifier__subsample': 0.8}
  Train Accuracy: 0.9805
  Test Accuracy: 0.8663
  CV Score: 0.8717
Trial number: 3
  Best params: {'classifier__colsample_bytree': 1.0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 300, 'classifier__subsample': 0.9}
  Train Accuracy: 0.9854
  Test Accuracy: 0.8681
  CV Score: 0.8700

PARTITION: 50/50 (Train/Test)
Trial number: 1
  Best params: {'classifier__co

In [6]:
# Final report: per-partition averages first, then every trial written to one CSV.
banner = "=" * 60
print(f"\n{banner}")
print("Final summary - XGBoost on Bank Marketing Dataset")
print(f"{banner}")

# (label shown in the report, key in the partition summary), in print order.
summary_fields = (
    ("Train", "avg_train_accuracy"),
    ("CV", "avg_cv_score"),
    ("Test", "avg_test_accuracy"),
)
for result in results:
    print(f"Partition: {result['partition']}")
    for label, key in summary_fields:
        print(f"  Avg {label} Accuracy: {result[key]:.4f}")
    print()

# Flatten the per-partition trial lists into one list of trial records,
# keeping partition order and trial order.
compiled_results = [trial for summary in results for trial in summary["trials"]]

# One CSV row per trial.
xgboost_results_csv = pd.DataFrame(compiled_results)
xgboost_results_csv.to_csv("xgboost_results.csv", index=False)
print("results saved to xgboost_results.csv successfully!")


Final summary - XGBoost on Bank Marketing Dataset
Partition: 20/80
  Avg Train Accuracy: 0.9825
  Avg CV Accuracy: 0.8702
  Avg Test Accuracy: 0.8661

Partition: 50/50
  Avg Train Accuracy: 0.9439
  Avg CV Accuracy: 0.8550
  Avg Test Accuracy: 0.8520

Partition: 80/20
  Avg Train Accuracy: 0.8968
  Avg CV Accuracy: 0.8496
  Avg Test Accuracy: 0.8494

results saved to xgboost_results.csv successfully!
