In [4]:
import pandas as pd
import numpy as np

# Load the raw bank-marketing data. The file is semicolon-delimited and uses
# double quotes around quoted fields.
DATA_PATH = "bank+marketing/bank-additional/bank-additional-full.csv"
df = pd.read_csv(DATA_PATH, sep=';', quotechar='"')

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
# Data cleaning.
# Drop the columns that are excluded from modelling. Using reassignment with
# errors="ignore" (instead of two in-place drops) keeps this cell idempotent:
# running it a second time no longer raises KeyError for already-removed columns.
df = df.drop(columns=["cons.price.idx", "duration"], errors="ignore")

# Binary-encode the target variable: 'no' -> 0, 'yes' -> 1 (label encoding,
# not one-hot encoding). The mapping is applied only while the column still
# holds the raw strings; re-mapping an already encoded column would replace
# every value with NaN because 0/1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'no': 0, 'yes': 1})
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,-36.4,4.857,5191.0,0


In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Separate the feature matrix from the target vector.
X = df.drop(columns=['y'])
Y = df['y']

# Features that are standardized.
numerical_cols = [
    "age", "campaign", "pdays", "previous",
    "emp.var.rate", "cons.conf.idx", "euribor3m", "nr.employed",
]
# Features that are one-hot encoded.
categorical_cols = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "day_of_week", "poutcome",
]

# Numeric columns are scaled to zero mean / unit variance.
numeric_step = ('num', StandardScaler(), numerical_cols)
# Categorical columns are one-hot encoded with the first level of each feature
# dropped; categories unseen during fit are ignored at transform time.
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Each entry is (train fraction, test fraction, display label).
partitions = [
    (0.2, 0.8, "20/80"),
    (0.5, 0.5, "50/50"),
    (0.8, 0.2, "80/20"),
]

In [7]:
#Random forest classification code.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# One summary dict per partition is collected here.
results = []

# Hyperparameter search space for the random forest. It is the same for every
# partition and trial, so it is built once up front.
parameter_grid = {
    'classifier__n_estimators': [100, 150],  # keeping fewer trees.
    'classifier__max_depth': [5, 10, 15],  # keeping trees smaller.
    'classifier__min_samples_split': [5, 10],
    'classifier__min_samples_leaf': [2, 4],
    'classifier__max_features': ['sqrt'],
}

for train_size, test_size, partition_name in partitions:
    print(f"\n{'='*50}")
    print(f"PARTITION: {partition_name} (Train/Test)")
    print(f"{'='*50}")

    current_partition_results = []
    # Three trials per partition; each trial uses its own seed (42, 43, 44)
    # for both the split and the forest.
    for trial_number in range(3):
        print(f"Trial number: {trial_number + 1}")
        seed = 42 + trial_number

        # Stratified train/test split for this trial.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, random_state=seed, stratify=Y
        )

        # Preprocessing + random forest wrapped in a single pipeline so the
        # preprocessing is fitted inside each cross-validation fold.
        forest = RandomForestClassifier(
            random_state=seed, n_jobs=-1, class_weight='balanced'
        )
        random_forest_pipeline = Pipeline(
            steps=[('preprocessor', preprocessor), ('classifier', forest)]
        )

        # 5-fold grid search over the forest hyperparameters.
        # NOTE(review): model selection uses plain accuracy while the forest is
        # trained with class_weight='balanced'; if the target is imbalanced,
        # confirm accuracy is the intended selection metric.
        grid_search = GridSearchCV(
            estimator=random_forest_pipeline,
            param_grid=parameter_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=0,
        )
        grid_search.fit(X_train, Y_train)
        best_model = grid_search.best_estimator_

        # Predictions from the refitted best pipeline on both splits.
        y_training_prediction = best_model.predict(X_train)
        y_testing_prediction = best_model.predict(X_test)

        # Accuracy on the training and held-out data.
        training_accuracy = accuracy_score(Y_train, y_training_prediction)
        testing_accuracy = accuracy_score(Y_test, y_testing_prediction)
        print(f"  Best params: {grid_search.best_params_}")
        print(f"  Train Accuracy: {training_accuracy:.4f}")
        print(f"  Test Accuracy: {testing_accuracy:.4f}")
        print(f"  CV Score: {grid_search.best_score_:.4f}")

        # Record this trial.
        current_partition_results.append({
            'partition': partition_name,
            'trial': trial_number + 1,
            'training_accuracy': training_accuracy,
            'testing_accuracy': testing_accuracy,
            'cv_score': grid_search.best_score_,
            'best_parameters': grid_search.best_params_,
        })

    # Average each metric over the trials of this partition.
    avg_train, avg_test, avg_cv = (
        np.mean([trial[metric] for trial in current_partition_results])
        for metric in ("training_accuracy", "testing_accuracy", "cv_score")
    )

    results.append({
        'partition': partition_name,
        'avg_train_accuracy': avg_train,
        'avg_test_accuracy': avg_test,
        'avg_cv_score': avg_cv,
        'trials': current_partition_results,
    })


PARTITION: 20/80 (Train/Test)
Trial number: 1
  Best params: {'classifier__max_depth': 15, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 150}
  Train Accuracy: 0.9262
  Test Accuracy: 0.8774
  CV Score: 0.8760
Trial number: 2
  Best params: {'classifier__max_depth': 15, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
  Train Accuracy: 0.9227
  Test Accuracy: 0.8751
  CV Score: 0.8773
Trial number: 3
  Best params: {'classifier__max_depth': 15, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
  Train Accuracy: 0.9202
  Test Accuracy: 0.8786
  CV Score: 0.8752

PARTITION: 50/50 (Train/Test)
Trial number: 1
  Best params: {'classifier__max_depth': 15, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2,

In [8]:
# Header for the final report.
print(f"\n{'='*60}")
print("Final summary - Random Forest on Bank Marketing Dataset")
print("WITH CLASS BALANCING (class_weight='balanced')")
print(f"{'='*60}")

# Per-partition averages, followed by a blank line after each partition.
for result in results:
    print(f"Partition: {result['partition']}")
    print(f"  Avg Train Accuracy: {result['avg_train_accuracy']:.4f}")
    print(f"  Avg CV Accuracy: {result['avg_cv_score']:.4f}")
    print(f"  Avg Test Accuracy: {result['avg_test_accuracy']:.4f}")
    print()

# Flatten the per-partition trial lists into one record per trial.
compiled_results = [
    trial
    for partition_summary in results
    for trial in partition_summary["trials"]
]

# Persist the per-trial records (one row per trial) as CSV.
random_forest_results_csv = pd.DataFrame(compiled_results)
random_forest_results_csv.to_csv("random_forest_results.csv", index=False)
print("results saved to random_forest_results.csv successfully!")


Final summary - Random Forest on Bank Marketing Dataset
WITH CLASS BALANCING (class_weight='balanced')
Partition: 20/80
  Avg Train Accuracy: 0.9230
  Avg CV Accuracy: 0.8762
  Avg Test Accuracy: 0.8770

Partition: 50/50
  Avg Train Accuracy: 0.9005
  Avg CV Accuracy: 0.8719
  Avg Test Accuracy: 0.8727

Partition: 80/20
  Avg Train Accuracy: 0.8900
  Avg CV Accuracy: 0.8694
  Avg Test Accuracy: 0.8703

results saved to random_forest_results.csv successfully!
