In [1]:
!pip install pandas numpy scikit-learn category_encoders



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import category_encoders as ce
import zipfile
import requests

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
response = requests.get(url)
with open("bank-additional.zip", "wb") as file:
    file.write(response.content)

In [4]:
with zipfile.ZipFile("bank-additional.zip", 'r') as z:
    with z.open('bank-additional/bank-additional.csv') as file:
        df = pd.read_csv(file, sep=';')

In [5]:
# Handling missing values 
df['age'].fillna(df['age'].mean(), inplace=True)

In [6]:
# Encode categorical variables using different methods (Example: Ordinal encoding and Binary encoding)
ordinal_encoder = ce.OrdinalEncoder(cols=['housing']) 
binary_encoder = ce.BinaryEncoder(cols=['loan'])  



In [7]:
df_encoded = ordinal_encoder.fit_transform(df)
df_encoded = binary_encoder.fit_transform(df_encoded)

In [8]:
# Split the data into features and labels
X = df_encoded.drop('default', axis=1)
y = df_encoded['default']

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
numerical_cols = ['age']  
categorical_cols =  ['housing', 'loan']    



In [11]:
# Perform feature scaling on numerical columns
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()


In [12]:
X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [13]:
numerical_cols = ['age']  
categorical_cols = ['housing']  

In [14]:
selector_categorical = SelectKBest(score_func=chi2, k='all')  # Select the number of top categorical features you want to keep
X_train_categorical = selector_categorical.fit_transform(X_train[categorical_cols], y_train)
X_test_categorical = selector_categorical.transform(X_test[categorical_cols])

In [15]:
numerical_cols = ['age']

In [16]:
scaler = StandardScaler()
X_train_scaled = X_train[numerical_cols].copy()
X_test_scaled = X_test[numerical_cols].copy()


In [17]:
X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [18]:
# Combine selected features
X_train_selected = np.hstack((X_train_scaled, X_train_categorical))
X_test_selected = np.hstack((X_test_scaled, X_test_categorical))

In [19]:
# Perform hyperparameter tuning using cross-validation
param_grid = {
    'n_estimators': [100, 200, 300],  # Specify the range of values to try for each hyperparameter
    'max_depth': [5, 10, None],
    # Add more hyperparameters and their respective values to tune
}

In [20]:
model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)

model.fit(X_train_selected, y_train)


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10, None],
                         'n_estimators': [100, 200, 300]})

In [21]:
best_params = model.best_params_
print(f"Best Hyperparameters: {best_params}")

Best Hyperparameters: {'max_depth': 5, 'n_estimators': 100}


In [22]:
# Evaluate the model using cross-validation
cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores)}")

Cross-Validation Scores: [0.81183612 0.80880121 0.80880121 0.80728376 0.80728376]
Mean Cross-Validation Score: 0.8088012139605463


In [23]:
# Evaluate the model on the test set
y_pred = model.predict(X_test_selected)

In [24]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7864077669902912


In [25]:
class_report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{class_report}")

Classification Report:
              precision    recall  f1-score   support

          no       0.79      1.00      0.88       648
     unknown       0.50      0.01      0.01       175
         yes       0.00      0.00      0.00         1

    accuracy                           0.79       824
   macro avg       0.43      0.33      0.30       824
weighted avg       0.73      0.79      0.69       824



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
importances = model.best_estimator_.feature_importances_

# Print feature importances
for feature, importance in zip(X.columns, importances):
    print(f"{feature}: {importance}")



age: 0.9595498874751849
job: 0.040450112524815146


In [29]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")


Confusion Matrix:
[[647   1   0]
 [174   1   0]
 [  1   0   0]]


In [30]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision, recall, and F1-score for each class
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

# Print precision, recall, and F1-score for each class
for class_label, p, r, f in zip(range(len(precision)), precision, recall, f1):
    print(f"Class {class_label}:")
    print(f"Precision: {p}")
    print(f"Recall: {r}")
    print(f"F1-Score: {f}")
    print()


Class 0:
Precision: 0.7871046228710462
Recall: 0.9984567901234568
F1-Score: 0.8802721088435373

Class 1:
Precision: 0.5
Recall: 0.005714285714285714
F1-Score: 0.01129943502824859

Class 2:
Precision: 0.0
Recall: 0.0
F1-Score: 0.0



  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
best_model = model.best_estimator_



In [34]:
mean_cv_score = np.mean(cv_scores)
print(f"Mean Cross-Validation Score: {mean_cv_score}")


Mean Cross-Validation Score: 0.8088012139605463


In [35]:
grid_search_results = model.cv_results_
print(f"Grid Search Results: {grid_search_results}")


Grid Search Results: {'mean_fit_time': array([0.12093349, 0.25216713, 0.36074767, 0.12595263, 0.24797845,
       0.37538271, 0.12472467, 0.24813666, 0.3899425 ]), 'std_fit_time': array([0.00961498, 0.02184924, 0.00691101, 0.00798987, 0.00749218,
       0.01458099, 0.00881711, 0.00326852, 0.00650786]), 'mean_score_time': array([0.01731691, 0.02539778, 0.03206921, 0.01310692, 0.02393231,
       0.0374845 , 0.01745963, 0.02499433, 0.03436661]), 'std_score_time': array([0.00443031, 0.00752712, 0.00205354, 0.00264335, 0.0070578 ,
       0.00765729, 0.00347329, 0.00765265, 0.00624994]), 'param_max_depth': masked_array(data=[5, 5, 5, 10, 10, 10, None, None, None],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[100, 200, 300, 100, 200, 300, 100, 200, 300],
             mask=[False, False, False, False, False, False, False, False,
                   F

In [36]:
y_pred_prob = model.predict_proba(X_test_selected)


In [39]:
best_score = model.best_score_
print(f"Best Mean Cross-Validation Score: {best_score}")


Best Mean Cross-Validation Score: 0.8091047040971169


In [40]:
best_params = model.best_params_
print(f"Best Hyperparameters: {best_params}")


Best Hyperparameters: {'max_depth': 5, 'n_estimators': 100}


In [41]:
grid_search_results = model.cv_results_
print(f"Grid Search Results: {grid_search_results}")


Grid Search Results: {'mean_fit_time': array([0.12093349, 0.25216713, 0.36074767, 0.12595263, 0.24797845,
       0.37538271, 0.12472467, 0.24813666, 0.3899425 ]), 'std_fit_time': array([0.00961498, 0.02184924, 0.00691101, 0.00798987, 0.00749218,
       0.01458099, 0.00881711, 0.00326852, 0.00650786]), 'mean_score_time': array([0.01731691, 0.02539778, 0.03206921, 0.01310692, 0.02393231,
       0.0374845 , 0.01745963, 0.02499433, 0.03436661]), 'std_score_time': array([0.00443031, 0.00752712, 0.00205354, 0.00264335, 0.0070578 ,
       0.00765729, 0.00347329, 0.00765265, 0.00624994]), 'param_max_depth': masked_array(data=[5, 5, 5, 10, 10, 10, None, None, None],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'param_n_estimators': masked_array(data=[100, 200, 300, 100, 200, 300, 100, 200, 300],
             mask=[False, False, False, False, False, False, False, False,
                   F

In [42]:
mean_train_score = model.best_estimator_.score(X_train_selected, y_train)
print(f"Mean Training Score: {mean_train_score}")


Mean Training Score: 0.8118361153262519


In [43]:
mean_test_score = model.best_estimator_.score(X_test_selected, y_test)
print(f"Mean Test Score: {mean_test_score}")


Mean Test Score: 0.7864077669902912


In [44]:
best_estimator_params = model.best_estimator_.get_params()
print(f"Parameters of the Best Estimator: {best_estimator_params}")


Parameters of the Best Estimator: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
