In [2]:
!pip install pandas numpy scikit-learn category_encoders



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import category_encoders as ce
import zipfile
import requests

In [4]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
response = requests.get(url)
with open("bank-additional.zip", "wb") as file:
    file.write(response.content)

In [5]:
with zipfile.ZipFile("bank-additional.zip", 'r') as z:
    with z.open('bank-additional/bank-additional.csv') as file:
        df = pd.read_csv(file, sep=';')

In [7]:
# Handling missing values 
df['age'].fillna(df['age'].mean(), inplace=True)

In [8]:
# Encode categorical variables using different methods (Example: Ordinal encoding and Binary encoding)
ordinal_encoder = ce.OrdinalEncoder(cols=['housing']) 
binary_encoder = ce.BinaryEncoder(cols=['loan'])  



In [9]:
df_encoded = ordinal_encoder.fit_transform(df)
df_encoded = binary_encoder.fit_transform(df_encoded)

In [12]:
# Split the data into features and labels
X = df_encoded.drop('default', axis=1)
y = df_encoded['default']

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
numerical_cols = ['age']  
categorical_cols =  ['housing', 'loan']    



In [25]:
# Perform feature scaling on numerical columns
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()


In [26]:
X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [27]:
numerical_cols = ['age']  
categorical_cols = ['housing']  

In [29]:
selector_categorical = SelectKBest(score_func=chi2, k='all')  # Select the number of top categorical features you want to keep
X_train_categorical = selector_categorical.fit_transform(X_train[categorical_cols], y_train)
X_test_categorical = selector_categorical.transform(X_test[categorical_cols])

In [30]:
numerical_cols = ['age']

In [31]:
scaler = StandardScaler()
X_train_scaled = X_train[numerical_cols].copy()
X_test_scaled = X_test[numerical_cols].copy()


In [32]:
X_train_scaled[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [33]:
# Combine selected features
X_train_selected = np.hstack((X_train_scaled, X_train_categorical))
X_test_selected = np.hstack((X_test_scaled, X_test_categorical))

In [34]:
# Perform hyperparameter tuning using cross-validation
param_grid = {
    'n_estimators': [100, 200, 300],  # Specify the range of values to try for each hyperparameter
    'max_depth': [5, 10, None],
    # Add more hyperparameters and their respective values to tune
}

In [35]:
model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)

model.fit(X_train_selected, y_train)


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10, None],
                         'n_estimators': [100, 200, 300]})

In [36]:
best_params = model.best_params_
print(f"Best Hyperparameters: {best_params}")

Best Hyperparameters: {'max_depth': 5, 'n_estimators': 200}


In [37]:
# Evaluate the model using cross-validation
cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores)}")

Cross-Validation Scores: [0.81183612 0.80880121 0.81031866 0.80728376 0.80728376]
Mean Cross-Validation Score: 0.8091047040971169


In [38]:
# Evaluate the model on the test set
y_pred = model.predict(X_test_selected)

In [39]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7864077669902912


In [40]:
class_report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{class_report}")

Classification Report:
              precision    recall  f1-score   support

          no       0.79      1.00      0.88       648
     unknown       0.50      0.01      0.01       175
         yes       0.00      0.00      0.00         1

    accuracy                           0.79       824
   macro avg       0.43      0.33      0.30       824
weighted avg       0.73      0.79      0.69       824



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
