In [None]:
import pandas as pd
import numpy as np

# Read the semicolon-separated CSV found at `file_path` into `raw`.
file_path = '/content/bank-full.csv'
raw = pd.read_csv(file_path, sep=';')

# Bare last expression: the notebook renders the frame as the cell output.
raw

In [None]:
# Split `raw` by dtype: 64-bit numeric columns vs. object (string) columns.
numerical_data = raw.select_dtypes(include=['int64', 'float64'])
categorical_data = raw.select_dtypes(include=['object'])

# describe() summary for each group of columns.
numerical_description = numerical_data.describe()
categorical_description = categorical_data.describe()

# Print each summary under its own heading (heading and table separated by a
# newline, exactly as two consecutive print calls would produce).
print("Numerical Data Description:", numerical_description, sep="\n")
print("\nCategorical Data Description:", categorical_description, sep="\n")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (with KDE overlay) of every numerical feature on a two-column grid.
# The number of rows is derived from the number of features instead of being
# hard-coded: a fixed 4x2 grid raises as soon as there are more than eight
# numerical columns. For up to eight features the layout is unchanged.
numeric_features = list(numerical_data.columns)
n_grid_cols = 2
n_grid_rows = max(1, (len(numeric_features) + n_grid_cols - 1) // n_grid_cols)

# Set up the figure
plt.figure(figsize=(10, 12))

# Plot each numerical feature in its own grid cell (subplot indices start at 1)
for i, feature in enumerate(numeric_features, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.histplot(raw[feature], kde=True, color='blue', bins=30)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Horizontal count plot of every categorical feature on a two-column grid.
categorical_features = categorical_data.columns
grid_rows = (len(categorical_features) + 1) // 2  # ceil(n / 2)

plt.figure(figsize=(12, 18))
for position, feature in enumerate(categorical_features, start=1):
    plt.subplot(grid_rows, 2, position)
    # hue=feature colours each category from the palette; legend=False
    # suppresses the legend that hue would otherwise add.
    sns.countplot(data=raw, y=feature, hue=feature, palette='viridis', legend=False)
    plt.title(f'Distribution of {feature}')
    plt.xlabel('Frequency')
    plt.ylabel(feature)
plt.tight_layout()
plt.show()

In [3]:
# Step 3: Drop columns that are not used as model inputs.
# NOTE(review): the original remark ("for benchmark purposes ... should be
# discarded if the intention is to have a realistic predictive model")
# presumably refers to 'duration'; the reason for dropping 'default' is not
# stated here -- confirm.
columns_to_drop = ['duration', 'default']
data = raw.drop(columns_to_drop, axis=1)

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence of each
# row is not counted as a duplicate).
duplicates = data.duplicated(keep='first').sum()
print(f"Number of duplicate rows: {duplicates}")

In [None]:
# Keep the first occurrence of every row and discard exact repeats.
data = data.drop_duplicates(keep='first')

In [None]:
# Report the (rows, columns) shape of `data` at this point in the notebook.
print(f"Shape of the dataset after removing duplicates: {data.shape}")

In [None]:
# Per-column count of missing entries in `data`.
print(f"\nMissing values per column:")
print(data.isna().sum())

In [None]:
# Print a heading followed by pandas' structural summary of `data`
# (DataFrame.info writes directly to standard output).
print("Dataset Info:")
data.info()

In [None]:
# Discretise 'age' into four half-open intervals [lower, upper):
#   [0, 25) youth, [25, 50) adult, [50, 75) senior, [75, 100) elderly.
# right=False makes the left edge inclusive; values outside [0, 100) get NaN.
bins_age = [0, 25, 50, 75, 100]
labels_age = ['youth', 'adult', 'senior', 'elderly']
data['age_group'] = pd.cut(
    x=data['age'],
    bins=bins_age,
    right=False,
    labels=labels_age,
)

# Discretise 'balance' the same way:
#   (-inf, 0) negative, [0, 1000) low, [1000, 5000) medium, [5000, inf) high.
bins_balance = [-np.inf, 0, 1000, 5000, np.inf]
labels_balance = ['negative', 'low', 'medium', 'high']
data['balance_category'] = pd.cut(
    x=data['balance'],
    bins=bins_balance,
    right=False,
    labels=labels_balance,
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the numerical columns.
# NOTE(review): `numerical_data` was selected from `raw`, not from the cleaned
# `data`, so it still reflects rows/columns removed later in the notebook --
# confirm that this is intended.
correlation_matrix = numerical_data.corr()

# Annotated heatmap of the matrix (values shown with two decimals).
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    cbar=True,
)
plt.title("Heatmap of Correlation Matrix")
plt.show()


In [None]:
# Names of every non-numeric feature column in `data` (object *or* category
# dtype), with the target variable 'y' removed from the list.
categorical_cols = (
    data
    .select_dtypes(include=['object', 'category'])
    .columns
    .drop('y')
)
print(f"\nCategorical Columns: {categorical_cols}")

# Show the distinct values that each categorical feature takes.
for col in categorical_cols:
    print(f"\nUnique values in '{col}': {data[col].unique()}")

In [None]:
# Features and target taken from the unmodified `raw` frame.
x = raw.drop(columns=['y'])
y = raw['y']

# Class counts of the target (rendered as the cell output).
y.value_counts()

In [None]:
import matplotlib.pyplot as plt

# Pie chart of the target class shares; slices are labelled with their
# percentage formatted to two decimals.
y.value_counts().plot(kind='pie', autopct='%.2f')

In [None]:
# Detect outliers with the IQR rule in every int64/float64 column of a frame.
def detect_outliers_iqr_all(df, k=1.5, verbose=True):
    """Collect, per numerical column, the rows lying outside the IQR fences.

    For each int64/float64 column the fences are ``Q1 - k * IQR`` and
    ``Q3 + k * IQR`` (Q1/Q3 being the 25th/75th percentiles). A row counts as
    an outlier for that column when its value is strictly below the lower
    fence or strictly above the upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. Columns of other dtypes are ignored.
    k : float, default 1.5
        Fence multiplier applied to the IQR. The default reproduces the
        previously hard-coded value.
    verbose : bool, default True
        When True (the previous behaviour), print each column's outlier rows.
        Pass False to get the result without flooding the cell output.

    Returns
    -------
    dict
        Maps each scanned column name to the sub-frame of ``df`` (all columns,
        original index) whose value in that column is an outlier.
    """
    outliers = {}
    for column in df.select_dtypes(include=['int64', 'float64']).columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - k * IQR
        upper_bound = Q3 + k * IQR

        # Strict comparisons: values exactly on a fence are not outliers.
        column_outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
        outliers[column] = column_outliers

        if verbose:
            print(f"Outliers in {column}:")
            print(column_outliers)
            print("\n")

    return outliers

# Run the IQR detector on `data`; the result maps each scanned column name to
# the frame of rows flagged as outliers for that column.
outliers_dict = detect_outliers_iqr_all(df=data)


In [None]:
# Print pandas' structural summary of `data` as it stands at this point.
data.info()

In [None]:
!pip install -U scikit-learn
!pip install -U imbalanced-learn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Preprocessing pipeline. Builds two encodings of `data` (label-encoded for
# the Random Forest, one-hot + scaled for the Neural Network), splits each
# into train/test, and balances the *training* sets only.

# Step 1: Label Encoding for Random Forest
label_encoded_data = data.copy()
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    label_encoded_data[col] = le.fit_transform(data[col])  # Transform to numeric labels
    label_encoders[col] = le  # kept so the mapping can be inverted later

# Encode the target variable ('y') for Random Forest
label_encoder_y = LabelEncoder()
label_encoded_data['y'] = label_encoder_y.fit_transform(data['y'])

# Step 2: One-Hot Encoding for Neural Networks
# drop='first' removes one indicator per feature; all non-categorical columns
# (including 'y') are passed through unchanged after the one-hot columns.
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[('onehot', one_hot_encoder, categorical_cols)],
    remainder='passthrough'
)

# Apply the preprocessor to the dataset
one_hot_encoded_data = preprocessor.fit_transform(data)

# Retrieve the fitted OneHotEncoder and get feature names
fitted_one_hot_encoder = preprocessor.named_transformers_['onehot']
one_hot_encoded_feature_names = fitted_one_hot_encoder.get_feature_names_out(categorical_cols)

# Combine feature names (one-hot columns first, then passthrough columns in
# their original order) and convert transformed data to DataFrame
all_feature_names = list(one_hot_encoded_feature_names) + list(data.drop(categorical_cols, axis=1).columns)
one_hot_encoded_data = pd.DataFrame(one_hot_encoded_data, columns=all_feature_names)

# Encode the target variable ('y') for Neural Networks (overwrites the
# passed-through string labels; assignment is positional because transform
# returns a NumPy array)
one_hot_encoded_data['y'] = label_encoder_y.transform(data['y'])

# Step 3: Feature Scaling for Neural Networks
# The scaler must learn its mean/std from training rows only; fitting it on
# the full dataset leaks test-set statistics into training. The training row
# positions are obtained with the same split settings and stratification
# target that Step 4 uses, so they identify exactly the rows that end up in
# the training split.
split_options = {'test_size': 0.2, 'random_state': 42}
train_positions, test_positions = train_test_split(
    np.arange(len(one_hot_encoded_data)),
    stratify=one_hot_encoded_data['y'],
    **split_options
)

scaler = StandardScaler()
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
scaler.fit(one_hot_encoded_data[numerical_cols].iloc[train_positions])

scaled_data = one_hot_encoded_data.copy()
scaled_data[numerical_cols] = scaler.transform(one_hot_encoded_data[numerical_cols])

# Step 4: Train-Test Split
X_rf = label_encoded_data.drop('y', axis=1)  # Features for Random Forest
y_rf = label_encoded_data['y']              # Target variable for Random Forest

X_nn = scaled_data.drop('y', axis=1)  # Features for Neural Networks
y_nn = scaled_data['y']              # Target variable for Neural Networks

X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
    X_rf, y_rf, stratify=y_rf, **split_options)
X_nn_train, X_nn_test, y_nn_train, y_nn_test = train_test_split(
    X_nn, y_nn, stratify=y_nn, **split_options)

# Guard the no-leakage guarantee: the rows the scaler was fitted on must be
# exactly the rows of the Neural Network training split.
assert np.array_equal(X_nn_train.index.to_numpy(), train_positions), \
    "scaler was fitted on rows that differ from the training split"

# Step 5: Undersample the Majority Class (training data only)
# sampling_strategy=0.5 is the minority/majority ratio *after* resampling,
# i.e. the majority class is reduced to twice the size of the minority class.
under_sampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_rf_train_under, y_rf_train_under = under_sampler.fit_resample(X_rf_train, y_rf_train)
X_nn_train_under, y_nn_train_under = under_sampler.fit_resample(X_nn_train, y_nn_train)


# Step 6: Oversample the Minority Class
smote = SMOTE(sampling_strategy=1.0, random_state=42)  # Make both classes equal
X_rf_train_balanced, y_rf_train_balanced = smote.fit_resample(X_rf_train_under, y_rf_train_under)
X_nn_train_balanced, y_nn_train_balanced = smote.fit_resample(X_nn_train_under, y_nn_train_under)

# Step 7: Display Results
# NOTE(review): the headings say "After Undersampling", but the figures
# describe the final training sets, i.e. after undersampling *and* SMOTE.
print(f"\nRandom Forest Training Set Class Distribution After Undersampling:")
print(pd.Series(y_rf_train_balanced).value_counts(normalize=True))

print("\nNeural Network Training Set Class Distribution After Undersampling:")
print(pd.Series(y_nn_train_balanced).value_counts(normalize=True))

print("\nShapes After Undersampling:")
print("Random Forest Train Shape:", X_rf_train_balanced.shape)
print("Random Forest Test Shape:", X_rf_test.shape)
print("Neural Network Train Shape:", X_nn_train_balanced.shape)
print("Neural Network Test Shape:", X_nn_test.shape)


In [None]:
# Re-attach the target to each balanced feature set so that duplicate
# detection compares features and label together.
rf_balanced_data = pd.concat((X_rf_train_balanced, y_rf_train_balanced), axis='columns')
nn_balanced_data = pd.concat((X_nn_train_balanced, y_nn_train_balanced), axis='columns')

# Number of rows that exactly repeat an earlier row in each balanced set.
rf_duplicates = rf_balanced_data.duplicated().sum()
nn_duplicates = nn_balanced_data.duplicated().sum()

print(f"Number of duplicate rows in Random Forest balanced dataset: {rf_duplicates}")
print(f"Number of duplicate rows in Neural Network balanced dataset: {nn_duplicates}")


In [None]:
# Random Forest: drop repeated rows (renumbering the index from 0), then split
# the cleaned frame back into features and target.
# NOTE(review): removing rows after balancing may leave the two classes
# slightly unequal again -- check the class counts if exact balance matters.
rf_balanced_data = rf_balanced_data.drop_duplicates(ignore_index=True)
X_rf_train_balanced = rf_balanced_data.drop(columns='y')
y_rf_train_balanced = rf_balanced_data['y']

# Neural Network: same clean-up.
nn_balanced_data = nn_balanced_data.drop_duplicates(ignore_index=True)
X_nn_train_balanced = nn_balanced_data.drop(columns='y')
y_nn_train_balanced = nn_balanced_data['y']

# Shapes of the cleaned training sets (features + target column).
print(f"Random Forest Training Set Shape After Cleaning: {rf_balanced_data.shape}")
print(f"Neural Network Training Set Shape After Cleaning: {nn_balanced_data.shape}")


In [None]:
# Persist the preprocessed train/test sets as CSV files.
# Destination paths are defined once so that the files written and the paths
# reported below cannot drift apart.
rf_train_path = "/content/rf_train_data.csv"
rf_test_path = "/content/rf_test_data.csv"
nn_train_path = "/content/nn_train_data.csv"
nn_test_path = "/content/nn_test_data.csv"

# Random Forest preprocessed datasets (features with the target as column 'y')
rf_train_data = pd.concat([pd.DataFrame(X_rf_train_balanced), pd.Series(y_rf_train_balanced, name='y')], axis=1)
rf_test_data = pd.concat([pd.DataFrame(X_rf_test), pd.Series(y_rf_test, name='y')], axis=1)

# Save Random Forest datasets to CSV (index=False: the row index is not stored)
rf_train_data.to_csv(rf_train_path, index=False)
rf_test_data.to_csv(rf_test_path, index=False)

# Neural Network preprocessed datasets
nn_train_data = pd.concat([pd.DataFrame(X_nn_train_balanced), pd.Series(y_nn_train_balanced, name='y')], axis=1)
nn_test_data = pd.concat([pd.DataFrame(X_nn_test), pd.Series(y_nn_test, name='y')], axis=1)

# Save Neural Network datasets to CSV
nn_train_data.to_csv(nn_train_path, index=False)
nn_test_data.to_csv(nn_test_path, index=False)

# The heading previously claimed an 'output' directory although every file is
# written under /content; it now names the real location.
print("\nCSV files saved in /content:")
print(f"- Random Forest Training Data: {rf_train_path}")
print(f"- Random Forest Testing Data: {rf_test_path}")
print(f"- Neural Network Training Data: {nn_train_path}")
print(f"- Neural Network Testing Data: {nn_test_path}")


In [None]:
# Works on the in-memory `rf_train_data`. To start from the saved file
# instead, load it first:
#   rf_train_data = pd.read_csv("/content/rf_train_data.csv")

x = rf_train_data.drop(columns=['y'])
y = rf_train_data['y']

# Class counts of the training target (rendered as the cell output).
y.value_counts()

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay

# Step 1: Read the saved Random Forest train/test sets back from disk
rf_train_data = pd.read_csv("/content/rf_train_data.csv")
rf_test_data = pd.read_csv("/content/rf_test_data.csv")

# Step 2: Split each frame into features and the target column 'y'
X_rf_train_balanced = rf_train_data.drop(columns='y')
y_rf_train_balanced = rf_train_data['y']
X_rf_test = rf_test_data.drop(columns='y')
y_rf_test = rf_test_data['y']

# Step 3: Configure and fit the Random Forest classifier
rf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=42,
)
rf_clf = RandomForestClassifier(**rf_params)
rf_clf.fit(X_rf_train_balanced, y_rf_train_balanced)

# Hard predictions and positive-class (column 1) probabilities, training set
y_rf_train_pred = rf_clf.predict(X_rf_train_balanced)
y_rf_train_pred_prob = rf_clf.predict_proba(X_rf_train_balanced)[:, 1]

# Hard predictions and positive-class probabilities, test set
y_rf_pred = rf_clf.predict(X_rf_test)
y_rf_pred_prob = rf_clf.predict_proba(X_rf_test)[:, 1]

# Step 4: Report metrics for both splits. Only the training report includes
# the printed confusion matrix.
rf_reports = (
    ("\nRandom Forest Training Metrics:", y_rf_train_balanced, y_rf_train_pred, y_rf_train_pred_prob, True),
    ("\nRandom Forest Test Metrics:", y_rf_test, y_rf_pred, y_rf_pred_prob, False),
)
for heading, y_true, y_hat, y_score, with_matrix in rf_reports:
    print(heading)
    print("Classification Report:")
    print(classification_report(y_true, y_hat))
    if with_matrix:
        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_hat))
    print("Accuracy:", accuracy_score(y_true, y_hat))
    print("ROC AUC Score:", roc_auc_score(y_true, y_score))


In [None]:
# Confusion matrix of the fitted Random Forest on the test split; the title is
# set on the axes that the display object drew on.
rf_cm_display = ConfusionMatrixDisplay.from_estimator(rf_clf, X_rf_test, y_rf_test)
rf_cm_display.ax_.set_title("Random Forest Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# ROC points and AUC on the training data (thresholds are not needed)
fpr_train, tpr_train, _ = roc_curve(y_rf_train_balanced, y_rf_train_pred_prob)
roc_auc_train = roc_auc_score(y_rf_train_balanced, y_rf_train_pred_prob)

# ROC points and AUC on the test data
fpr_test, tpr_test, _ = roc_curve(y_rf_test, y_rf_pred_prob)
roc_auc_test = roc_auc_score(y_rf_test, y_rf_pred_prob)

# Draw both curves plus the diagonal of a random classifier on one axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr_train, tpr_train, color='blue', label=f'Training ROC Curve (AUC = {roc_auc_train:.2f})')
ax.plot(fpr_test, tpr_test, color='green', label=f'Testing ROC Curve (AUC = {roc_auc_test:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess (AUC = 0.5)')

ax.set_title('ROC Curve for Random Forest')
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.legend(loc='lower right')
ax.grid()
plt.show()


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
import datetime

# Seed Python, NumPy and TensorFlow so that training starts from the same
# random state on every run (the Random Forest already fixes random_state=42).
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)

# Step 1: Load Data from CSV Files
nn_train_data = pd.read_csv("/content/nn_train_data.csv")
nn_test_data = pd.read_csv("/content/nn_test_data.csv")

# Step 2: Separate Features and Target
X_nn_train_balanced = nn_train_data.drop('y', axis=1)
y_nn_train_balanced = nn_train_data['y']
X_nn_test = nn_test_data.drop('y', axis=1)
y_nn_test = nn_test_data['y']

# Step 3: Build the model -- three Dense -> BatchNorm -> LeakyReLU -> Dropout
# blocks (256, 128, 64 units) followed by a single sigmoid output unit.
nn_model = Sequential([
    Input(shape=(X_nn_train_balanced.shape[1],)),  # Input layer: one value per feature

    Dense(256),  # First hidden layer
    BatchNormalization(),
    LeakyReLU(negative_slope=0.1),
    Dropout(0.3),

    Dense(128),
    BatchNormalization(),
    LeakyReLU(negative_slope=0.1),
    Dropout(0.3),

    Dense(64),
    BatchNormalization(),
    LeakyReLU(negative_slope=0.1),
    Dropout(0.3),

    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# TensorBoard Logging: one timestamped log directory per run
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train the model. The TensorBoard callback is now actually passed to fit();
# it was previously created but never used, so nothing was logged.
# NOTE(review): the test set doubles as validation data here. That is fine for
# monitoring, but it should not drive tuning or early stopping.
history = nn_model.fit(
    X_nn_train_balanced, y_nn_train_balanced,
    epochs=20,
    batch_size=32,
    validation_data=(X_nn_test, y_nn_test),
    callbacks=[tensorboard_callback],
    verbose=1
)

# Predictions on the test data: sigmoid outputs thresholded at 0.5
y_nn_pred_prob = nn_model.predict(X_nn_test).flatten()
y_nn_pred = (y_nn_pred_prob > 0.5).astype(int)

# Predictions on Training Data
y_nn_train_pred_prob = nn_model.predict(X_nn_train_balanced).flatten()
y_nn_train_pred = (y_nn_train_pred_prob > 0.5).astype(int)

# Training Data Metrics
print("\nNeural Network Classification Report (Training Data):")
print(classification_report(y_nn_train_balanced, y_nn_train_pred))
print("Accuracy (Training Data):", accuracy_score(y_nn_train_balanced, y_nn_train_pred))
print("ROC AUC Score (Training Data):", roc_auc_score(y_nn_train_balanced, y_nn_train_pred_prob))

# Evaluation Metrics (test data)
print("\nNeural Network Classification Report:")
print(classification_report(y_nn_test, y_nn_pred))
print("Accuracy:", accuracy_score(y_nn_test, y_nn_pred))
print("ROC AUC Score:", roc_auc_score(y_nn_test, y_nn_pred_prob))


In [None]:
# Neural Network: confusion matrix on the test split, drawn as an annotated
# heatmap with integer cell labels.
nn_confusion = confusion_matrix(y_nn_test, y_nn_pred)
nn_cm_ax = sns.heatmap(nn_confusion, cmap="Blues", annot=True, fmt="d")
nn_cm_ax.set_title("Neural Network Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import roc_curve

# ROC points and AUC on the training data (thresholds are not needed)
fpr_train, tpr_train, _ = roc_curve(y_nn_train_balanced, y_nn_train_pred_prob)
roc_auc_train = roc_auc_score(y_nn_train_balanced, y_nn_train_pred_prob)

# ROC points and AUC on the test data
fpr_test, tpr_test, _ = roc_curve(y_nn_test, y_nn_pred_prob)
roc_auc_test = roc_auc_score(y_nn_test, y_nn_pred_prob)

# Draw both curves plus the diagonal of a random classifier on one axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr_train, tpr_train, color='blue', label=f'Training ROC Curve (AUC = {roc_auc_train:.2f})')
ax.plot(fpr_test, tpr_test, color='green', label=f'Testing ROC Curve (AUC = {roc_auc_test:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess (AUC = 0.5)')

ax.set_title('ROC Curve Of Neural Network')
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.legend(loc='lower right')
ax.grid()
plt.show()


In [None]:
# Learning curves: per-epoch loss recorded by fit() for the training data and
# for the validation data.
fig, ax = plt.subplots(figsize=(12, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
ax.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

def _positive_class_scores(y_true, y_pred):
    """Return accuracy plus precision/recall/F1 of class '1' as a dict.

    The classification report is computed once and its '1' entry reused for
    all three class-level scores.
    """
    positive = classification_report(y_true, y_pred, output_dict=True)['1']
    return {
        'acc': accuracy_score(y_true, y_pred),
        'prec': positive['precision'],
        'rec': positive['recall'],
        'f1': positive['f1-score'],
    }


# Test-set evaluation metrics of both models
rf_eval = _positive_class_scores(y_rf_test, y_rf_pred)
nn_eval = _positive_class_scores(y_nn_test, y_nn_pred)

# Display names and the matching scores, in the same order
metrics = ['Accuracy', 'Precision', 'Recall', 'F1']
score_keys = ['acc', 'prec', 'rec', 'f1']
rf_scores = [rf_eval[key] for key in score_keys]
nn_scores = [nn_eval[key] for key in score_keys]

# Grouped bar chart: Random Forest bars at r1, Neural Network bars shifted
# right by one bar width (r2)
barWidth = 0.3
r1 = np.arange(len(metrics))
r2 = [x + barWidth for x in r1]

plt.figure(figsize=(10, 6))
for positions, scores, model_label in ((r1, rf_scores, 'Random Forest'), (r2, nn_scores, 'Neural Network')):
    plt.bar(positions, scores, width=barWidth, edgecolor='white', label=model_label)

# Configure axes: ticks sit midway between the two bars of each group
plt.xlabel('Metrics', fontweight='bold')
plt.xticks([r + barWidth / 2 for r in range(len(metrics))], metrics)
plt.ylabel('Score', fontweight='bold')
plt.ylim(0, 1)
plt.title('Model Comparison: Evaluation Metrics', fontsize=14, fontweight='bold')
plt.legend()

plt.tight_layout()
plt.show()
