In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random
from sklearn.tree import export_graphviz
from IPython.display import SVG
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from graphviz import Source

import matplotlib.pyplot as plt
import seaborn as sns

Link to the dataset: https://www.kaggle.com/datasets/janiobachmann/bank-marketing-dataset

In [None]:
df = pd.read_csv("bank.csv")
df['deposit'] = df['deposit'].map({'no': 0, 'yes': 1})

## Is there something wrong with Month??

In [None]:
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
for month in months:
    df['month_' + month] = df['month'].apply(lambda x: 1 if x.lower() == month else 0)
df.drop(columns=["month"], inplace=True)
for val in df["month_mar"].unique():
    print(val)

## Handle Binary Features

In [None]:
df['default'] = df['default'].map({'no': 0, 'yes': 1})
df['housing'] = df['housing'].map({'no': 0, 'yes': 1})
df['loan'] = df['loan'].map({'no': 0, 'yes': 1})

## One Hot Encode and Prepare the data

In [None]:
X = df.drop('deposit', axis=1)
y = df['deposit'] 

display(X.columns)
display(X.head())

X = pd.get_dummies(X, dtype=int)

# campaign is a categorical variable, so we need to perform one-hot encoding on it
campaign = pd.get_dummies(X.campaign, prefix='campaign', dtype=int)
dropped = X.drop('campaign', axis=1)

# combine the one-hot encoded campaign with the original features
X = pd.concat([dropped, campaign], axis=1)
continuous_features = X[['age','balance', 'day', 'duration', 'pdays']]

for column in continuous_features:
    X[column] = (X[column] - X[column].mean()) / X[column].std()

X.head()

In [None]:
print(X.columns)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X.values, y)

In [None]:
classNames = y.unique().astype(str)
dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=X.columns,
                           class_names=classNames, 
                           filled = True)
# display the graph here
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

In [None]:
treeclf = DecisionTreeClassifier(max_depth=2, random_state=1)
treeclf.fit(X.values, y)
classNames = y.unique().astype(str)
dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=X.columns,
                           class_names=classNames, 
                           filled = True)
# display the graph here
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

In [None]:
treeclf = DecisionTreeClassifier(max_depth=4, random_state=1)
treeclf.fit(X.values, y)
classNames = y.unique().astype(str)
dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=X.columns,
                           class_names=classNames, 
                           filled = True)
# display the graph here
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

## Visualize the output

In [None]:
target_variable = 'deposit'
for feature in continuous_features:
    plt.figure(figsize=(8, 6))
    plt.scatter(X[feature], y, alpha=0.5)
    plt.title(f'Scatter Plot of {feature} vs {target_variable}')
    plt.xlabel(feature)
    plt.ylabel(target_variable)
    plt.grid(True)
    plt.show()

In [None]:
df = pd.concat([X, y], axis=1)
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
plt.title('Correlation Heatmap of All Columns In Dataframe')
plt.show()

## Remove lots of features that aren't in the decision tree to help when I want to regraph the heatmap since the above graph is unreadable

In [None]:
campaign_columns = df.filter(regex='^campaign_', axis=1).columns
df = df.drop(campaign_columns, axis=1)
job_columns = df.filter(regex='^job_', axis=1).columns
df = df.drop(job_columns, axis=1)

In [None]:
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
plt.title('Correlation Heatmap of All Columns In Dataframe')
plt.show()

## Make a Heatmap of just the months compared with the target variable since tyhe above heatmaps were clearly too complicated

In [None]:
month_columns = df.filter(regex='^month_', axis=1).columns
heatmap_df = df[['deposit'] + list(month_columns)]
correlation_matrix = heatmap_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of "deposit" and "month_" Columns')
plt.show()

## Visualize Heapmap without The Month Correlations With Deposit because we don't want it to be understandable

In [None]:
month_columns = df.filter(regex='^month_', axis=1).columns
df = df.drop(month_columns, axis=1)

In [None]:
corr_matrix = df.corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
plt.title('Correlation Heatmap of All Columns In Dataframe')
plt.show()

## Make a heatmap to show the different continuous features coorelations to the target variable since heatmapping too many variable again failed miserably

In [None]:
corr_matrix = df[['age', 'balance', 'day', 'duration', 'pdays', 'deposit']].corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
plt.title('Correlation Heatmap of All Columns In Dataframe')
plt.show()

Duration: Duration of the last contact to the potential person who will make a deposit in the bank...

## Make A decision tree with just duration

In [None]:
treeclf = DecisionTreeClassifier(max_depth=2, random_state=1)
treeclf.fit(X[["duration"]].values, y)
classNames = y.unique().astype(str)
dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=X[["duration"]].columns,
                           class_names=classNames, 
                           filled = True)
# display the graph here
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Neural Networks

In [None]:
df = pd.read_csv("bank.csv")
df['deposit'] = df['deposit'].map({'no': 0, 'yes': 1})

In [None]:
X = df.drop('deposit', axis=1)  # Drop the target variable to get the features
y = df['deposit']  # Select only the target variable

# Display the original DataFrame
# print("Original DataFrame:")
# print(df.head())
display(X.columns)
display(X.head())



# Perform one-hot encoding on the features
X = pd.get_dummies(X, dtype=int)
X = X.drop('campaign', axis=1)

# uncomment this code if you want to try encoding with the campaign
# # campaign is a categorical variable, so we need to perform one-hot encoding on it
# campaign = pd.get_dummies(X.campaign, prefix='campaign', dtype=int)
# dropped = X.drop('campaign', axis=1)

# # combine the one-hot encoded campaign with the original features
# X = pd.concat([dropped, campaign], axis=1)



continuous_features = X[['age','balance', 'day', 'duration', 'pdays']]

for column in continuous_features:
    X[column] = (X[column] - X[column].mean()) / X[column].std()

X.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

mlp = MLPClassifier(hidden_layer_sizes=(10),max_iter=500) # 1 layer, 5 nodes

mlp.fit(X_train, y_train)

predictions = mlp.predict(X_test)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
print('This dataset has {} input nodes and {} output node(s)'.format(len(X.columns), len(y.unique())))
print('There are {} 2D arrays of coefficients, one for each layer'.format(len(mlp.coefs_)))
print('The layers have the following number of coefficients: {}')
for l in range(len(mlp.coefs_)):
    m = len(mlp.coefs_[l])
    n = len(mlp.coefs_[l][0])
    print('  {}: {}x{} ({} nodes feeding into a layer of {} nodes)'.format(l, m, n, m, n))
# Print the actual coefficients
# print(mlp.coefs_)

print()
print('There are {} 1D arrays of intercepts, one for each layer'.format(len(mlp.intercepts_)))
print('Each layer has {} intercepts, one for each node'.format([len(mlp.intercepts_[l]) for l,_ in enumerate(mlp.intercepts_)]))


In [25]:
# you may need to install networkx with pip
import networkx as nx
import colorsys

def show_ann(mlp):
    hidden_layers_n = len(mlp.coefs_)-1
    layers_n = hidden_layers_n + 2
    input_neurons_n = len(mlp.coefs_[0])
    hidden_neurons_n = [len(mlp.coefs_[i+1]) for i in range(hidden_layers_n)]
    output_neurons_n = len(mlp.coefs_[-1][0])

    G = nx.DiGraph()
    pos = {}

    # Create the neurons of the input layer
    for i in range(input_neurons_n):
        pos['Layer0_{}'.format(i)] = (i,layers_n-1)

    for j in range(hidden_layers_n):
        # Create the neurons of the j'th hidden layer
        prev_layer = j
        cur_layer = j+1
        if (j == 0):
            prev_size = input_neurons_n
        else:
            prev_size = hidden_neurons_n[j-1]
        for i in range(hidden_neurons_n[j]):
            pos['Layer{}_{}'.format(cur_layer,i)] = (i,layers_n-1-cur_layer)
            for k in range(prev_size):
                w = mlp.coefs_[prev_layer][k][i]
                G.add_edge('Layer{}_{}'.format(prev_layer,k),'Layer{}_{}'.format(cur_layer,i), weight=w)

    # Create the neurons of the output layer
    prev_layer = hidden_layers_n
    cur_layer = hidden_layers_n+1
    for i in range(output_neurons_n):
        pos['Layer{}_{}'.format(cur_layer,i)] = (i,layers_n-1-cur_layer)
        for k in range(hidden_neurons_n[-1]):
            w = mlp.coefs_[prev_layer][k][i]
            G.add_edge('Layer{}_{}'.format(prev_layer,k),'Layer{}_{}'.format(cur_layer,i), weight=w)

    edges = G.edges()
    colors = [colorsys.hsv_to_rgb(0 if G[u][v]['weight'] < 0 else 0.65,
                                  1,#min(1, abs(G[u][v]['weight'])),
                                  1) for u,v in edges]
    weights = [abs(G[u][v]['weight'])*2 for u,v in edges]

    nx.draw(G, pos, node_color='y', node_size=450, width=weights, edge_color=colors)
    
show_ann(mlp)

TypeError: '_AxesStack' object is not callable

<Figure size 432x288 with 0 Axes>

In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(10, 10),max_iter=500) # 1 layer, 5 nodes

mlp.fit(X_train, y_train)

predictions = mlp.predict(X_test)

print(classification_report(y_test,predictions))

In [None]:
print('This dataset has {} input nodes and {} output node(s)'.format(len(X.columns), len(y.unique())))
print('There are {} 2D arrays of coefficients, one for each layer'.format(len(mlp.coefs_)))
print('The layers have the following number of coefficients: {}')
for l in range(len(mlp.coefs_)):
    m = len(mlp.coefs_[l])
    n = len(mlp.coefs_[l][0])
    print('  {}: {}x{} ({} nodes feeding into a layer of {} nodes)'.format(l, m, n, m, n))
# Print the actual coefficients
# print(mlp.coefs_)

print()
print('There are {} 1D arrays of intercepts, one for each layer'.format(len(mlp.intercepts_)))
print('Each layer has {} intercepts, one for each node'.format([len(mlp.intercepts_[l]) for l,_ in enumerate(mlp.intercepts_)]))

In [None]:
show_ann(mlp)

In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(20, 10, 5),max_iter=500) # 1 layer, 5 nodes

mlp.fit(X_train, y_train)

predictions = mlp.predict(X_test)

print(classification_report(y_test,predictions))

In [None]:
print('This dataset has {} input nodes and {} output node(s)'.format(len(X.columns), len(y.unique())))
print('There are {} 2D arrays of coefficients, one for each layer'.format(len(mlp.coefs_)))
print('The layers have the following number of coefficients: {}')
for l in range(len(mlp.coefs_)):
    m = len(mlp.coefs_[l])
    n = len(mlp.coefs_[l][0])
    print('  {}: {}x{} ({} nodes feeding into a layer of {} nodes)'.format(l, m, n, m, n))
# Print the actual coefficients
# print(mlp.coefs_)

print()
print('There are {} 1D arrays of intercepts, one for each layer'.format(len(mlp.intercepts_)))
print('Each layer has {} intercepts, one for each node'.format([len(mlp.intercepts_[l]) for l,_ in enumerate(mlp.intercepts_)]))

In [None]:
show_ann(mlp)