In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# You may need to install the Python graphviz library. At the command line:
#   pip install graphviz
# You will also need to install the graphviz executables. You can use apt,
# macports, or other installer for your system.
from graphviz import Source
from IPython.display import SVG
from sklearn import tree
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [None]:
# Read the raw bank data and recode the target column 'deposit'
# from its 'no'/'yes' labels to 0/1 in a single chained step.
df = pd.read_csv("bank.csv").assign(
    deposit=lambda frame: frame['deposit'].map({'no': 0, 'yes': 1})
)

## Is there something wrong with the month column?

In [None]:
# Build one 0/1 indicator column per calendar month, then drop the original
# text column.
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Lower-case the column once (vectorised) instead of calling a Python lambda
# for every row of every month. Missing values compare unequal to every month
# name, so they yield 0 rather than raising.
month_lower = df['month'].str.lower()

for month in months:
    # New column 'month_<month>': 1 where the row falls in that month, else 0.
    df['month_' + month] = (month_lower == month).astype('int64')

# Drop the original 'month' column (returns a new frame; no inplace mutation).
df = df.drop(columns=["month"])

# Sanity check: print the distinct values present in the March indicator.
for val in df["month_mar"].unique():
    print(val)

## Handle Binary Features

In [None]:
# Recode each yes/no feature column to 1/0 using one shared lookup table.
yes_no_codes = {'no': 0, 'yes': 1}
for flag_col in ('default', 'housing', 'loan'):
    df[flag_col] = df[flag_col].map(yes_no_codes)

## One Hot Encode and Prepare the data

In [None]:
# Split the frame into the target (y) and the predictor columns (X).
y = df['deposit']
X = df.drop(columns='deposit')

# Show the feature names and a preview of the predictors before encoding.
display(X.columns)
display(X.head())

# One-hot encode the predictors; indicator columns are stored as ints.
X = pd.get_dummies(X, dtype=int)

# 'campaign' is treated as categorical here, so it is expanded into its own
# set of 'campaign_<value>' indicator columns.
# NOTE(review): confirm that 'campaign' really is categorical rather than a count.
dropped = X.drop(columns='campaign')
campaign = pd.get_dummies(X['campaign'], prefix='campaign', dtype=int)

# Reattach the campaign indicators to the remaining features.
X = pd.concat([dropped, campaign], axis=1)

# Copy of the continuous columns, taken before they are standardised below;
# iterating over it yields the column names.
continuous_features = X[['age', 'balance', 'day', 'duration', 'pdays']]

# Standardise each continuous column: subtract its mean, divide by its std.
for column in continuous_features.columns:
    centered = X[column] - X[column].mean()
    X[column] = centered / X[column].std()

X.head()

In [None]:
# Print the column index of the encoded feature matrix X.
print(X.columns)

In [None]:
# Fit a depth-3 decision tree on all encoded features.
# (DecisionTreeClassifier and `tree` are imported in the notebook's top import
# cell rather than mid-notebook.)
# X.values passes a plain array, so the model is fitted without column names;
# random_state is fixed so the fitted tree is reproducible.
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X.values, y)

## Is there something wrong with March?

In [None]:
# Sanity check: list every distinct value found in the 'month_mar' column of X.
month_mar_values = X["month_mar"].unique()
for val in month_mar_values:
    print(val)

In [None]:
# Render the fitted tree as an SVG via graphviz.
# class_names must follow the classifier's own (ascending) class order.
# y.unique() returns labels in order of first appearance, which can swap the
# class labels shown in the plot, so take the order from treeclf.classes_.
classNames = [str(label) for label in treeclf.classes_]
dot = tree.export_graphviz(treeclf, out_file=None,
                           # plain lists are accepted by every sklearn version
                           feature_names=list(X.columns),
                           class_names=classNames,
                           filled=True)
# display the graph here
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

In [None]:
# Refit with a shallower tree (depth 2) on all features and render it.
treeclf = DecisionTreeClassifier(max_depth=2, random_state=1)
treeclf.fit(X.values, y)

# class_names must follow the classifier's own (ascending) class order.
# y.unique() returns labels in order of first appearance, which can swap the
# class labels shown in the plot, so take the order from treeclf.classes_.
classNames = [str(label) for label in treeclf.classes_]
dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=list(X.columns),
                           class_names=classNames,
                           filled=True)
# display the graph here
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

In [None]:
# Refit with a deeper tree (depth 4) on all features and render it.
treeclf = DecisionTreeClassifier(max_depth=4, random_state=1)
treeclf.fit(X.values, y)

# class_names must follow the classifier's own (ascending) class order.
# y.unique() returns labels in order of first appearance, which can swap the
# class labels shown in the plot, so take the order from treeclf.classes_.
classNames = [str(label) for label in treeclf.classes_]
dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=list(X.columns),
                           class_names=classNames,
                           filled=True)
# display the graph here
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

## Visualize the output

In [None]:
target_variable = 'deposit'

# One figure per continuous feature: feature value on x, target on y.
for feature in continuous_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(X[feature], y, alpha=0.5)
    ax.set_title(f'Scatter Plot of {feature} vs {target_variable}')
    ax.set_xlabel(feature)
    ax.set_ylabel(target_variable)
    ax.grid(True)
    plt.show()

In [None]:
# Rebuild df as the encoded features plus the target, then correlate
# every pair of columns.
df = pd.concat([X, y], axis=1)
corr_matrix = df.corr()

# Draw the annotated correlation heatmap on an explicit axes object.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    annot_kws={"size": 10},
    ax=ax,
)
ax.set_title('Correlation Heatmap of All Columns In Dataframe')
plt.show()

## Remove the many features that are not used by the decision tree, so the regraphed heatmap is easier to read

In [None]:
# Get a list of column names starting with "campaign_"
campaign_columns = df.filter(regex='^campaign_', axis=1).columns

# Drop the campaign columns from the DataFrame
df = df.drop(campaign_columns, axis=1)

# Get a list of column names starting with "job_"
job_columns = df.filter(regex='^job_', axis=1).columns

# Drop the job columns from the DataFrame
df = df.drop(job_columns, axis=1)

In [None]:
# Correlate the columns that remain in df.
corr_matrix = df.corr()

# Annotated heatmap of those correlations, drawn on an explicit axes.
heatmap_style = dict(annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)
ax.set_title('Correlation Heatmap of All Columns In Dataframe')
plt.show()

## Make a Heatmap of just the months compared with the target variable

In [None]:
# Get a list of column names starting with "month_"
month_columns = df.filter(regex='^month_', axis=1).columns

# Extract the target variable "deposit" along with the month columns
heatmap_df = df[['deposit'] + list(month_columns)]

# Calculate the correlation matrix
correlation_matrix = heatmap_df.corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of "deposit" and "month_" Columns')
plt.show()

## Visualize The Months Correlations With Deposit

In [None]:
# Summarise the March indicator instead of printing one line per row:
# value_counts() shows each distinct value and how often it occurs.
X["month_mar"].value_counts()

In [None]:
# Get a list of column names starting with "month_"
month_columns = df.filter(regex='^month_', axis=1).columns

# Drop the month columns from the DataFrame
df = df.drop(month_columns, axis=1)

In [None]:
# Recompute the correlations for the columns still present in df.
corr_matrix = df.corr()

# Annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    ax=ax,
    annot=True,
    annot_kws={"size": 10},
    fmt=".2f",
    cmap='coolwarm',
)
ax.set_title('Correlation Heatmap of All Columns In Dataframe')
plt.show()

## Make a heatmap to show the correlations of the continuous features with the target variable

In [None]:
# Correlations among the continuous features and the target only.
corr_matrix = df[['age', 'balance', 'day', 'duration', 'pdays', 'deposit']].corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 10})
# The title names the columns actually plotted; the previous copy-pasted title
# claimed the heatmap covered every column in the DataFrame.
plt.title('Correlation Heatmap of Continuous Features and "deposit"')
plt.show()

Duration: the length of the last contact with the potential customer, i.e. the person who might make a deposit with the bank.

## Make a decision tree with just duration

In [None]:
# Fit a depth-2 tree that sees only the 'duration' feature and render it.
duration_only = ["duration"]
treeclf = DecisionTreeClassifier(max_depth=2, random_state=1)
treeclf.fit(X[duration_only].values, y)

# class_names must follow the classifier's own (ascending) class order.
# y.unique() returns labels in order of first appearance, which can swap the
# class labels shown in the plot, so take the order from treeclf.classes_.
classNames = [str(label) for label in treeclf.classes_]
dot = tree.export_graphviz(treeclf, out_file=None,
                           feature_names=duration_only,
                           class_names=classNames,
                           filled=True)
# display the graph here
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)