In [34]:
import pandas as pd

# Location and field separator of the raw CSV export.
RAW_CSV = 'data/bank-full.csv'
FIELD_SEP = ';'

# Parse the file into the working DataFrame used by every later cell.
df = pd.read_csv(RAW_CSV, sep=FIELD_SEP)

# Preview the first five rows to sanity-check the parse.
df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [23]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(45211, 17)

### Check data types

In [24]:
# Per-column dtype: shows which columns are numeric and which are still strings.
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

### Check duplicated rows

In [25]:
# Boolean mask that is True for every row repeating an earlier row.
is_repeat = df.duplicated()

# Display the repeated rows; an empty frame means there are none.
df[is_repeat]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y


### Check missing values

In [26]:
# Cell-wise mask of missing entries, then the count of True values per column.
missing_mask = df.isna()
missing_mask.sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [27]:
from ydata_profiling import ProfileReport

# Title shown in the report and the file it is written to.
report_title = "Bank Marketing"
report_path = "banking_report.html"

# Profile the raw frame and export the report as HTML.
profile = ProfileReport(df, title=report_title)
profile.to_file(report_path)

100%|██████████| 17/17 [00:00<00:00, 24.65it/s]1<00:00, 12.44it/s, Describe variable: y]      
Summarize dataset: 100%|██████████| 75/75 [00:11<00:00,  6.71it/s, Completed]                 
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  4.00s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 77.21it/s]


### Drop 'Duration' Col

After initial analysis there is no point in training the model on this variable, since we want to predict whether the customer will open a term deposit ("lokata" in Polish) before we call them, and the call duration is only known once the call has ended.

In [35]:
# The prediction is made before the customer is called, so the call duration
# cannot be used as a feature.
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop() would raise KeyError.
df = df.drop(columns='duration', errors='ignore')

# Confirm that 'duration' is no longer among the columns.
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

### Datatype Conversion

Now we convert the data types into a form the model can handle.

Conversion rules:
1. Numeric - no conversion
2. Categorical - preserve the ordering where it is meaningful (e.g. month, but not job)
3. Boolean - convert to binary

In [36]:
# --- Label -> integer lookup tables ------------------------------------------
# Education is encoded on an ordinal scale; 'unknown' is placed at 0.
edu_map = {'unknown': 0, 'primary': 1, 'secondary': 2, 'tertiary': 3}

# Months become their calendar number (jan=1 ... dec=12).
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_map = {m: i + 1 for i, m in enumerate(months)}

# Yes/no flags (including the target 'y') become 1/0.
binary_cols = ['default', 'housing', 'loan', 'y']
yes_no_map = {'yes': 1, 'no': 0}

# Nominal columns without a natural order are one-hot encoded.
onehot_cols = ['job', 'marital', 'contact', 'poutcome']


def _encode_labels(series, mapping):
    """Replace the string labels in ``series`` with the integers in ``mapping``.

    Parameters
    ----------
    series : pd.Series
        Column to encode.
    mapping : dict
        Label -> integer code.

    Returns
    -------
    pd.Series
        ``series`` unchanged when it is already numeric (this cell has been
        run before), otherwise the int64-encoded column.

    Raises
    ------
    ValueError
        If ``series`` holds a value that is not a key of ``mapping``.
        ``Series.map`` alone would turn such values into NaN without warning.
    """
    if pd.api.types.is_numeric_dtype(series):
        # Already encoded: mapping again would turn every value into NaN.
        return series
    encoded = series.map(mapping)
    unmapped = series[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(f"{series.name}: unmapped labels {list(unmapped)}")
    return encoded.astype('int64')


# Encode every column first and only then replace df, so a validation error
# leaves the frame untouched instead of half-converted.
encoded_cols = {
    'education': _encode_labels(df['education'], edu_map),
    'month': _encode_labels(df['month'], month_map),
}
for col in binary_cols:
    encoded_cols[col] = _encode_labels(df[col], yes_no_map)
df = df.assign(**encoded_cols)

# One Hot encoding: only columns still present are expanded, so re-running the
# cell after the dummies exist is a no-op instead of a KeyError.
pending_onehot = [c for c in onehot_cols if c in df.columns]
if pending_onehot:
    df = pd.get_dummies(df, columns=pending_onehot, drop_first=True)

df.dtypes # print result

age                  int64
education            int64
default              int64
balance              int64
housing              int64
loan                 int64
day                  int64
month                int64
campaign             int64
pdays                int64
previous             int64
y                    int64
job_blue-collar       bool
job_entrepreneur      bool
job_housemaid         bool
job_management        bool
job_retired           bool
job_self-employed     bool
job_services          bool
job_student           bool
job_technician        bool
job_unemployed        bool
job_unknown           bool
marital_married       bool
marital_single        bool
contact_telephone     bool
contact_unknown       bool
poutcome_other        bool
poutcome_success      bool
poutcome_unknown      bool
dtype: object

### One More Profiling

Let's create one more report to see how the data changed, mostly visually, since we have more columns now, and what the alerts look like.

In [30]:
# Profile the frame again after encoding and write it to a second HTML file,
# so it can be compared with the first report.
profile = ProfileReport(
    df,
    title="Bank Marketing 2",
)
profile.to_file("banking_report_v2.html")

100%|██████████| 31/31 [00:00<00:00, 313.87it/s]<00:00, 69.97it/s, Describe variable: poutcome_unknown] 
Summarize dataset: 100%|██████████| 104/104 [00:14<00:00,  7.39it/s, Completed]                        
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.70s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.25it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 59.61it/s]


In [37]:
# Check unique values to spot differences
print("Education unique values:", df['education'].unique())
print("Month unique values:", df['month'].unique())

df.head()

Education unique values: [3 2 0 1]
Month unique values: [ 5  6  7  8 10 11 12  1  2  3  4  9]


Unnamed: 0,age,education,default,balance,housing,loan,day,month,campaign,pdays,...,job_technician,job_unemployed,job_unknown,marital_married,marital_single,contact_telephone,contact_unknown,poutcome_other,poutcome_success,poutcome_unknown
0,58,3,0,2143,1,0,5,5,1,-1,...,False,False,False,True,False,False,True,False,False,True
1,44,2,0,29,1,0,5,5,1,-1,...,True,False,False,False,True,False,True,False,False,True
2,33,2,0,2,1,1,5,5,1,-1,...,False,False,False,True,False,False,True,False,False,True
3,47,0,0,1506,1,0,5,5,1,-1,...,False,False,False,True,False,False,True,False,False,True
4,33,0,0,1,0,0,5,5,1,-1,...,False,False,True,False,True,False,True,False,False,True


In [38]:
# Keep 'marital_married' as the only marital indicator.
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop() would raise KeyError.
df = df.drop(columns='marital_single', errors='ignore')

# Confirm the remaining columns and their dtypes.
df.dtypes

age                  int64
education            int64
default              int64
balance              int64
housing              int64
loan                 int64
day                  int64
month                int64
campaign             int64
pdays                int64
previous             int64
y                    int64
job_blue-collar       bool
job_entrepreneur      bool
job_housemaid         bool
job_management        bool
job_retired           bool
job_self-employed     bool
job_services          bool
job_student           bool
job_technician        bool
job_unemployed        bool
job_unknown           bool
marital_married       bool
contact_telephone     bool
contact_unknown       bool
poutcome_other        bool
poutcome_success      bool
poutcome_unknown      bool
dtype: object

### EDA Summary

The dataset is pretty clean for modelling. The 'duration' column was dropped to prevent data leakage. There were no null values. Outliers were neither analyzed nor handled. The imbalance of the target variable 'y' will be handled in the training loop and in model testing - as we will use precision instead of accuracy. 'marital_single' was dropped due to the presence of 'marital_married', which groups singles with divorced people, but it should be fine.

# Preprocessing

In [40]:
# Separate the label column from the model inputs.
label_col = 'y'
target = df[label_col]
features = df.drop(columns=label_col)

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state makes the split repeatable.
# stratify=target keeps the proportion of each class in 'y' the same in the
# train and test partitions, so the held-out metrics are not distorted by a
# split that happens to over- or under-sample the rarer class.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)



In [None]:
from sklearn.tree import DecisionTreeClassifier

# class_weight='balanced' weights each class inversely to its frequency in the
# training labels; random_state=42 fixes the estimator's randomness so repeated
# runs build the same tree.
classifier = DecisionTreeClassifier(class_weight='balanced', random_state=42)