### Data cleaning

Clean the raw bank marketing data and split it into client, campaign and economics tables.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the raw campaign export into a DataFrame and preview its first five rows.
bank_df = pd.read_csv(filepath_or_buffer='bank_marketing.csv')
bank_df.head(n=5)

Unnamed: 0,client_id,age,job,marital,education,credit_default,mortgage,month,day,contact_duration,number_contacts,previous_campaign_contacts,previous_outcome,cons_price_idx,euribor_three_months,campaign_outcome
0,0,56,housemaid,married,basic.4y,no,no,may,13,261,1,0,nonexistent,93.994,4.857,no
1,1,57,services,married,high.school,unknown,no,may,19,149,1,0,nonexistent,93.994,4.857,no
2,2,37,services,married,high.school,no,yes,may,23,226,1,0,nonexistent,93.994,4.857,no
3,3,40,admin.,married,basic.6y,no,no,may,27,151,1,0,nonexistent,93.994,4.857,no
4,4,56,services,married,high.school,no,no,may,3,307,1,0,nonexistent,93.994,4.857,no


In [4]:
# Show the dtype pandas inferred for every column of the freshly loaded frame.
bank_df.dtypes

client_id                       int64
age                             int64
job                            object
marital                        object
education                      object
credit_default                 object
mortgage                       object
month                          object
day                             int64
contact_duration                int64
number_contacts                 int64
previous_campaign_contacts      int64
previous_outcome               object
cons_price_idx                float64
euribor_three_months          float64
campaign_outcome               object
dtype: object

In [7]:
# Print the value frequencies of the four flag columns, one delimited section each.
flag_columns = ('credit_default', 'mortgage', 'previous_outcome', 'campaign_outcome')
for flag_name in flag_columns:
    print('---------------------------', bank_df[flag_name].value_counts(), sep='\n')

---------------------------
credit_default
0    41185
1        3
Name: count, dtype: int64
---------------------------
mortgage
1    21576
0    19612
Name: count, dtype: int64
---------------------------
previous_outcome
0    39815
1     1373
Name: count, dtype: int64
---------------------------
campaign_outcome
0    36548
1     4640
Name: count, dtype: int64


In [8]:
# Encode the four flag columns as integers:
# 'yes' / 'success' -> 1, 'no' / 'unknown' / 'failure' / 'nonexistent' -> 0.
FLAG_COLUMNS = ['credit_default', 'mortgage', 'previous_outcome', 'campaign_outcome']
FLAG_ENCODING = {'yes': 1, 'unknown': 0, 'no': 0, 'success': 1, 'failure': 0, 'nonexistent': 0}

for col in FLAG_COLUMNS:
    # Element-wise lookup instead of Series.replace: replacing strings with
    # ints relies on replace() silently downcasting the object column, which
    # pandas 2.2+ deprecates (FutureWarning). Values absent from the mapping
    # pass through unchanged, so re-running the cell on already-encoded data
    # is still a no-op and an unexpected label still makes astype(int) raise.
    bank_df[col] = bank_df[col].map(lambda value: FLAG_ENCODING.get(value, value)).astype(int)

# Show the encoded distribution of every flag column.
for col in FLAG_COLUMNS:
    print('---------------------------')
    print(bank_df[col].value_counts())

---------------------------
credit_default
0    41185
1        3
Name: count, dtype: int64
---------------------------
mortgage
1    21576
0    19612
Name: count, dtype: int64
---------------------------
previous_outcome
0    39815
1     1373
Name: count, dtype: int64
---------------------------
campaign_outcome
0    36548
1     4640
Name: count, dtype: int64


In [9]:
# Cast the four flag columns to bool in a single column-subset assignment.
bool_columns = ['credit_default', 'mortgage', 'previous_outcome', 'campaign_outcome']
bank_df[bool_columns] = bank_df[bool_columns].astype(bool)

In [10]:
# Normalise the job and education labels: dots become underscores, and the
# 'unknown' placeholder in education becomes a missing value (NaN).
#
# regex=False makes the literal match explicit. Read as a regular expression,
# "." matches every character, and the default of `regex` differs between
# pandas versions, so the unqualified call is version-dependent.
bank_df['job'] = bank_df['job'].str.replace(".", "_", regex=False)

bank_df['education'] = (
    bank_df['education']
    .str.replace(".", "_", regex=False)
    .replace("unknown", np.nan)
)

# Sanity check: no 'unknown' placeholder may survive in education.
assert 'unknown' not in bank_df['education'].values, "education still contains 'unknown'"

In [12]:
# Build the `last_contact_date` column from the month-name and day columns.
CAMPAIGN_YEAR = 2022  # the loaded columns include no year, so every contact is assigned this one
MONTH_NUMBERS = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}

# Month abbreviation -> month number. All twelve months are mapped so a
# 'jan'/'feb' row cannot crash the int cast. Values that are not month names
# pass through unchanged, so re-running the cell on already-converted data
# still works and an unexpected label still makes astype(int) raise.
bank_df['month'] = bank_df['month'].map(lambda name: MONTH_NUMBERS.get(name, name)).astype(int)
bank_df['year'] = CAMPAIGN_YEAR

# Assemble the date from its year/month/day components instead of
# concatenating strings and relying on format inference.
bank_df['last_contact_date'] = pd.to_datetime(bank_df[['year', 'month', 'day']])

# Leave year, month and day as strings, exactly as this cell did before.
for part in ('month', 'year', 'day'):
    bank_df[part] = bank_df[part].astype(str)

In [13]:
# Split the cleaned frame into three tables; each one carries client_id.
client_columns = [
    'client_id', 'age', 'job', 'marital', 'education', 'credit_default', 'mortgage',
]
campaign_columns = [
    'client_id', 'number_contacts', 'contact_duration', 'previous_campaign_contacts',
    'previous_outcome', 'campaign_outcome', 'last_contact_date',
]
economics_columns = ['client_id', 'cons_price_idx', 'euribor_three_months']

client_df = bank_df[client_columns]
campaign_df = bank_df[campaign_columns]
economics_df = bank_df[economics_columns]

In [18]:
# Write each table to its own CSV file, omitting the index column.
# DataFrame.to_csv returns None when it is given a path, so its result is not
# bound to a name (the earlier `client = ...` style assignments only stored None).
for table, csv_path in (
    (client_df, "client_df.csv"),
    (campaign_df, "campaign_df.csv"),
    (economics_df, "economics.csv"),
):
    table.to_csv(csv_path, index=False)

In [20]:
# Preview the first five rows of the client table.
client_df.head(n=5)

Unnamed: 0,client_id,age,job,marital,education,credit_default,mortgage
0,0,56,housemaid,married,basic_4y,False,False
1,1,57,services,married,high_school,False,False
2,2,37,services,married,high_school,False,True
3,3,40,admin_,married,basic_6y,False,False
4,4,56,services,married,high_school,False,False


In [21]:
# Preview the first five rows of the campaign table.
campaign_df.head(n=5)

Unnamed: 0,client_id,number_contacts,contact_duration,previous_campaign_contacts,previous_outcome,campaign_outcome,last_contact_date
0,0,1,261,0,False,False,2022-05-13
1,1,1,149,0,False,False,2022-05-19
2,2,1,226,0,False,False,2022-05-23
3,3,1,151,0,False,False,2022-05-27
4,4,1,307,0,False,False,2022-05-03


In [22]:
# Preview the first five rows of the economics table.
economics_df.head(n=5)

Unnamed: 0,client_id,cons_price_idx,euribor_three_months
0,0,93.994,4.857
1,1,93.994,4.857
2,2,93.994,4.857
3,3,93.994,4.857
4,4,93.994,4.857
