# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Data Preprocessing

In [2]:
# Load the train and test sets; fields in both files are separated by ';'
train = pd.read_csv("train.csv", sep=";")
test = pd.read_csv("test.csv", sep=";")

In [3]:
# Stack the train rows on top of the test rows and renumber the index from 0
Banking_data = pd.concat(objs=[train, test], axis=0, ignore_index=True)

In [4]:
# Get the column names of the merged DataFrame
Banking_data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [5]:
# Count the missing values in each column, largest count first
(
    Banking_data
    .isna()
    .sum()
    .sort_values(ascending=False)
)

age          0
day          0
poutcome     0
previous     0
pdays        0
campaign     0
duration     0
month        0
contact      0
job          0
loan         0
housing      0
balance      0
default      0
education    0
marital      0
y            0
dtype: int64

In [6]:
# Write the merged DataFrame to a new CSV file, leaving out the row index
Banking_data.to_csv(path_or_buf="Banking_data.csv", index=False)

In [7]:
# Read the merged CSV file back in as the working DataFrame `df`
df = pd.read_csv("Banking_data.csv")

In [8]:
# Display the working DataFrame
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49727,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
49728,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
49729,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
49730,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [9]:
# Get the shape of the DataFrame as a (rows, columns) tuple
df.shape

(49732, 17)

In [10]:
# Collect the distinct values of each text-valued column
job = df["job"].unique()
month = df["month"].unique()
marital = df["marital"].unique()
education = df["education"].unique()
contact = df["contact"].unique()
poutcome = df["poutcome"].unique()

# Print every array, separated from the next one by a single blank line
print(job, month, marital, education, contact, poutcome, sep="\n\n")

['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']

['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr' 'sep']

['married' 'single' 'divorced']

['tertiary' 'secondary' 'unknown' 'primary']

['unknown' 'cellular' 'telephone']

['unknown' 'failure' 'other' 'success']


In [11]:
# Rename the prediction (target) column from 'y' to 'FDcreated'.
# The renamed frame is bound back to `df` instead of mutating the existing
# object with inplace=True, so the cell reads as a plain "new value from old
# value" step; the resulting `df` has the same data and the new column name.
df = df.rename(columns={'y': 'FDcreated'})

# Encoding

In [12]:
# Encode every text-valued column as numbers.
#
# Series.map() replaces any value that is not a key of the mapping with NaN,
# so each key has to match the raw category text exactly. For the same reason
# this cell must run only once on freshly loaded data: running it again would
# map the already-encoded numbers to NaN.

# Binary columns: yes = 1, no = 0
yes_no = {'yes': 1, 'no': 0}
for column in ['default', 'housing', 'loan', 'FDcreated']:
    df[column] = df[column].map(yes_no)

# Multi-valued columns: each category gets an integer code and 'unknown' is
# encoded as a missing value (NaN)
category_codes = {
    'marital': {'married': 1, 'single': 2, 'divorced': 3},
    'education': {'unknown': np.nan, 'tertiary': 1, 'secondary': 2, 'primary': 3},
    # The raw job category is spelled 'admin.' (with a trailing dot). Using the
    # key 'admin' would silently turn every such row into NaN.
    'job': {'unknown': np.nan, 'admin.': 1, 'unemployed': 2, 'management': 3,
            'housemaid': 4, 'entrepreneur': 5, 'student': 6, 'blue-collar': 7,
            'self-employed': 8, 'retired': 9, 'technician': 10, 'services': 11},
    'contact': {'unknown': np.nan, 'cellular': 1, 'telephone': 2},
    'month': {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
              'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12},
    'poutcome': {'unknown': np.nan, 'failure': 1, 'other': 2, 'success': 3},
}
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,FDcreated
0,58,3.0,1,1.0,0,2143,1,0,,5,5,261,1,-1,0,,0
1,44,10.0,2,2.0,0,29,1,0,,5,5,151,1,-1,0,,0
2,33,5.0,1,2.0,0,2,1,1,,5,5,76,1,-1,0,,0
3,47,7.0,1,,0,1506,1,0,,5,5,92,1,-1,0,,0
4,33,,2,,0,1,0,0,,5,5,198,1,-1,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49727,33,11.0,1,2.0,0,-333,1,0,1.0,30,7,329,5,-1,0,,0
49728,57,8.0,1,1.0,1,-3313,1,1,,9,5,153,1,-1,0,,0
49729,57,10.0,1,2.0,0,295,0,0,1.0,19,8,151,11,-1,0,,0
49730,28,7.0,1,2.0,0,1137,0,0,1.0,6,2,129,4,211,3,2.0,0


# Normalization

In [13]:
# Balance: min-max scale the column so its values fall in the range [0, 1]
minB = df['balance'].min()
maxB = df['balance'].max()

print("Minimum balance: ", minB)
print("Maximum balance: ", maxB)

# Scale the whole column at once, reusing the bounds computed above.
# Calling .min()/.max() inside a per-row lambda would rescan the entire
# column for every row, which is needlessly quadratic.
df['balance'] = (df['balance'] - minB) / (maxB - minB)

df

Minimum balance:  -8019
Maximum balance:  102127


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,FDcreated
0,58,3.0,1,1.0,0,0.092259,1,0,,5,5,261,1,-1,0,,0
1,44,10.0,2,2.0,0,0.073067,1,0,,5,5,151,1,-1,0,,0
2,33,5.0,1,2.0,0,0.072822,1,1,,5,5,76,1,-1,0,,0
3,47,7.0,1,,0,0.086476,1,0,,5,5,92,1,-1,0,,0
4,33,,2,,0,0.072812,0,0,,5,5,198,1,-1,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49727,33,11.0,1,2.0,0,0.069780,1,0,1.0,30,7,329,5,-1,0,,0
49728,57,8.0,1,1.0,1,0.042725,1,1,,9,5,153,1,-1,0,,0
49729,57,10.0,1,2.0,0,0.075482,0,0,1.0,19,8,151,11,-1,0,,0
49730,28,7.0,1,2.0,0,0.083126,0,0,1.0,6,2,129,4,211,3,2.0,0


In [14]:
# pdays: min-max scale the column so its values fall in the range [0, 1]
minPD = df['pdays'].min()
maxPD = df['pdays'].max()

print("Minimum pdays: ", minPD)
print("Maximum pdays: ", maxPD)

# Scale the whole column at once, reusing the bounds computed above.
# Calling .min()/.max() inside a per-row lambda would rescan the entire
# column for every row, which is needlessly quadratic.
df['pdays'] = (df['pdays'] - minPD) / (maxPD - minPD)

df

Minimum pdays:  -1
Maximum pdays:  871


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,FDcreated
0,58,3.0,1,1.0,0,0.092259,1,0,,5,5,261,1,0.000000,0,,0
1,44,10.0,2,2.0,0,0.073067,1,0,,5,5,151,1,0.000000,0,,0
2,33,5.0,1,2.0,0,0.072822,1,1,,5,5,76,1,0.000000,0,,0
3,47,7.0,1,,0,0.086476,1,0,,5,5,92,1,0.000000,0,,0
4,33,,2,,0,0.072812,0,0,,5,5,198,1,0.000000,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49727,33,11.0,1,2.0,0,0.069780,1,0,1.0,30,7,329,5,0.000000,0,,0
49728,57,8.0,1,1.0,1,0.042725,1,1,,9,5,153,1,0.000000,0,,0
49729,57,10.0,1,2.0,0,0.075482,0,0,1.0,19,8,151,11,0.000000,0,,0
49730,28,7.0,1,2.0,0,0.083126,0,0,1.0,6,2,129,4,0.243119,3,2.0,0
