In [172]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [173]:
df = pd.read_csv('Data/bank-full.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [174]:
df.month.unique()

array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb',
       'mar', 'apr', 'sep'], dtype=object)

In [175]:
df.default.unique()

array(['no', 'yes'], dtype=object)

In [176]:
df.campaign.unique()

array([ 1,  2,  3,  5,  4,  6,  7,  8,  9, 10, 11, 12, 13, 19, 14, 24, 16,
       32, 18, 22, 15, 17, 25, 21, 43, 51, 63, 41, 26, 28, 55, 50, 38, 23,
       20, 29, 31, 37, 30, 46, 27, 58, 33, 35, 34, 36, 39, 44])

In [177]:
df.poutcome.unique()

array(['unknown', 'failure', 'other', 'success'], dtype=object)

In [178]:
df.contact.unique()

array(['unknown', 'cellular', 'telephone'], dtype=object)

In [179]:
def age_segmenting(x):
    if x>=18 and x<=25:
        return '18-25'
    
    elif x>25 and x<=40:
        return '25-40'
    
    elif x>40 and x<=65:
        return '40-65'
    
    else:
        return '65+'

df['age'] = df['age'].map(age_segmenting)

In [180]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
0,40-65,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,40-65,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,25-40,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,40-65,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,25-40,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [181]:
def encoding_cat(data, *out):
    """This function converts all the categorical values of the DataFrame "data" into [0,1].
    
       It uses OneHotEncoder for multiple categorical values (>2)
       It uses LabelEncoder for binary categorical values
       
       - data: Pandas DataFrame
       - out: Columns of DataFrame that we don't want to convert
    """
    columns = data.keys()
    
    for col in columns:
        # Sometime not all the columns are need
        if col not in out:
            labels = list(data[col].unique())
            labels.sort()
            
            # Only with categorical values
            if isinstance(labels[0], str):
                
                # OneHotEncoder
                if len(labels)>2:
                    ohe = OneHotEncoder(sparse=False)
                    data[labels]=ohe.fit_transform(data[[col]])
                    data.drop(columns=col, inplace=True)
                
                # LabelEncoder
                else:
                    le = LabelEncoder()
                    data[col]=le.fit_transform(data[col])
    return data

encoding_cat(df, 'month')

Unnamed: 0,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,...,married,single,primary,secondary,tertiary,cellular,telephone,failure,other,success
0,0,2143,1,0,5,may,261,1,-1,0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0,29,1,0,5,may,151,1,-1,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,1,1,5,may,76,1,-1,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1506,1,0,5,may,92,1,-1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0,0,5,may,198,1,-1,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0,825,0,0,17,nov,977,3,-1,0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
45207,0,1729,0,0,17,nov,456,2,-1,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
45208,0,5715,0,0,17,nov,1127,5,184,3,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
45209,0,668,0,0,17,nov,508,4,-1,0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [182]:
def month_trans(x):
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    values = np.arange(0,12)
    
    converter = {}
    for mon, val in zip(months, values):
        converter[mon] = val
        
    return converter[x]

df['month'] = df['month'].apply(month_trans)



In [183]:
df.month.unique()

array([ 4,  5,  6,  7,  9, 10, 11,  0,  1,  2,  3,  8])

array([0.35567516, 0.43905397, 0.51880673, 0.59427479, 0.72990422,
       0.78894546, 0.84147098, 0.        , 0.09078392, 0.18081808,
       0.26935891, 0.66483486])