In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder

In [3]:
# `data` is the frame exactly as read from disk; `df` is a separate copy used for
# step-by-step experiments in the cells below.
data = pd.read_parquet(path='d:/BankCustomer/data/bank_v2.parquet')
df = data.copy(deep=True)

In [4]:
# Preview the first rows of every object-typed column, transposed so each column reads as a row.
df.select_dtypes(include='object').head().transpose()

Unnamed: 0,0,1,2,3,4
job,Farmers,Farmers,technician,services,Farmers
marital,married,married,married,married,married
education,secondary,secondary,secondary,secondary,tertiary
default,no,no,no,no,no
housing,yes,no,yes,yes,no
loan,no,no,no,no,no
contact,unknown,unknown,unknown,unknown,unknown
month,may,may,may,may,may
poutcome,unknown,unknown,unknown,unknown,unknown
deposit,yes,yes,yes,yes,yes


In [5]:
# Frequency of each job category.
df['job'].value_counts()

job
management       2566
blue-collar      1944
technician       1821
Farmers          1334
services          923
retired           778
self-employed     405
student           360
unemployed        357
entrepreneur      328
housemaid         274
unknown            70
Name: count, dtype: int64

In [6]:
def encode_categorical_features(df):
    """Encode the categorical columns of ``df`` in place.

    Steps, in order:
      1. Label-encode each nominal column with its own ``LabelEncoder``.
      2. Cast ``job`` to ``int32``.
      3. Replace ``education`` with an ordinal ``int32`` rank
         (unknown=0, primary=1, secondary=2, tertiary=3).
      4. Drop the ``month`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named above. It is modified in place.

    Returns
    -------
    dict
        Maps each nominal column name to the encoder fitted on it, so the
        integer codes can later be decoded with ``inverse_transform``.

    Raises
    ------
    KeyError
        If a required column is missing, or ``education`` contains a value
        that is not a key of the ordinal mapping.
    """
    ### nominal encoding -
    nominal_cols = ['job', 'marital', 'default', 'housing', 'loan', 'contact', 'poutcome', 'deposit']
    fitted_encoders = {}
    for col in nominal_cols:
        encoder = LabelEncoder().fit(df[col])
        df[col] = encoder.transform(df[col])
        # Previously `encoder[col] = encoder`: item assignment on the encoder
        # object itself, which raises TypeError. Keep the encoder in a dict.
        fitted_encoders[col] = encoder

    ### to_int32
    df['job'] = df['job'].astype('int32')

    ### hash based ordinal encoding
    hash_edu = {'unknown':0, 'primary':1, 'secondary':2, 'tertiary':3}
    df['education'] = df['education'].apply(lambda edu: hash_edu[edu]).astype('int32')

    ### drop month
    # In-place on purpose: callers read the encoded columns back from the frame they passed in.
    df.drop(columns=['month'], inplace=True)

    return fitted_encoders

<center><b>Categorical Encoding</b></center>

In [7]:
# Demo on one column: fit an encoder on `job`, then show the integer codes it produces.
encoder = LabelEncoder()
encoder.fit(df['job'])
encoder.transform(df['job'])

array([0, 0, 9, ..., 1, 7, 9])

In [8]:
# Round trip: encode `job` to integer codes, then decode the codes back to labels.
job_codes = encoder.transform(df['job'])
encoder.inverse_transform(job_codes)

array(['Farmers', 'Farmers', 'technician', ..., 'blue-collar', 'services',
       'technician'], dtype=object)

In [9]:
# Registry that will hold one fitted encoder per column name.
encoders = dict()

In [10]:
### nominal encoding -
nominal_cols = ['job', 'marital', 'default', 'housing', 'loan', 'contact', 'poutcome', 'deposit']
for col in nominal_cols:
    encoder = LabelEncoder().fit(df[col])
    df[col] = encoder.transform(df[col])
    encoders[col] = encoder

In [11]:
# Display the registry: one LabelEncoder entry per nominal column.
encoders

{'job': LabelEncoder(),
 'marital': LabelEncoder(),
 'default': LabelEncoder(),
 'housing': LabelEncoder(),
 'loan': LabelEncoder(),
 'contact': LabelEncoder(),
 'poutcome': LabelEncoder(),
 'deposit': LabelEncoder()}

In [12]:
# Dtype check on the scratch frame after label encoding; `education` and `month`
# have not been processed yet and still show as object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 11160 non-null  int32  
 1   job                 11160 non-null  int32  
 2   marital             11160 non-null  int32  
 3   education           11160 non-null  object 
 4   default             11160 non-null  int32  
 5   balance             11160 non-null  int32  
 6   yearly_income       11160 non-null  float32
 7   number_of_children  11160 non-null  int32  
 8   housing             11160 non-null  int32  
 9   loan                11160 non-null  int32  
 10  contact             11160 non-null  int32  
 11  day                 11160 non-null  int32  
 12  month               11160 non-null  object 
 13  duration            11160 non-null  int32  
 14  campaign            11160 non-null  int32  
 15  pdays               11160 non-null  int32  
 16  prev

In [13]:
# Frequency of each education level.
df['education'].value_counts()

education
secondary    5474
tertiary     3689
primary      1500
unknown       497
Name: count, dtype: int64

In [14]:
### ordinal encoding
hash_edu = {'unknown':0, 'primary':1, 'secondary':2, 'tertiary':3}
df['education'].apply(lambda edu: hash_edu[edu])

0        2
1        2
2        2
3        2
4        3
        ..
11155    2
11156    2
11157    1
11158    2
11159    2
Name: education, Length: 11160, dtype: int64

In [15]:
# Run the complete encoding function on `data`, the frame loaded from disk.
# The function assigns into and drops columns from its argument in place.
encode_categorical_features(data)

In [16]:
# First rows of the encoded frame, transposed so each column reads as a row.
data.head().transpose()

Unnamed: 0,0,1,2,3,4
age,59.0,56.0,41.0,55.0,54.0
job,0.0,0.0,9.0,7.0,0.0
marital,1.0,1.0,1.0,1.0,1.0
education,2.0,2.0,2.0,2.0,3.0
default,0.0,0.0,0.0,0.0,0.0
balance,234300.0,4500.0,127000.0,247600.0,18400.0
yearly_income,234567.0,307600.0,500000.0,234000.0,327600.0
number_of_children,0.0,4.0,0.0,3.0,1.0
housing,1.0,0.0,1.0,1.0,0.0
loan,0.0,0.0,0.0,0.0,0.0


In [17]:
# Check dtypes and the column count of `data` after encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 11160 non-null  int32  
 1   job                 11160 non-null  int32  
 2   marital             11160 non-null  int32  
 3   education           11160 non-null  int32  
 4   default             11160 non-null  int32  
 5   balance             11160 non-null  int32  
 6   yearly_income       11160 non-null  float32
 7   number_of_children  11160 non-null  int32  
 8   housing             11160 non-null  int32  
 9   loan                11160 non-null  int32  
 10  contact             11160 non-null  int32  
 11  day                 11160 non-null  int32  
 12  duration            11160 non-null  int32  
 13  campaign            11160 non-null  int32  
 14  pdays               11160 non-null  int32  
 15  previous            11160 non-null  int32  
 16  pout

In [18]:
# Write the encoded frame to the v3 parquet file, leaving the index out of the file.
data.to_parquet(path='d:/BankCustomer/data/bank_v3.parquet', index=False)