In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Directory holding the data files. Set the BANK_DATA_DIR environment variable
# to run on another machine; the default is the original hard-coded location.
DATA_DIR = Path(os.environ.get('BANK_DATA_DIR', 'd:/BankCustomer/data'))
data = pd.read_csv(DATA_DIR / 'bank.csv')

In [3]:
# Work on an independent deep copy so the steps applied to `df` below
# do not touch `data`.
df = data.copy(deep=True)

In [4]:
# First rows, transposed so that each column of the table reads as one row.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
age,59,56,41,55,54
job,Farmers,Farmers,technician,services,Farmers
marital,married,married,married,married,married
education,secondary,secondary,secondary,secondary,tertiary
default,no,no,no,no,no
balance,234300,4500,127000,247600,18400
Yearly income,234567.0,307600.0,500000.0,234000.0,327600.0
Number of Children,0,4,0,3,1
housing,yes,no,yes,yes,no
loan,no,no,no,no,no


In [5]:
# Last rows, transposed so that each column of the table reads as one row.
df.tail().transpose()

Unnamed: 0,11155,11156,11157,11158,11159
age,35,34,33,39,32
job,blue-collar,blue-collar,blue-collar,services,technician
marital,married,single,single,married,single
education,secondary,secondary,primary,secondary,secondary
default,no,no,no,no,no
balance,8000,-720,100,73300,2900
Yearly income,89840.0,8085.6,1123.0,82315.0,32567.0
Number of Children,2,0,0,0,0
housing,yes,yes,yes,no,no
loan,yes,no,no,no,no


In [6]:
# Column dtypes and non-null counts of the working copy before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 11160 non-null  int64  
 1   job                 11160 non-null  object 
 2   marital             11160 non-null  object 
 3   education           11160 non-null  object 
 4   default             11160 non-null  object 
 5   balance             11160 non-null  int64  
 6   Yearly income       11160 non-null  float64
 7   Number of Children  11160 non-null  int64  
 8   housing             11160 non-null  object 
 9   loan                11160 non-null  object 
 10  contact             11160 non-null  object 
 11  day                 11160 non-null  int64  
 12  month               11160 non-null  object 
 13  duration            11160 non-null  int64  
 14  campaign            11160 non-null  int64  
 15  pdays               11160 non-null  int64  
 16  prev

In [7]:
def clean_data(df):
    """Normalize column labels and downcast numeric columns of ``df`` in place.

    Column labels are lower-cased, stripped of surrounding whitespace and have
    spaces replaced by underscores. The columns listed in ``new_types`` are
    then converted to 32-bit dtypes to reduce memory use.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Its labels must be strings, and after renaming it must
        contain every column named in ``new_types``.

    Returns
    -------
    None
        ``df`` itself is modified; nothing is returned.

    Raises
    ------
    ValueError
        If a numeric column that is to become an integer dtype holds a value
        outside that dtype's range. The check runs before any change is made,
        so ``df`` is left untouched in that case.
    KeyError
        If a column named in ``new_types`` is absent after renaming.
    """
    ### reformat columns
    new_columns = [col.lower().strip().replace(' ', '_') for col in df.columns]

    ### optimize for memory
    new_types = {
        'age': 'int32', 'balance':'int32', 
        'yearly_income':'float32', 'number_of_children':'int32',
        'duration': 'int32', 'day':'int32', 'campaign':'int32', 'pdays':'int32',
        'previous': 'int32'
    }

    # An integer downcast can wrap out-of-range values without any error, so
    # verify the value range first, before anything on df has been changed.
    for position, col in enumerate(new_columns):
        typp = new_types.get(col)
        if typp is None or not np.issubdtype(np.dtype(typp), np.integer):
            continue
        values = df.iloc[:, position]
        if not pd.api.types.is_numeric_dtype(values):
            # Non-numeric input is left to astype, which handles it as before.
            continue
        limits = np.iinfo(np.dtype(typp))
        lowest, highest = values.min(), values.max()
        # min()/max() yield a missing value for empty or all-missing columns.
        if pd.notna(lowest) and (lowest < limits.min or highest > limits.max):
            raise ValueError(
                f"column {col!r} holds values outside the {typp} range "
                f"[{limits.min}, {limits.max}]"
            )

    df.columns = new_columns
    for col, typp in new_types.items():
        df[col] = df[col].astype(typp)


<center><b>Cleaning</b></center>

In [8]:
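# Missing values: none - df.info() above reports 11160 non-null entries for every column it lists.
# Outliers: none noted (no outlier check is shown in this notebook - TODO confirm).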

In [9]:
### rename
# Preview only: the labels as they will look once lower-cased, trimmed and
# with spaces replaced by underscores. `df` is not changed here.
[label.lower().strip().replace(' ', '_') for label in df.columns]

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'yearly_income',
 'number_of_children',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'deposit']

In [10]:
# Replace the labels of `df` with their lower-case, trimmed, underscore form.
df.columns = [label.lower().strip().replace(' ', '_') for label in df.columns]

In [11]:
### conversion
# Target 32-bit dtype for each numeric column.
new_types = dict(
    age='int32', balance='int32',
    yearly_income='float32', number_of_children='int32',
    duration='int32', day='int32', campaign='int32', pdays='int32',
    previous='int32',
)

# Convert one column at a time, writing each result back into `df`.
for col in new_types:
    typp = new_types[col]
    df[col] = df[col].astype(typp)

In [12]:
# Re-inspect `df` after the label rename and dtype conversion above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 11160 non-null  int32  
 1   job                 11160 non-null  object 
 2   marital             11160 non-null  object 
 3   education           11160 non-null  object 
 4   default             11160 non-null  object 
 5   balance             11160 non-null  int32  
 6   yearly_income       11160 non-null  float32
 7   number_of_children  11160 non-null  int32  
 8   housing             11160 non-null  object 
 9   loan                11160 non-null  object 
 10  contact             11160 non-null  object 
 11  day                 11160 non-null  int32  
 12  month               11160 non-null  object 
 13  duration            11160 non-null  int32  
 14  campaign            11160 non-null  int32  
 15  pdays               11160 non-null  int32  
 16  prev

In [13]:
# Baseline view of `data` before clean_data is applied to it.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 11160 non-null  int64  
 1   job                 11160 non-null  object 
 2   marital             11160 non-null  object 
 3   education           11160 non-null  object 
 4   default             11160 non-null  object 
 5   balance             11160 non-null  int64  
 6   Yearly income       11160 non-null  float64
 7   Number of Children  11160 non-null  int64  
 8   housing             11160 non-null  object 
 9   loan                11160 non-null  object 
 10  contact             11160 non-null  object 
 11  day                 11160 non-null  int64  
 12  month               11160 non-null  object 
 13  duration            11160 non-null  int64  
 14  campaign            11160 non-null  int64  
 15  pdays               11160 non-null  int64  
 16  prev

In [14]:
# clean_data changes `data` in place (labels and dtypes) and returns None,
# so this cell displays nothing.
clean_data(data)

In [15]:
# Confirm the in-place cleaning: normalized labels and 32-bit numeric dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 11160 non-null  int32  
 1   job                 11160 non-null  object 
 2   marital             11160 non-null  object 
 3   education           11160 non-null  object 
 4   default             11160 non-null  object 
 5   balance             11160 non-null  int32  
 6   yearly_income       11160 non-null  float32
 7   number_of_children  11160 non-null  int32  
 8   housing             11160 non-null  object 
 9   loan                11160 non-null  object 
 10  contact             11160 non-null  object 
 11  day                 11160 non-null  int32  
 12  month               11160 non-null  object 
 13  duration            11160 non-null  int32  
 14  campaign            11160 non-null  int32  
 15  pdays               11160 non-null  int32  
 16  prev

In [16]:
# Save the cleaned frame as Parquet without the row index. Set the
# BANK_DATA_DIR environment variable to write elsewhere; the default is the
# original hard-coded directory.
OUTPUT_DIR = Path(os.environ.get('BANK_DATA_DIR', 'd:/BankCustomer/data'))
data.to_parquet(OUTPUT_DIR / 'bank_v2.parquet', index=False)