In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from feature_engine.creation import MathFeatures, RelativeFeatures, CyclicalFeatures

In [3]:
# Load the v3 bank dataset from a hard-coded absolute path.
# NOTE(review): this path only resolves on a machine with this exact d:/ layout;
# consider a configurable data directory.
data = pd.read_parquet('d:/BankCustomer/data/bank_v3.parquet')
# `df` is a separate copy used for the exploratory cells; `data` is the frame
# later passed to feature_engineering and written back out.
df = data.copy()

In [4]:
# Summarise column names, non-null counts and dtypes of the exploration copy.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 11160 non-null  int32  
 1   job                 11160 non-null  int32  
 2   marital             11160 non-null  int32  
 3   education           11160 non-null  int32  
 4   default             11160 non-null  int32  
 5   balance             11160 non-null  int32  
 6   yearly_income       11160 non-null  float32
 7   number_of_children  11160 non-null  int32  
 8   housing             11160 non-null  int32  
 9   loan                11160 non-null  int32  
 10  contact             11160 non-null  int32  
 11  day                 11160 non-null  int32  
 12  duration            11160 non-null  int32  
 13  campaign            11160 non-null  int32  
 14  pdays               11160 non-null  int32  
 15  previous            11160 non-null  int32  
 16  pout

In [5]:
# Show the first five rows transposed, so each column of the frame reads as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
age,59.0,56.0,41.0,55.0,54.0
job,0.0,0.0,9.0,7.0,0.0
marital,1.0,1.0,1.0,1.0,1.0
education,2.0,2.0,2.0,2.0,3.0
default,0.0,0.0,0.0,0.0,0.0
balance,234300.0,4500.0,127000.0,247600.0,18400.0
yearly_income,234567.0,307600.0,500000.0,234000.0,327600.0
number_of_children,0.0,4.0,0.0,3.0,1.0
housing,1.0,0.0,1.0,1.0,0.0
loan,0.0,0.0,0.0,0.0,0.0


<center><h1>Feature Engineering</h1></center>

In [6]:
def feature_engineering(df: pd.DataFrame) -> bool:
    """Add ratio and interaction features to ``df`` in place.

    Columns written to ``df`` on success:

    * for each of ``yearly_income`` and ``balance``, that variable divided by
      ``number_of_children + 1``; the column names are produced by
      ``RelativeFeatures`` (e.g. ``yearly_income_div_number_of_children``).
      The ``+ 1`` shift is presumably there so that customers with zero
      children do not produce a zero denominator.
    * ``income_mul_age``: ``yearly_income * age``.

    The ``+ 1`` shift is applied to a scratch frame, never to ``df`` itself,
    and every new column is computed before any of them is written. A failure
    while computing therefore leaves ``df`` exactly as it was passed in.

    Args:
        df: Frame containing ``yearly_income``, ``balance``,
            ``number_of_children`` and ``age``. Modified in place.

    Returns:
        ``True`` when all features were added; ``False`` if any step raised.
        The exception is logged rather than propagated.
    """
    import logging

    ratio_inputs = ['yearly_income', 'balance', 'number_of_children']
    try:
        # Build the shifted denominator on a scratch frame holding only the
        # columns the transformer needs; the caller's column is not touched.
        scratch = df[ratio_inputs].assign(
            number_of_children=df['number_of_children'] + 1
        )
        relative_features = RelativeFeatures(
            variables=['yearly_income', 'balance'], reference=['number_of_children'], func=['div'],
            fill_value=None, missing_values='ignore', drop_original=False
        )
        transformed = relative_features.fit_transform(scratch)

        # Whatever the transformer added beyond its inputs is a new feature;
        # this avoids relying on the input frame having a fixed column count.
        ratio_features = transformed.drop(columns=ratio_inputs)

        # Compute the interaction before writing anything, so that a missing
        # `age` column cannot leave df half-updated.
        income_mul_age = df['yearly_income'] * df['age']

        # All computations succeeded: write the new columns, ratios first.
        for name in ratio_features.columns:
            df[name] = ratio_features[name]
        df['income_mul_age'] = income_mul_age

        return True
    except Exception:
        # Keep the boolean contract, but leave a traceback instead of
        # discarding the error silently.
        logging.getLogger(__name__).exception('feature_engineering failed')
        return False

In [7]:
# Shift number_of_children by +1 on the exploration copy before previewing the
# ratio features; presumably so zero-children rows do not divide by zero.
# NOTE(review): self-referential update; re-running this cell increments the column again.
df['number_of_children'] = df['number_of_children'] + 1

In [8]:
# Transformer that divides yearly_income and balance by number_of_children,
# keeping the original columns alongside the derived ones.
relativeFeature = RelativeFeatures(
    variables=['yearly_income', 'balance'],
    reference=['number_of_children'],
    func=['div'],
    fill_value=None,
    missing_values='ignore',
    drop_original=False,
)

In [9]:
# Preview the transformer output on the exploration copy: the existing columns
# plus yearly_income_div_number_of_children and balance_div_number_of_children.
relativeFeature.fit_transform(df).head()

Unnamed: 0,age,job,marital,education,default,balance,yearly_income,number_of_children,housing,loan,contact,day,duration,campaign,pdays,previous,poutcome,deposit,yearly_income_div_number_of_children,balance_div_number_of_children
0,59,0,1,2,0,234300,234567.0,1,1,0,2,5,1042,1,-1,0,3,1,234567.0,234300.0
1,56,0,1,2,0,4500,307600.0,5,0,0,2,5,1467,1,-1,0,3,1,61520.0,900.0
2,41,9,1,2,0,127000,500000.0,1,1,0,2,5,1389,1,-1,0,3,1,500000.0,127000.0
3,55,7,1,2,0,247600,234000.0,4,1,0,2,5,579,1,-1,0,3,1,58500.0,61900.0
4,54,0,1,3,0,18400,327600.0,2,0,0,2,5,673,2,-1,0,3,1,163800.0,9200.0


In [10]:
# Undo the temporary +1 shift applied to the exploration copy before the preview.
df['number_of_children'] = df['number_of_children'] - 1

In [11]:
# Add the engineered columns to `data` in place; the call evaluates to True on
# success and False if any step raised.
# NOTE(review): the result is only displayed, not checked, before `data` is saved below.
feature_engineering(data)

True

In [12]:
# Inspect the first five rows of `data` (transposed) after feature_engineering has run on it.
data.head().T

Unnamed: 0,0,1,2,3,4
age,59.0,56.0,41.0,55.0,54.0
job,0.0,0.0,9.0,7.0,0.0
marital,1.0,1.0,1.0,1.0,1.0
education,2.0,2.0,2.0,2.0,3.0
default,0.0,0.0,0.0,0.0,0.0
balance,234300.0,4500.0,127000.0,247600.0,18400.0
yearly_income,234567.0,307600.0,500000.0,234000.0,327600.0
number_of_children,0.0,4.0,0.0,3.0,1.0
housing,1.0,0.0,1.0,1.0,0.0
loan,0.0,0.0,0.0,0.0,0.0


In [13]:
# Write the frame out as the v4 dataset, without the index.
# NOTE(review): hard-coded absolute path; only resolves on a machine with this d:/ layout.
data.to_parquet('d:/BankCustomer/data/bank_v4.parquet', index=False)