In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, Normalizer, StandardScaler, MinMaxScaler, power_transform
import scipy.stats as stats

In [3]:
# Load the prepared bank dataset and discard its first column by position
# (presumably a previously saved index column -- TODO confirm against the CSV).
bank_data_df = pd.read_csv('datasource/bank_data_df_changed_time.csv').iloc[:, 1:]

In [4]:
# Reduce skew in the continuous features.
# NOTE: this cell overwrites the columns in place, so re-running it without
# reloading the data applies the transforms a second time.

# Yeo-Johnson accepts zero and negative values.
bank_data_df['duration'] = power_transform(bank_data_df[['duration']], method='yeo-johnson')

# Box-Cox requires strictly positive input. The transformed column is then
# shifted up by the magnitude of its minimum, which puts the minimum at zero
# whenever the transformed minimum is negative.
bank_data_df['age'] = power_transform(bank_data_df[['age']], method='box-cox')
bank_data_df['age'] += abs(bank_data_df['age'].min())

# Shift balance by the magnitude of its minimum before log1p so that negative
# balances do not fall outside log1p's domain. Series.min() replaces the
# builtin min(), which iterates in Python and is order-dependent with NaN.
bank_data_df['balance'] = np.log1p(bank_data_df['balance'] + abs(bank_data_df['balance'].min()))

In [5]:
# Summary statistics of the numeric columns.
bank_data_df.describe()

Unnamed: 0,age,balance,duration,campaign,pdays,previous,y,day_of_the_year
count,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0
mean,3.135373,8.365862,1.021798e-16,2.795354,40.218473,0.545907,0.115321,174.342091
std,1.000138,0.399632,1.000138,3.144846,100.170192,1.657239,0.319453,74.662442
min,0.0,0.0,-3.104636,1.0,-1.0,0.0,0.0,7.0
25%,2.39753,8.125335,-0.6202218,1.0,-1.0,0.0,0.0,133.0
50%,3.078535,8.23337,-0.02187516,2.0,-1.0,0.0,0.0,157.5
75%,3.897703,8.474651,0.63238,3.0,-1.0,0.0,0.0,219.25
max,6.085723,10.722364,3.704733,50.0,871.0,24.0,1.0,366.0


In [6]:
# Column dtypes and non-null counts.
bank_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3616 entries, 0 to 3615
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              3616 non-null   float64
 1   job              3616 non-null   object 
 2   marital          3616 non-null   object 
 3   education        3616 non-null   object 
 4   default          3616 non-null   object 
 5   balance          3616 non-null   float64
 6   housing          3616 non-null   object 
 7   loan             3616 non-null   object 
 8   contact          3616 non-null   object 
 9   duration         3616 non-null   float64
 10  campaign         3616 non-null   int64  
 11  pdays            3616 non-null   int64  
 12  previous         3616 non-null   int64  
 13  poutcome         3616 non-null   object 
 14  y                3616 non-null   int64  
 15  day_of_the_year  3616 non-null   int64  
dtypes: float64(3), int64(5), object(8)
memory usage: 452.1+ KB


In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Names of the numeric columns; every other column is treated as categorical.
numeric_values = bank_data_df.select_dtypes(include=np.number).columns.values

# Ordinal-encode the non-numeric columns. OrdinalEncoder ignores `y`, so the
# target is not passed: supplying it only suggested a target-aware encoding.
oe = OrdinalEncoder()
X = oe.fit_transform(bank_data_df.drop(columns=numeric_values))

In [8]:
# Columns the encoder was fitted on, in the order of the encoded output.
oe.get_feature_names_out()

array(['job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'poutcome'], dtype=object)

In [9]:
# Overwrite the categorical columns with their ordinal codes.
encoded_columns = oe.get_feature_names_out()
bank_data_df.loc[:, encoded_columns] = X

In [10]:
# Let pandas infer the best nullable extension dtype for each column.
bank_data_df = bank_data_df.convert_dtypes()

In [11]:
# Summary statistics after encoding and dtype conversion.
bank_data_df.describe()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y,day_of_the_year
count,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0,3616.0
mean,3.135373,4.404867,1.1474,1.228706,0.019082,8.365862,0.568308,0.154591,0.644358,0.0,2.795354,40.218473,0.545907,2.545631,0.115321,174.342091
std,1.000138,3.272048,0.602997,0.750078,0.136832,0.399632,0.495381,0.361564,0.900076,1.000138,3.144846,100.170192,1.657239,1.005169,0.319453,74.662442
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.104636,1.0,-1.0,0.0,0.0,0.0,7.0
25%,2.39753,1.0,1.0,1.0,0.0,8.125335,0.0,0.0,0.0,-0.620222,1.0,-1.0,0.0,3.0,0.0,133.0
50%,3.078535,4.0,1.0,1.0,0.0,8.23337,1.0,0.0,0.0,-0.021875,2.0,-1.0,0.0,3.0,0.0,157.5
75%,3.897703,7.0,2.0,2.0,0.0,8.474651,1.0,0.0,2.0,0.63238,3.0,-1.0,0.0,3.0,0.0,219.25
max,6.085723,11.0,2.0,3.0,1.0,10.722364,1.0,1.0,2.0,3.704733,50.0,871.0,24.0,3.0,1.0,366.0


In [12]:
# Check the resulting column dtypes and non-null counts.
bank_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3616 entries, 0 to 3615
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              3616 non-null   Float64
 1   job              3616 non-null   Int64  
 2   marital          3616 non-null   Int64  
 3   education        3616 non-null   Int64  
 4   default          3616 non-null   Int64  
 5   balance          3616 non-null   Float64
 6   housing          3616 non-null   Int64  
 7   loan             3616 non-null   Int64  
 8   contact          3616 non-null   Int64  
 9   duration         3616 non-null   Float64
 10  campaign         3616 non-null   Int64  
 11  pdays            3616 non-null   Int64  
 12  previous         3616 non-null   Int64  
 13  poutcome         3616 non-null   Int64  
 14  y                3616 non-null   Int64  
 15  day_of_the_year  3616 non-null   Int64  
dtypes: Float64(3), Int64(13)
memory usage: 508.6 KB


In [13]:
# Persist the engineered frame; the index is not written as a column.
output_path = "datasource/data_after_another_fe.csv"
bank_data_df.to_csv(output_path, index=False)