In [28]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder,OrdinalEncoder,FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [29]:
df=pd.read_csv('data/bank-full.csv',sep=';')
#df=df.drop(columns=['Unnamed: 0'],axis=1)

In [30]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [36]:
df['education'].unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [37]:
df['month'].unique()

array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb',
       'mar', 'apr', 'sep'], dtype=object)

In [31]:
X=df.drop(columns=['y'],axis=1)
y=df['y']

In [32]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [33]:
numeric_columns=['age','balance','day','duration','campaign','pdays','previous']
categorical_columns=['job','marital','default','housing','loan','contact','poutcome']
ordinal_columns=['education','month']

# Define the order of categories for ordinal encoding
education_order = ['primary', 'secondary', 'tertiary']
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

numeric_pipeline=Pipeline(steps=[("Scaler",StandardScaler())])
categorical_pipeline=Pipeline(steps=[("One-Hot Encoder",OneHotEncoder())])
ordinal_pipeline=Pipeline(steps=[("Ordinal Encoding",OrdinalEncoder(categories=[education_order,month_order]))])

preprocessor=ColumnTransformer([("Numeric Pipeline",numeric_pipeline,numeric_columns),
                                ("Categorical Pipeline",categorical_pipeline,categorical_columns),
                                ("Ordinal Encoding COlumns",ordinal_pipeline,ordinal_columns)])

In [34]:
X_train_arr=preprocessor.fit_transform(X_train)
X_test_arr=preprocessor.transform(X_test)

ValueError: Found unknown categories ['unknown'] in column 0 during fit

In [24]:
X_train=pd.DataFrame(X_train_arr)
X_test=pd.DataFrame(X_test_arr)

In [25]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-0.464799,-0.456185,0.140619,-0.404011,0.389277,-0.410655,-0.240512,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,5.0
1,1.416343,-0.390344,0.380915,0.210292,0.072215,-0.410655,-0.240512,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,10.0
2,0.475772,-0.456185,-1.301157,-0.674459,-0.244847,-0.410655,-0.240512,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,5.0
3,0.005486,0.694704,-1.781749,0.171657,-0.561910,0.781281,1.812098,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,9.0
4,-0.276685,-0.456185,0.501063,-0.647414,-0.561910,-0.410655,-0.240512,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31642,0.287658,-0.100438,0.260767,7.091265,-0.561910,-0.410655,-0.240512,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,5.0
31643,-1.687541,-0.285534,-0.940713,-0.183789,-0.561910,0.513095,0.170010,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,8.0
31644,-0.652913,-0.013768,-0.099677,-0.071746,-0.561910,-0.410655,-0.240512,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0
31645,-0.746970,-0.400757,-1.060861,-0.566280,-0.561910,-0.410655,-0.240512,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,4.0


In [26]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-0.088571,-0.261347,0.020471,-0.253333,-0.561910,-0.410655,-0.240512,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,4.0
1,0.569829,0.767937,-0.820565,-0.674459,-0.244847,-0.410655,-0.240512,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,5.0
2,-1.499427,-0.275456,0.501063,-0.121972,-0.561910,-0.410655,-0.240512,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0
3,0.099544,0.139415,-0.820565,0.206429,-0.561910,2.936697,0.170010,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,3.0
4,1.416343,-0.383289,0.621211,-0.527645,-0.244847,-0.410655,-0.240512,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13559,0.475772,0.484413,-1.301157,-0.813547,-0.561910,-0.410655,-0.240512,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,5.0
13560,-0.276685,-0.201888,-1.181009,-0.481282,-0.561910,-0.410655,-0.240512,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,4.0
13561,0.005486,1.068592,1.702543,0.156203,0.072215,-0.410655,-0.240512,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,4.0
13562,0.005486,-0.016456,0.501063,-0.886954,0.389277,-0.410655,-0.240512,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,5.0
