In [2]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
DATA_PATH = "../data/bank_train.csv"

df = pd.read_csv(DATA_PATH)
df.head(5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [9]:
import pickle
from sklearn.preprocessing import OneHotEncoder


class Config:
    categorical_features = ["Geography", "Gender"]
    encoder_path = "../artifacts/encoders/onehotencoder.pkl"

# Data preprocessing
class Preprocessor:
    
    def __init__(self, data: pd.DataFrame, inference_mode: bool):
        self.data = data
        self.inference_mode = inference_mode

    def __call__(self):
        encoder = self.get_encoder()
        self.encode_data(encoder=encoder)
        return self.data

    def get_encoder(self):
        if self.inference_mode:
            encoder = self._load_encoder()
        else:
            data_to_encode = self.data[Config.categorical_features]
            encoder = OneHotEncoder(handle_unknown="infrequent_if_exist")
            encoder.fit(data_to_encode)
            self._save_encoder(encoder=encoder)
        return encoder
    
    def encode_data(self, encoder):
        data_to_encode = self.data[Config.categorical_features]
        encoded_data = encoder.transform(data_to_encode)
        self.data[Config.categorical_features] = encoded_data

    def _load_encoder(self):
        with open(Config.encoder_path, 'r') as encoder_file:
            return pickle.load(encoder_file)
    
    def _save_encoder(self, encoder):
        with open(Config.encoder_path, 'w') as encoder_file:
            pickle.dump(encoder, encoder_file)


In [10]:
processor = Preprocessor(data=df, inference_mode=False)
processor()

FileNotFoundError: [Errno 2] No such file or directory: '../artifacts/encoders/onehotencoder.pkl'

In [18]:
train_df = df[["CreditScore", "Geography", "Gender", "Age", "Tenure", "Balance", "NumOfProducts", "HasCrCard", "IsActiveMember", "EstimatedSalary"]]
labels = df["Exited"]

x_train, x_test, y_train, y_test = train_test_split(train_df, labels, test_size=0.2)

x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)


clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y=y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.88      0.95      0.91     25906
           1       0.73      0.53      0.61      7101

    accuracy                           0.86     33007
   macro avg       0.80      0.74      0.76     33007
weighted avg       0.85      0.86      0.85     33007



: 

112022    0
4827      0
34392     0
39134     0
113294    1
         ..
136384    0
64890     0
56115     0
50217     0
61001     0
Name: Exited, Length: 132027, dtype: int64