<a href="https://colab.research.google.com/github/Szinacska/CI-CD-Project/blob/main/Pickle_model_creation_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pickle

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Churn_Banking_Modeling_ENG.csv")

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 377369 entries, 0 to 377368
Data columns (total 43 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   customer_id                     377369 non-null  int64  
 1   flag_request_closure            377369 non-null  object 
 2   amt_cust_value                  377369 non-null  float64
 3   flag_online_acc_opening         377369 non-null  int64  
 4   flag_mult_account_ownership     377369 non-null  int64  
 5   num_age                         377362 non-null  float64
 6   str_prov_domicile               356181 non-null  object 
 7   str_prov_residence              357897 non-null  object 
 8   num_year_first_account          377369 non-null  float64
 9   str_profession                  341172 non-null  object 
 10  amt_income                      157476 non-null  object 
 11  str_cust_type                   377369 non-null  object 
 12  str_gender      

In [5]:
# Handle missing values
df.fillna(df.median(numeric_only=True), inplace=True)  # Fill numeric NaNs with median
for col in df.select_dtypes(include=['object']).columns:  # Fill categorical NaNs with mode
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [16]:
# Encode categorical variables
label_enc_cols = ['flag_request_closure']  # Target variable (yes/no)
df[label_enc_cols] = df[label_enc_cols].apply(lambda x: x.map({'yes': 1, 'no': 0}))

# Get a list of all object (string) columns except the target
# Modified to include all columns before encoding
all_cols = df.select_dtypes(include=['object']).columns.tolist()
# Now remove 'flag_request_closure'
if 'flag_request_closure' in all_cols:
    all_cols.remove('flag_request_closure')
# Proceed with one-hot encoding on the remaining columns
one_hot_cols = all_cols  # Now one_hot_cols contains the desired columns

# Apply one-hot encoding to all object (string) columns
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

In [17]:
# Define features (X) and target (y)
X = df.drop(columns=['flag_request_closure'])  # Features
y = df['flag_request_closure']  # Target variable

In [18]:
# Split into training and test sets (80% train, 20% test)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Check if 'flag_request_closure' is in categorical_cols before removing
if 'flag_request_closure' in categorical_cols:
    categorical_cols.remove('flag_request_closure')

In [23]:
# Impute missing values in the target variable before splitting
# Using the mode (most frequent value) for categorical target
# Check if mode is not empty before filling

# Check if train_y contains only NaNs, if so, fill with a default value
if train_y.isnull().all():
    train_y.fillna(0, inplace=True)  # Or any other suitable default value
# Otherwise, fill NaNs with the mode if the mode exists
elif train_y.isnull().any() and not train_y.mode().empty:
    train_y.fillna(train_y.mode()[0], inplace=True)

# Perform the same check and imputation for test_y
if test_y.isnull().all():
    test_y.fillna(0, inplace=True)  # Or any other suitable default value
elif test_y.isnull().any() and not test_y.mode().empty:
    test_y.fillna(test_y.mode()[0], inplace=True)

# Continue with training the model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(train_X, train_y)

In [27]:
import os

model_path = "/content/drive/MyDrive/models/churn_rf_model.pickle"
from google.colab import drive
drive.mount('/content/drive')

# Create the 'models' directory if it doesn't exist
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print(f"Model saved successfully at: {model_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved successfully at: /content/drive/MyDrive/models/churn_rf_model.pickle
