# Training a Decision Tree model and saving it for export to Odoo

In [2]:
from ipyfilechooser import FileChooser

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

## Loading the data

In [3]:
# The data can be downloaded from https://www.kaggle.com/datasets/shrutimechlearn/churn-modelling/data
# For the model to work correctly with Odoo, the country names (the Geography column)
# should be replaced with the codes ES, DE and FR before loading the file.
# Show an interactive file picker; the chosen path is read back through `fc.selected`.
fc = FileChooser()
display(fc)

FileChooser(path='/home/luis/Desarrollo/docencia/SIEA/odoo_addons_tutorial/ml_test/dt_training', filename='', …

In [20]:
# Use the file picked in the chooser; `fc.selected` is None when nothing was
# chosen (e.g. on a non-interactive "Restart & Run All"), so fall back to the
# dataset's default file name in the working directory.
file_path = fc.selected or "Churn_Modelling.csv"
# Drop the identifier columns; they are not used as model features.
data = pd.read_csv(file_path).drop(columns=["RowNumber", "CustomerId", "Surname"])
# Normalise gender labels to lower case.
# NOTE(review): presumably this matches the values sent from Odoo - confirm.
data["Gender"] = data["Gender"].str.lower()
# Features and target as NumPy arrays: the pipeline selects columns by position,
# so the column order of `data` (minus "Exited") defines the model's input layout.
X = data.drop(columns=["Exited"]).to_numpy()
y = data["Exited"].to_numpy()
data


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,FR,female,42,2,0.00,1,1,1,101348.88,1
1,608,ES,female,41,1,83807.86,1,0,1,112542.58,0
2,502,FR,female,42,8,159660.80,3,1,0,113931.57,1
3,699,FR,female,39,1,0.00,2,0,0,93826.63,0
4,850,ES,female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,FR,male,39,5,0.00,2,1,0,96270.64,0
9996,516,FR,male,35,10,57369.61,1,1,1,101699.77,0
9997,709,FR,female,36,7,0.00,1,0,1,42085.58,1
9998,772,DE,male,42,3,75075.31,2,1,0,92888.52,1


## Preparing the pipeline

In [22]:
# Fixed seed: the tree shuffles candidate features at every split, so without it
# two runs on the same data can produce different exported models.
RANDOM_STATE = 42
# Every leaf must hold at least this many training samples.
MIN_SAMPLES_LEAF = 50

# NOTE(review): OneHotEncoder defaults to handle_unknown="error", so predicting on a
# country or gender value not present in the training data raises - confirm that
# this is the behaviour wanted on the Odoo side.
geography_e = OneHotEncoder()
gender_e = OneHotEncoder()
clf = DecisionTreeClassifier(min_samples_leaf=MIN_SAMPLES_LEAF,
                             random_state=RANDOM_STATE)
pass_features = ["CreditScore", "Age", "Tenure", "Balance",
                 "NumOfProducts", "HasCrCard", "IsActiveMember",
                 "EstimatedSalary"]

# X is `data` without the target column, so positions must be resolved against
# the feature columns only; otherwise they are correct just by the accident of
# "Exited" being the last column.
feature_columns = data.columns.drop("Exited")
pass_indices = [feature_columns.get_loc(feature) for feature in pass_features]

# One-hot encode the two categorical columns and forward the numeric ones untouched.
# Columns are addressed by integer position because X is a plain NumPy array.
column_trans = ColumnTransformer([("Geography Encoder", geography_e, [feature_columns.get_loc("Geography")]),
                                  ("Gender Encoder", gender_e, [feature_columns.get_loc("Gender")]),
                                  ("Pass the others", "passthrough", pass_indices)])

# Preprocessing and classifier bundled together, so the saved artefact accepts raw rows.
pipe = Pipeline([("Preprocessing", column_trans), ("clf", clf)])

## Evaluating the model

In [23]:
# Cross-validate the whole pipeline (encoders are re-fitted inside every fold)
# using cross_val_score's default splitting and the classifier's default scorer.
scores = cross_val_score(pipe, X, y)
# Per-fold scores, followed by their mean and standard deviation.
(scores, scores.mean(), scores.std())

(array([0.8485, 0.857 , 0.843 , 0.8545, 0.8455]), 0.8497, 0.005297169055259615)

## Training the model with all the data and saving it into a file

In [24]:
# Fit on the full dataset; `fit` trains `pipe` in place and returns that same object.
fitted_pipe = pipe.fit(X, y)
# Serialise preprocessing + classifier as a single file for loading elsewhere.
joblib.dump(fitted_pipe, "client_exit_classifier.pkl")

['client_exit_classifier.pkl']