### Bibliotheque 

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

### Visualisation

In [2]:
data = pd.read_csv('churn-bigml-80.csv')

In [3]:
data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
data.describe()

Unnamed: 0,Account length,Area code,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
count,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0,2666.0
mean,100.620405,437.43886,8.021755,179.48162,100.310203,30.512404,200.386159,100.023631,17.033072,201.168942,100.106152,9.052689,10.237022,4.467367,2.76449,1.562641
std,39.563974,42.521018,13.612277,54.21035,19.988162,9.215733,50.951515,20.161445,4.330864,50.780323,19.418459,2.28512,2.788349,2.456195,0.752812,1.311236
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.7,33.0,1.97,0.0,0.0,0.0,0.0
25%,73.0,408.0,0.0,143.4,87.0,24.38,165.3,87.0,14.05,166.925,87.0,7.5125,8.5,3.0,2.3,1.0
50%,100.0,415.0,0.0,179.95,101.0,30.59,200.9,100.0,17.08,201.15,100.0,9.05,10.2,4.0,2.75,1.0
75%,127.0,510.0,19.0,215.9,114.0,36.7,235.1,114.0,19.98,236.475,113.0,10.64,12.1,6.0,3.27,2.0
max,243.0,510.0,50.0,350.8,160.0,59.64,363.7,170.0,30.91,395.0,166.0,17.77,20.0,20.0,5.4,9.0


In [5]:
df = data.copy()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls       2666 non-null   int64  
 14  Total night charge      2666 non-null   

In [7]:
df.shape

(2666, 20)

In [8]:
df.dtypes.value_counts()

int64      8
float64    8
object     3
bool       1
Name: count, dtype: int64

In [9]:
df.duplicated().sum()

0

###  Preprocessing

In [19]:
label_encoder = LabelEncoder()

In [20]:
label_encoder = LabelEncoder()
df['International plan'] = label_encoder.fit_transform(df['International plan'])
df['Voice mail plan'] = label_encoder.fit_transform(df['Voice mail plan'])
df['State'] = label_encoder.fit_transform(df['State'])
df['Churn'] = df['Churn'].astype(int)
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,16,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,35,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,31,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,35,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,36,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


### Train

In [21]:
# Préparation des données pour le modèle
X = df.drop(columns=['Churn','State'])
y = df['Churn']

In [22]:
X_train, X_val, y_train, y_val = train_test_split(X, y,  train_size = 0.8, random_state = 42)

### Random forest 

In [14]:
k_values = [3, 5, 7, 10]

param_grid = {
    'min_samples_split': [2, 10, 30, 50, 100, 200, 300, 700],
    'max_depth': [2, 4, 8, 16, 32, 64, None],
    'n_estimators': [10, 50, 100, 500]
}

best_models = {}

for cv in k_values:
    print(f'CV = {cv}')
    
    grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                               param_grid=param_grid,
                               cv=cv,
                               n_jobs=-1,
                               verbose=2)
    
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    
    best_models[cv] = best_model
    
    print(f'Meilleurs paramètres : {grid_search.best_params_}')
    print(f'Meilleur score : {grid_search.best_score_}')


CV = 3
Fitting 3 folds for each of 224 candidates, totalling 672 fits
Meilleurs paramètres : {'max_depth': 16, 'min_samples_split': 10, 'n_estimators': 500}
Meilleur score : 0.9512172896733424
CV = 5
Fitting 5 folds for each of 224 candidates, totalling 1120 fits
Meilleurs paramètres : {'max_depth': 16, 'min_samples_split': 2, 'n_estimators': 50}
Meilleur score : 0.954501874635793
CV = 7
Fitting 7 folds for each of 224 candidates, totalling 1568 fits
Meilleurs paramètres : {'max_depth': 16, 'min_samples_split': 10, 'n_estimators': 500}
Meilleur score : 0.9526331196844572
CV = 10
Fitting 10 folds for each of 224 candidates, totalling 2240 fits
Meilleurs paramètres : {'max_depth': 16, 'min_samples_split': 2, 'n_estimators': 100}
Meilleur score : 0.9554495195471897


In [15]:
random_model = best_models[10]

In [16]:
y_pred = random_model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'Précision sur l\'ensemble de validation : {accuracy}')

Précision sur l'ensemble de validation : 0.951310861423221


In [24]:
from sklearn.metrics import classification_report

for cv, best_model in best_models.items():
    y_pred = best_model.predict(X_val)
    print(f'Classification Report for CV = {cv}:')
    print(classification_report(y_val, y_pred))
    print('----------------------------------------')

Classification Report for CV = 3:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       455
           1       0.98      0.67      0.80        79

    accuracy                           0.95       534
   macro avg       0.96      0.83      0.88       534
weighted avg       0.95      0.95      0.95       534

----------------------------------------
Classification Report for CV = 5:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       455
           1       0.96      0.70      0.81        79

    accuracy                           0.95       534
   macro avg       0.96      0.85      0.89       534
weighted avg       0.95      0.95      0.95       534

----------------------------------------
Classification Report for CV = 7:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       455
           1       0.98      0.67      0.80        79

In [15]:
import pickle
# pickle.dump(random_model, open('random_model.pkl', 'wb'))
# pickle.dump(best_models[3], open('random_model_cv_3.pkl', 'wb'))
# pickle.dump(best_models[5], open('random_model_cv_5.pkl', 'wb'))
# pickle.dump(best_models[7], open('random_model_cv_7.pkl', 'wb'))

In [16]:
model = pickle.load(open('random_model.pkl', 'rb'))

In [17]:
y_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'Précision sur l\'ensemble de validation : {accuracy}')

Précision sur l'ensemble de validation : 0.951310861423221


### SVM 

In [18]:
from sklearn.svm import SVC

param_grid = {'C': [0.01, 0.1, 1, 10, 100],
              'kernel': ['linear', 'sigmoid'],
              'gamma': ['scale', 'auto']}

svm = SVC(random_state=42)

grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_svm = grid_search.best_estimator_

print(f'Meilleurs paramètres : {grid_search.best_params_}')
print(f'Meilleur score : {grid_search.best_score_}')


Meilleurs paramètres : {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Meilleur score : 0.8602214379171202


In [19]:
y_pred = best_svm.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f'Précision sur l\'ensemble de validation : {accuracy}')

Précision sur l'ensemble de validation : 0.8689138576779026


In [26]:
from sklearn.metrics import classification_report
print(f'Classification Report for CV = {5}:')
print(classification_report(y_val, y_pred))
print('----------------------------------------')

Classification Report for CV = 5:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       455
           1       0.70      0.20      0.31        79

    accuracy                           0.87       534
   macro avg       0.79      0.59      0.62       534
weighted avg       0.85      0.87      0.84       534

----------------------------------------


In [20]:
from joblib import dump, load

dump(best_svm, 'svm_model.joblib')

['svm_model.joblib']

In [42]:
model_load = load('svm_model.joblib')

In [43]:
y_pred = model_load.predict(X_text)

accuracy = accuracy_score(y_test, y_pred)
print(f'Précision sur l\'ensemble de validation : {accuracy}')

NameError: name 'X_text' is not defined

### Gradient boosting

In [27]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10, 30],
    'min_samples_leaf': [1, 5, 10, 30]
}

gb = GradientBoostingClassifier()

grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

gb_best = grid_search.best_estimator_


Best Parameters:  {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best Score:  0.9596661938846192


In [11]:
from joblib import dump, load
# dump(gb_best, 'gb_model.joblib')

In [13]:
model_load = load('gb_model.joblib')

In [23]:
y_pred = model_load.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.9550561797752809


### Test avec 20%

In [38]:
test_df = pd.read_csv('churn-bigml-20.csv')
data = test_df.copy()

In [39]:
label_encoder = LabelEncoder()
data['International plan'] = label_encoder.fit_transform(data['International plan'])
data['Voice mail plan'] = label_encoder.fit_transform(data['Voice mail plan'])
data['State'] = label_encoder.fit_transform(data['State'])
data['Churn'] = data['Churn'].astype(int)
# Préparation des données pour le modèle
X_test = data.drop(columns=['Churn','State'])
y_test = data['Churn']

In [40]:
X_test.shape

(667, 18)

In [41]:
y_pred = model_load.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.9535232383808095
