## Etude de cas Fortuneo

Une compagnie d’assurance santé va lancer un nouveau produit d’assurance automobile. Ils
ont besoin de votre aide pour créer un modèle permettant de prédire si les clients seront
intéressés par l'assurance automobile fournie par l’entreprise.
La construction du modèle de prédiction d’appétence d’assurance auto a un enjeu important
pour l'entreprise car elle permet de planifier en conséque

### Chargement des librairies

In [2]:
# librairies classiques

import numpy as np 
import pandas as pd
import warnings
import matplotlib.pyplot as plt 
warnings.filterwarnings('ignore')

# Librairies pour apprentissage 
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Librairies pour faire du undersampling et du  oversampling
from collections import Counter
from imblearn.combine import SMOTEENN

### Chargement de la base de données et recodage des variables 

In [2]:
# Database 

df = pd.read_csv('data_train.csv', sep=',', decimal='.')
df = df.drop(columns=['id','Region_Code'])
df

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Female,45,1,0,1-2 Year,Yes,21406.0,124.0,22,0
1,Male,25,1,1,< 1 Year,No,58078.0,152.0,86,0
2,Female,57,1,1,1-2 Year,No,2630.0,26.0,214,0
3,Male,50,1,0,1-2 Year,Yes,2630.0,163.0,61,0
4,Female,21,1,0,< 1 Year,Yes,44589.0,152.0,131,0
...,...,...,...,...,...,...,...,...,...,...
346071,Female,37,1,0,1-2 Year,Yes,2630.0,156.0,218,0
346072,Male,54,1,0,> 2 Years,Yes,36962.0,122.0,36,0
346073,Female,25,1,1,< 1 Year,No,37004.0,152.0,294,0
346074,Male,22,1,1,< 1 Year,No,40338.0,152.0,67,0


In [3]:
#Recoding features 

df['Gender'] = df['Gender'].map({'Male': 1,'Female': 0})
df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1,'No': 0})
df['Vehicle_Age'] = df['Vehicle_Age'].map({'< 1 Year': 1,'1-2 Year': 2,'> 2 Years': 3})

In [4]:
#Data information 

display(df.shape)
display(df.head(10))

(346076, 10)

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,45,1,0,2,1,21406.0,124.0,22,0
1,1,25,1,1,1,0,58078.0,152.0,86,0
2,0,57,1,1,2,0,2630.0,26.0,214,0
3,1,50,1,0,2,1,2630.0,163.0,61,0
4,0,21,1,0,1,1,44589.0,152.0,131,0
5,0,31,1,1,1,0,31177.0,152.0,151,0
6,1,52,1,0,2,1,25708.0,156.0,295,0
7,1,42,1,0,2,1,33257.0,154.0,272,0
8,1,43,1,0,2,1,23433.0,152.0,252,0
9,1,32,1,0,1,1,2630.0,156.0,270,0


In [5]:
X = df.iloc[:,0:9]
Y = df.iloc[:, 9]

In [6]:
print("Poucentage de Non Favorable: {0:3f} %".format(100*sum(Y==0) / len(Y)))
print("Poucentage de Favorable: {0:3f} %".format(100*sum(Y==1) / len(Y)))

Poucentage de Non Favorable: 96.625886 %
Poucentage de Favorable: 3.374114 %


In [7]:
#summarize class distribution
print(Counter(Y))

Counter({0: 334399, 1: 11677})


##### Définition d'une fonction permettant d'equilibrer notre base de données.

In [15]:
def Transformation(X,Y):
    #define sampling strategy
    sample = SMOTEENN(sampling_strategy=0.5)
    #fit and apply the transform
    X_over, y_over = sample.fit_resample(X, Y)
    return(X_over, y_over)

In [16]:
X_over, y_over=Transformation(X,Y)

In [17]:
X_over

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,0,45,1,0,2,1,21406.000000,124.000000,22
1,1,25,1,1,1,0,58078.000000,152.000000,86
2,0,57,1,1,2,0,2630.000000,26.000000,214
3,1,50,1,0,2,1,2630.000000,163.000000,61
4,0,21,1,0,1,1,44589.000000,152.000000,131
...,...,...,...,...,...,...,...,...,...
380888,1,63,1,0,2,1,31746.372888,10.686444,149
380889,1,53,1,0,2,1,51107.317380,26.000000,281
380890,1,46,1,0,2,1,2630.000000,155.310817,65
380891,0,29,1,0,1,1,22983.858413,124.000000,260


In [18]:
#summarize class distribution
print(Counter(y_over))

Counter({0: 255580, 1: 125313})


#### Definition d'une fonction pour comparer plusieurs classifiers

In [19]:
#Def function classifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold,cross_val_score
import time

clfs = {
    'LR': LogisticRegression(solver='liblinear'),
    'ADA': AdaBoostClassifier(n_estimators=100,random_state=1),
    'ET': ExtraTreesClassifier(n_estimators=100,random_state=1),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'CART': DecisionTreeClassifier(criterion='gini',random_state=1),
    'ID3': DecisionTreeClassifier(criterion='entropy',random_state=1),
    'Stumb': DecisionTreeClassifier(criterion='gini',max_depth=1,random_state=1)
}

def run_classifieurs(X,Y,clfs):
    kf = KFold(n_splits=10, shuffle=True, random_state=1) 
    for i in clfs:
        clf = clfs[i]
        debut=time.time()
        cv_acc = cross_val_score(clf, X, Y, cv=kf,scoring='precision')
        cv_acc1 = cross_val_score(clf, X, Y, cv=kf,scoring='accuracy')

        fin=time.time()
        print("Precision for {0} is: {1:.3f} +/- {2:.3f} en {3:.3f}s".format(i,
                                                                            np.mean(cv_acc),
                                                                            np.std(cv_acc),fin-debut))
        print("Accuracy for {0} is: {1:.3f} +/- {2:.3f} en {3:.3f}s".format(i,
                                                                            np.mean(cv_acc1),
                                                                            np.std(cv_acc1),fin-debut))
        print('************************************************************************************************')

#### Test pour choisir les meilleurs classifiers

In [20]:
run_classifieurs(X_over, y_over,clfs)

Precision for LR is: 0.628 +/- 0.014 en 39.504s
Accuracy for LR is: 0.762 +/- 0.009 en 39.504s
************************************************************************************************
Precision for ADA is: 0.744 +/- 0.004 en 442.995s
Accuracy for ADA is: 0.859 +/- 0.002 en 442.995s
************************************************************************************************
Precision for ET is: 0.869 +/- 0.004 en 567.255s
Accuracy for ET is: 0.925 +/- 0.002 en 567.255s
************************************************************************************************
Precision for KNN is: 0.922 +/- 0.002 en 54.323s
Accuracy for KNN is: 0.967 +/- 0.001 en 54.323s
************************************************************************************************
Precision for CART is: 0.887 +/- 0.003 en 30.173s
Accuracy for CART is: 0.935 +/- 0.002 en 30.173s
************************************************************************************************
Precision for ID3 is: 0.890 

#### Evaluation des meilleures classifiers 

In [21]:
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X_over, y_over,test_size=0.3,random_state=1)

In [22]:
def classifieurs(Xtrain,Xtest,Ytrain,Ytest):
    print('****************KNN****************')
    KNN=KNeighborsClassifier(n_neighbors=5)
    KNN.fit(Xtrain,Ytrain)
    Y_KNN=KNN.predict(Xtest)

    display(confusion_matrix(Ytest,Y_KNN))
    print('Accuracy {0:.2f}%'.format(accuracy_score(Ytest,Y_KNN)*100))
    print('Précision {0:.2f}%'.format(precision_score(Ytest,Y_KNN)*100))
    print('Recall {0:.2f}%'.format(recall_score(Ytest,Y_KNN)*100))
    print('f1 {0:.2f}%'.format(f1_score(Ytest,Y_KNN)*100))
    
    print('****************ADA****************')
    ADA=AdaBoostClassifier(n_estimators=100,random_state=1)
    ADA.fit(Xtrain,Ytrain)
    Y_ADA=ADA.predict(Xtest)

    display(confusion_matrix(Ytest,Y_ADA))
    print('Accuracy {0:.2f}%'.format(accuracy_score(Ytest,Y_ADA)*100))
    print('Précision {0:.2f}%'.format(precision_score(Ytest,Y_ADA)*100))
    print('Recall {0:.2f}%'.format(recall_score(Ytest,Y_ADA)*100))
    print('f1 {0:.2f}%'.format(f1_score(Ytest,Y_ADA)*100))

    print('****************ET****************')
    ET=ExtraTreesClassifier(n_estimators=100,random_state=1)
    ET.fit(Xtrain,Ytrain)
    Y_ET=ET.predict(Xtest)

    display(confusion_matrix(Ytest,Y_ET))
    print('Accuracy {0:.2f}%'.format(accuracy_score(Ytest,Y_ET)*100))
    print('Précision {0:.2f}%'.format(precision_score(Ytest,Y_ET)*100))
    print('Recall {0:.2f}%'.format(recall_score(Ytest,Y_ET)*100))
    print('f1 {0:.2f}%'.format(f1_score(Ytest,Y_ET)*100))

    print('****************CART****************')
    DT=DecisionTreeClassifier(criterion='gini',random_state=1)
    DT.fit(Xtrain,Ytrain)
    Y_DT=DT.predict(Xtest)

    display(confusion_matrix(Ytest,Y_DT))
    print('Accuracy {0:.2f}%'.format(accuracy_score(Ytest,Y_DT)*100))
    print('Précision {0:.2f}%'.format(precision_score(Ytest,Y_DT)*100))
    print('Recall {0:.2f}%'.format(recall_score(Ytest,Y_DT)*100))
    print('f1 {0:.2f}%'.format(f1_score(Ytest,Y_DT)*100))
    
    print('****************ID3****************')
    ID3=DecisionTreeClassifier(criterion='entropy',random_state=1)
    ID3.fit(Xtrain,Ytrain)
    Y_ID3=ID3.predict(Xtest)

    display(confusion_matrix(Ytest,Y_ID3))
    print('Accuracy {0:.2f}%'.format(accuracy_score(Ytest,Y_ID3)*100))
    print('Précision {0:.2f}%'.format(precision_score(Ytest,Y_ID3)*100))
    print('Recall {0:.2f}%'.format(recall_score(Ytest,Y_ID3)*100))
    print('f1 {0:.2f}%'.format(f1_score(Ytest,Y_ID3)*100))

In [23]:
classifieurs(Xtrain,Xtest,Ytrain,Ytest)

****************KNN****************


array([[72197,  4457],
       [ 1077, 36537]], dtype=int64)

Accuracy 95.16%
Précision 89.13%
Recall 97.14%
f1 92.96%
****************ADA****************


array([[65840, 10814],
       [ 5250, 32364]], dtype=int64)

Accuracy 85.94%
Précision 74.95%
Recall 86.04%
f1 80.12%
****************ET****************


array([[71129,  5525],
       [ 3776, 33838]], dtype=int64)

Accuracy 91.86%
Précision 85.96%
Recall 89.96%
f1 87.92%
****************CART****************


array([[71754,  4900],
       [ 3403, 34211]], dtype=int64)

Accuracy 92.73%
Précision 87.47%
Recall 90.95%
f1 89.18%
****************ID3****************


array([[71899,  4755],
       [ 3355, 34259]], dtype=int64)

Accuracy 92.90%
Précision 87.81%
Recall 91.08%
f1 89.42%


#### Creation de Pipeline

In [4]:
from sklearn.pipeline import Pipeline
import pickle

In [25]:
P = Pipeline([('classifieur',KNeighborsClassifier(n_neighbors=5))])
P.fit(X_over, y_over)

pickle.dump(P,open('Scoring.pkl','wb'))

#### Validation sur les données de test 

In [26]:
#Données de test
df_test = pd.read_csv('data_test.csv', sep=',', decimal='.')
df_test = df_test.drop(columns=['id','Region_Code'])
df_test

df_test['Gender'] = df_test['Gender'].map({'Male': 1,'Female': 0})
df_test['Vehicle_Damage'] = df_test['Vehicle_Damage'].map({'Yes': 1,'No': 0})
df_test['Vehicle_Age'] = df_test['Vehicle_Age'].map({'< 1 Year': 1,'1-2 Year': 2,'> 2 Years': 3})


In [16]:
f=[[1,41,1,1,2,0,32379.0,124.0,277]]

In [17]:
P=pickle.load(open('Scoring.pkl','rb'))
Reponses=P.predict(f)
#P.predict_proba(df_test.values)

In [18]:
Reponses

array([0], dtype=int64)

In [29]:
np.unique(Reponses,return_counts=True)

(array([0, 1], dtype=int64), array([99499, 27538], dtype=int64))

In [30]:
Reponses=list(Reponses)

In [31]:
df_test['Response']=Reponses

In [32]:
df_test

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,41,1,1,2,0,32379.0,124.0,277,0
1,0,35,1,1,2,0,2630.0,160.0,61,0
2,0,25,1,0,1,0,2630.0,152.0,242,0
3,1,53,1,0,2,1,36242.0,163.0,253,0
4,1,27,1,1,1,0,38770.0,152.0,125,0
...,...,...,...,...,...,...,...,...,...,...
127032,1,25,1,1,1,0,36453.0,152.0,61,1
127033,1,56,1,0,2,1,35507.0,26.0,136,0
127034,0,35,1,0,2,1,29041.0,124.0,35,0
127035,1,55,1,1,2,0,63775.0,124.0,284,0


In [36]:
df_test.to_csv('Liste des clients avec prediction.csv')