# Importation des différentes bibliothèques

In [48]:
import pandas as pd
import numpy as np # Not always necessary
import matplotlib.pyplot as plt # Not always necessary
import seaborn as sns # Not always necessary
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LinearRegression
from sklearn.tree import plot_tree
import plotly.express as px


# Phase d'exploration

In [27]:
##Importation du Dataset
tip = pd.read_csv("tip.csv")

##Visualisation du Dataset
tip.head()

Unnamed: 0.1,Unnamed: 0,Age,Employment_Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,0,31,Government Sector,Yes,400000,6,1,No,No,0
1,1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


In [28]:
##Visualisation des statistiques principale du Dataset
tip.describe()

Unnamed: 0.1,Unnamed: 0,Age,AnnualIncome,FamilyMembers,ChronicDiseases,TravelInsurance
count,1987.0,1987.0,1987.0,1987.0,1987.0,1987.0
mean,993.0,29.650226,932763.0,4.752894,0.277806,0.357323
std,573.741812,2.913308,376855.7,1.60965,0.44803,0.479332
min,0.0,25.0,300000.0,2.0,0.0,0.0
25%,496.5,28.0,600000.0,4.0,0.0,0.0
50%,993.0,29.0,900000.0,5.0,0.0,0.0
75%,1489.5,32.0,1250000.0,6.0,1.0,1.0
max,1986.0,35.0,1800000.0,9.0,1.0,1.0


In [29]:
##Visualisation du nom des colonnes
tip.columns

Index(['Unnamed: 0', 'Age', 'Employment_Type', 'GraduateOrNot', 'AnnualIncome',
       'FamilyMembers', 'ChronicDiseases', 'FrequentFlyer',
       'EverTravelledAbroad', 'TravelInsurance'],
      dtype='object')

In [30]:
##Recherche si des valeurs sont manquantes

tip.isnull().any()

Unnamed: 0             False
Age                    False
Employment_Type        False
GraduateOrNot          False
AnnualIncome           False
FamilyMembers          False
ChronicDiseases        False
FrequentFlyer          False
EverTravelledAbroad    False
TravelInsurance        False
dtype: bool

In [31]:
##Visualisation des différentes informations de l'ensemble de données, comme le type de données (numérique, catégorielle), colonne "Dtype" comme les valeurs manquantes, colonne "Non-Null Count"

tip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987 entries, 0 to 1986
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           1987 non-null   int64 
 1   Age                  1987 non-null   int64 
 2   Employment_Type      1987 non-null   object
 3   GraduateOrNot        1987 non-null   object
 4   AnnualIncome         1987 non-null   int64 
 5   FamilyMembers        1987 non-null   int64 
 6   ChronicDiseases      1987 non-null   int64 
 7   FrequentFlyer        1987 non-null   object
 8   EverTravelledAbroad  1987 non-null   object
 9   TravelInsurance      1987 non-null   int64 
dtypes: int64(6), object(4)
memory usage: 155.4+ KB


In [32]:
##Recherche et visualisation des éventuels doublons

duplicateRows = tip[tip.duplicated()]  #recherche#
duplicateRows  #visualisation#

Unnamed: 0.1,Unnamed: 0,Age,Employment_Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance


# Phase de cleaning

In [33]:
##Suppression de la 1ere colonne "Unnamed: 0" qui n'apporte aucune info##
tip = tip.drop("Unnamed: 0", axis=1)

##Visualisation du Dataset après suppression de la colonne##
tip.head()

Unnamed: 0,Age,Employment_Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,400000,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,1250000,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,500000,4,1,No,No,1
3,28,Private Sector/Self Employed,Yes,700000,3,1,No,No,0
4,28,Private Sector/Self Employed,Yes,700000,8,1,Yes,No,0


In [34]:
##La colonne AnnualIncome étant en Roupies, conversion en Dollar
#Pour diviser les valeurs d'une colonne par un nombre
#(ici Roupies en Dollars / valeur arrétee au 13/03 = 82.82roupies=1dollar)
tip['AnnualIncome'] = tip['AnnualIncome'] / 82.82

##Visualisation du Dataset après changement de devise
tip.head()

Unnamed: 0,Age,Employment_Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,4829.751268,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,15092.972712,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,6037.189085,4,1,No,No,1
3,28,Private Sector/Self Employed,Yes,8452.064719,3,1,No,No,0
4,28,Private Sector/Self Employed,Yes,8452.064719,8,1,Yes,No,0


In [35]:
##Limiter à deux décimales la colonnes AnnualIncome et visualisation
tip['AnnualIncome'] = tip['AnnualIncome'].round(2)
tip.head()

Unnamed: 0,Age,Employment_Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,31,Government Sector,Yes,4829.75,6,1,No,No,0
1,31,Private Sector/Self Employed,Yes,15092.97,7,0,No,No,0
2,34,Private Sector/Self Employed,Yes,6037.19,4,1,No,No,1
3,28,Private Sector/Self Employed,Yes,8452.06,3,1,No,No,0
4,28,Private Sector/Self Employed,Yes,8452.06,8,1,Yes,No,0


# Visualisation du Dataset

In [36]:
#Visualisation de la proportion des personnes ayant souscrit une assurance voyage

px.pie(tip,
      names='TravelInsurance',
      color='TravelInsurance',
      color_discrete_map={1:'#E1812B', 0:'#377BA6'},
      title='Personne ayant souscrit à l assurance',
      width=1000,
     height=1000,
          )

In [37]:
#Visualisation de la souscription d'une assurance voyage en fonction du revenu annuel#

px.histogram(tip,
             x='AnnualIncome',
             color='TravelInsurance',
             color_discrete_map={1:'#E1812B', 0:'#377BA6'},
             title='Souscription d une assurance voyage en fonction du revenu annuel',
             width=1000,
             height=1000,
             )

In [38]:
#Visualisation de la souscription d'une assurance voyage en fonction du nombre de membres dans la famille

px.histogram(tip,
             x='FamilyMembers',
             color='TravelInsurance',
             color_discrete_map={1:'#E1812B', 0:'#377BA6'},
             title='Souscription d une assurance voyage en fonction du du nombre de membres dans la famille',
             width=1000,
             height=1000,
             )

In [39]:
#Visualisation du nombre de personnes ayant voyagé à l'étranger et ayant souscrit ou non à une assurance voyage

px.histogram(tip,
             x='EverTravelledAbroad',
             color='TravelInsurance',
             color_discrete_map={1:'#E1812B', 0:'#377BA6'},
             title='Nombre de personnes ayant voyagé à l étranger et ayant souscrit ou non à une assurance voyage',
             )

In [40]:
#Visualisation du nombre de personnes voyageant fréquemment ou pas et ayant souscrit ou non une assurance voyage

px.histogram(tip,
             x='FrequentFlyer',
             color='TravelInsurance',
             color_discrete_map={1:'#E1812B', 0:'#377BA6'},
             title='Nbre de personnes voyageant fréquemment ou pas et ayant souscrit ou non une assurance voyage',
             )

In [41]:
#Visualisation de la souscription d'une assurance voyage en fonction de l'âge des personnes#

px.histogram(tip,
             x='Age',
             color='TravelInsurance',
             color_discrete_map={1:'#E1812B', 0:'#377BA6'},
             title='Souscription d une assurance voyage en fonction de l âge des personnes',
             width=1000,
             height=1000,
             )