In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
tip = pd.read_csv("tip.csv")

## Regression linéaire

In [3]:
print("Separating labels from features...")
features_list = ["Age", "Employment_Type", "GraduateOrNot", "AnnualIncome", "FamilyMembers", "ChronicDiseases", "FrequentFlyer", "EverTravelledAbroad"]
X = tip.loc[:,features_list]
y = tip.loc[:,"TravelInsurance"]
print("...Done.")
print()

Separating labels from features...
...Done.



In [4]:
print("Splitting dataset into train set and test set...")
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0)
print("...Done.")

Splitting dataset into train set and test set...
...Done.


In [5]:
print("#### X_train BEFORE preprocessing ####")
print(X_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")
print()

numeric_features = [0,3,4,5]
numeric_transformer = StandardScaler()

categorical_features = [1,2,6,7] 
categorical_transformer = OneHotEncoder()

featureencoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),         
        ('num', numeric_transformer, numeric_features)
        ]
    )

X_train = featureencoder.fit_transform(X_train)
print("...Done.")
print("#### X_train AFTER preprocessing ####")
print(X_train[0:5,:]) # print first 5 rows (not using iloc since now X_train became a numpy array)
print()

#### X_train BEFORE preprocessing ####
      Age               Employment_Type GraduateOrNot  AnnualIncome  \
159    27             Government Sector           Yes        900000   
799    31             Government Sector           Yes       1200000   
318    34             Government Sector           Yes       1100000   
1399   25  Private Sector/Self Employed            No        600000   
465    26  Private Sector/Self Employed           Yes       1500000   

      FamilyMembers  ChronicDiseases FrequentFlyer EverTravelledAbroad  
159               7                1            No                  No  
799               3                0            No                  No  
318               3                0            No                  No  
1399              3                1            No                  No  
465               5                0           Yes                 Yes  

Encoding categorical features and standardizing numerical features...

...Done.
#### X_train AF

In [6]:
    print("Train model...")
    regressor = LinearRegression()
    regressor.fit(X_train, y_train) # This steps is the actual training !
    print("...Done.")

Train model...
...Done.


In [7]:
print("--- Testing pipeline ---")

# Standardizing numeric features
print("Standardizing numerical features...")
print(X_test)
print()

X_test = featureencoder.transform(X_test)

print("...Done.")
print(X_test[:5]) # print first 5 rows (not using iloc since now X_test became a numpy array)
print()

# Predictions on test set
print("Predictions on test set...")
y_test_pred = regressor.predict(X_test)
print("...Done.")
print(y_test_pred[:5])
print()

--- Testing pipeline ---
Standardizing numerical features...
      Age               Employment_Type GraduateOrNot  AnnualIncome  \
1291   34  Private Sector/Self Employed           Yes       1500000   
1199   28             Government Sector           Yes       1200000   
1756   29  Private Sector/Self Employed           Yes        900000   
107    34  Private Sector/Self Employed           Yes        850000   
655    28  Private Sector/Self Employed           Yes        700000   
...   ...                           ...           ...           ...   
360    28  Private Sector/Self Employed            No        550000   
1799   26             Government Sector           Yes       1450000   
987    33  Private Sector/Self Employed           Yes       1400000   
692    31             Government Sector            No        300000   
1493   32             Government Sector           Yes        900000   

      FamilyMembers  ChronicDiseases FrequentFlyer EverTravelledAbroad  
1291         

In [8]:
# Performance assessment
print("--- Assessing the performances of the model ---")

# Print R^2 scores
print("R2 score on training set : ", regressor.score(X_train, y_train))
print("R2 score on test set : ", regressor.score(X_test, y_test))

--- Assessing the performances of the model ---
R2 score on training set :  0.2491294073264254
R2 score on test set :  0.254404272380789
