# Modele ML - Prédictions des sessions - Cultivar - Random Forest

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Formatage des données

In [50]:
df_session = pd.read_csv(r"C:\Users\Sarah\Desktop\Formation Databird\Projet\session_complete_ML.csv")

In [51]:
df_session['date'] = pd.to_datetime(df_session['date'])

In [103]:
df_session

Unnamed: 0,site,date,channel_grouping,device,sum(sessions)
0,Arboriculture Fruitière,2023-01-01,Direct,desktop,1
1,Arboriculture Fruitière,2023-01-01,Organic Search,desktop,6
2,Arboriculture Fruitière,2023-01-02,Direct,desktop,7
3,Arboriculture Fruitière,2023-01-02,Direct,tablet,1
4,Arboriculture Fruitière,2023-01-02,Organic Search,desktop,21
...,...,...,...,...,...
7744,Mon viti,2023-06-17,Organic Search,mobile,66
7745,Mon viti,2023-06-17,Organic Search,tablet,3
7746,Mon viti,2023-06-17,Referral,desktop,2
7747,Mon viti,2023-06-17,Referral,mobile,2


Isoler les donnnées pour Cultivar

In [55]:
df_cultivar = df_session[df_session['site'] == 'Cultivar']

In [58]:
df_cultivar

Unnamed: 0,site,date,channel_grouping,device,sum(sessions)
1029,Cultivar,2023-01-01,Direct,desktop,4
1030,Cultivar,2023-01-01,Direct,mobile,20
1031,Cultivar,2023-01-01,Email,desktop,2
1032,Cultivar,2023-01-01,Email,mobile,14
1033,Cultivar,2023-01-01,Email,tablet,6
...,...,...,...,...,...
3030,Cultivar,2023-06-17,Organic Search,mobile,61
3031,Cultivar,2023-06-17,Organic Search,tablet,1
3032,Cultivar,2023-06-17,Referral,desktop,1
3033,Cultivar,2023-06-17,Social,desktop,1


Normalisation des données

In [59]:
from sklearn.preprocessing import StandardScaler
standard_scal = StandardScaler()

In [60]:
#Conversion des jour en 1, 2, 3..., 365 Pour normaliser les dates (Attention possible car on est que sur l'année 2023)
df_cultivar['day_of_year'] = df_cultivar['date'].dt.day_of_year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cultivar['day_of_year'] = df_cultivar['date'].dt.day_of_year


In [64]:
df_cultivar = df_cultivar.drop(['date'], axis = 1)

In [65]:
df_cultivar

Unnamed: 0,site,channel_grouping,device,sum(sessions),day_of_year
1029,Cultivar,Direct,desktop,4,1
1030,Cultivar,Direct,mobile,20,1
1031,Cultivar,Email,desktop,2,1
1032,Cultivar,Email,mobile,14,1
1033,Cultivar,Email,tablet,6,1
...,...,...,...,...,...
3030,Cultivar,Organic Search,mobile,61,168
3031,Cultivar,Organic Search,tablet,1,168
3032,Cultivar,Referral,desktop,1,168
3033,Cultivar,Social,desktop,1,168


In [66]:
# select only numeric variables
num_cols = df_cultivar.select_dtypes(include="number").drop(columns='sum(sessions)').columns
print(num_cols)

Index(['day_of_year'], dtype='object')


In [67]:
df_cultivar_num = df_cultivar.copy()
df_cultivar_num = df_cultivar_num[num_cols]
print(num_cols)

Index(['day_of_year'], dtype='object')


In [68]:
df_cultivar[num_cols] = standard_scal.fit_transform(df_cultivar_num.values)

Encodage des variables catégorielles

In [69]:
categorical_variables = ["site", "channel_grouping", "device"]

In [71]:
# encoding des catégories de "site"
ohe_site = pd.get_dummies(df_cultivar["site"], prefix="site")
df_cultivar = pd.concat([df_cultivar, ohe_site], axis=1)

In [72]:
# encoding des catégories de "channel_grouping"
ohe_channel_grouping = pd.get_dummies(df_cultivar["channel_grouping"], prefix="channel")
df_cultivar = pd.concat([df_cultivar, ohe_channel_grouping], axis=1)

In [73]:
# encoding des catégories de "device"
ohe_device = pd.get_dummies(df_cultivar["device"], prefix="device")
df_cultivar = pd.concat([df_cultivar, ohe_device], axis=1)

In [76]:
cols_to_drop = ['site', 'channel_grouping', 'device']

In [77]:
df_cultivar = df_cultivar.drop(columns=cols_to_drop)

In [78]:
df_cultivar.dropna()

Unnamed: 0,sum(sessions),day_of_year,site_Cultivar,channel_Autopromotion,channel_Direct,channel_Email,channel_Googles News,channel_Organic Search,channel_Referral,channel_Social,device_desktop,device_mobile,device_smart tv,device_tablet
1029,4,-1.736570,1,0,1,0,0,0,0,0,1,0,0,0
1030,20,-1.736570,1,0,1,0,0,0,0,0,0,1,0,0
1031,2,-1.736570,1,0,0,1,0,0,0,0,1,0,0,0
1032,14,-1.736570,1,0,0,1,0,0,0,0,0,1,0,0
1033,6,-1.736570,1,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3030,61,1.714402,1,0,0,0,0,1,0,0,0,1,0,0
3031,1,1.714402,1,0,0,0,0,1,0,0,0,0,0,1
3032,1,1.714402,1,0,0,0,0,0,1,0,1,0,0,0
3033,1,1.714402,1,0,0,0,0,0,0,1,1,0,0,0


FIT - Modele Random Forest

In [79]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [80]:
X = df_cultivar.drop("sum(sessions)", axis=1)
y = df_cultivar["sum(sessions)"]

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [82]:
rf_model = RandomForestRegressor(n_estimators=100)  # Vous pouvez spécifier le nombre souhaité d'estimateurs (arbres)

In [83]:
rf_model.fit(X_train, y_train)

In [84]:
y_pred = rf_model.predict(X_test)

In [85]:
# on évalue les prédictions avec le R2
score = r2_score(y_test, y_pred)
print(f"Le R2 est de {score:.1%}!")

Le R2 est de 72.4%!


In [86]:
rmse = mean_squared_error(y_test, y_pred, squared=False)

In [88]:
np.sqrt(rmse)

4.560227628895767

In [89]:
y_test

2834     72
1555     33
1422      3
2430     65
1462     31
       ... 
2909      4
1563    101
2019     33
1796     45
2389    118
Name: sum(sessions), Length: 502, dtype: int64

In [90]:
pd.Series(y_pred)

0      69.46
1      33.02
2       3.52
3      45.92
4      28.33
       ...  
497     2.01
498    80.77
499    72.07
500    28.44
501    99.88
Length: 502, dtype: float64

Génération d'un nouveau DataFrame Xnew pour lancer la prédiction

In [183]:
index = range(169, 366)

In [184]:
df = pd.DataFrame(index=index)

In [185]:
df

169
170
171
172
173
...
361
362
363
364
365


In [186]:
site = ['Cultivar']

In [187]:
channel_grouping = ['Direct', 'Organic Search', 'Social', 'Referral', 'Email', 'Googles News', 'Autopromotion']

In [188]:
device = ['desktop', 'tablet', 'mobile', 'smart tv']

In [189]:
from itertools import product

In [190]:
# Calcul du produit cartésien
result = list(product(index, site, channel_grouping, device))

In [191]:
df_prediction_cultivar = pd.DataFrame(result)

In [192]:
df_prediction_cultivar.rename(columns={0: 'day_of_year', 1: 'site', 2: 'channel_grouping', 3: 'device'}, inplace=True)

In [193]:
df_prediction_cultivar['sum(sessions)'] = 0

In [194]:
df_prediction_cultivar

Unnamed: 0,day_of_year,site,channel_grouping,device,sum(sessions)
0,169,Cultivar,Direct,desktop,0
1,169,Cultivar,Direct,tablet,0
2,169,Cultivar,Direct,mobile,0
3,169,Cultivar,Direct,smart tv,0
4,169,Cultivar,Organic Search,desktop,0
...,...,...,...,...,...
5511,365,Cultivar,Googles News,smart tv,0
5512,365,Cultivar,Autopromotion,desktop,0
5513,365,Cultivar,Autopromotion,tablet,0
5514,365,Cultivar,Autopromotion,mobile,0


In [195]:
df_prediction_cultivar_original = df_prediction_cultivar

In [196]:
df_prediction_cultivar_original = df_prediction_cultivar_original.drop('sum(sessions)', axis = 1)

Normalisation des données

In [197]:
# select only numeric variables
num_cols = df_prediction_cultivar.select_dtypes(include="number").drop(columns='sum(sessions)').columns
print(num_cols)

Index(['day_of_year'], dtype='object')


In [198]:
df_prediction_cultivar_num = df_prediction_cultivar
df_prediction_cultivar_num = df_prediction_cultivar_num[num_cols]

In [199]:
df_prediction_cultivar[num_cols] = standard_scal.fit_transform(df_prediction_cultivar_num.values)

Encodage des variables catégorielles

In [200]:
categorical_variables = ["site", "channel_grouping", "device"]

In [201]:
# encoding des catégories de "site"
ohe_site = pd.get_dummies(df_prediction_cultivar["site"], prefix="site")
df_prediction_cultivar = pd.concat([df_prediction_cultivar, ohe_site], axis=1)

In [202]:
# encoding des catégories de "channel_grouping"
ohe_channel_grouping = pd.get_dummies(df_prediction_cultivar["channel_grouping"], prefix="channel")
df_prediction_cultivar = pd.concat([df_prediction_cultivar, ohe_channel_grouping], axis=1)

In [203]:
# encoding des catégories de "device"
ohe_device = pd.get_dummies(df_prediction_cultivar["device"], prefix="device")
df_prediction_cultivar = pd.concat([df_prediction_cultivar, ohe_device], axis=1)

In [204]:
cols_to_drop = ['site', 'channel_grouping', 'device']

In [205]:
df_prediction_cultivar = df_prediction_cultivar.drop(columns=cols_to_drop)

In [206]:
df_prediction_cultivar.dropna()

Unnamed: 0,day_of_year,sum(sessions),site_Cultivar,channel_Autopromotion,channel_Direct,channel_Email,channel_Googles News,channel_Organic Search,channel_Referral,channel_Social,device_desktop,device_mobile,device_smart tv,device_tablet
0,-1.723281,0,1,0,1,0,0,0,0,0,1,0,0,0
1,-1.723281,0,1,0,1,0,0,0,0,0,0,0,0,1
2,-1.723281,0,1,0,1,0,0,0,0,0,0,1,0,0
3,-1.723281,0,1,0,1,0,0,0,0,0,0,0,1,0
4,-1.723281,0,1,0,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5511,1.723281,0,1,0,0,0,1,0,0,0,0,0,1,0
5512,1.723281,0,1,1,0,0,0,0,0,0,1,0,0,0
5513,1.723281,0,1,1,0,0,0,0,0,0,0,0,0,1
5514,1.723281,0,1,1,0,0,0,0,0,0,0,1,0,0


In [207]:
X_new = df_prediction_cultivar.drop(columns='sum(sessions)')

In [208]:
X_new

Unnamed: 0,day_of_year,site_Cultivar,channel_Autopromotion,channel_Direct,channel_Email,channel_Googles News,channel_Organic Search,channel_Referral,channel_Social,device_desktop,device_mobile,device_smart tv,device_tablet
0,-1.723281,1,0,1,0,0,0,0,0,1,0,0,0
1,-1.723281,1,0,1,0,0,0,0,0,0,0,0,1
2,-1.723281,1,0,1,0,0,0,0,0,0,1,0,0
3,-1.723281,1,0,1,0,0,0,0,0,0,0,1,0
4,-1.723281,1,0,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5511,1.723281,1,0,0,0,1,0,0,0,0,0,1,0
5512,1.723281,1,1,0,0,0,0,0,0,1,0,0,0
5513,1.723281,1,1,0,0,0,0,0,0,0,0,0,1
5514,1.723281,1,1,0,0,0,0,0,0,0,1,0,0


In [209]:
y_pred = rf_model.predict(X_new)

In [210]:
df_prediction_cultivar['sum(sessions)'] = y_pred

In [211]:
df_prediction_cultivar.head(15)

Unnamed: 0,day_of_year,sum(sessions),site_Cultivar,channel_Autopromotion,channel_Direct,channel_Email,channel_Googles News,channel_Organic Search,channel_Referral,channel_Social,device_desktop,device_mobile,device_smart tv,device_tablet
0,-1.723281,15.25,1,0,1,0,0,0,0,0,1,0,0,0
1,-1.723281,1.93,1,0,1,0,0,0,0,0,0,0,0,1
2,-1.723281,15.46,1,0,1,0,0,0,0,0,0,1,0,0
3,-1.723281,15.19,1,0,1,0,0,0,0,0,0,0,1,0
4,-1.723281,46.29,1,0,0,0,0,1,0,0,1,0,0,0
5,-1.723281,4.1,1,0,0,0,0,1,0,0,0,0,0,1
6,-1.723281,53.38,1,0,0,0,0,1,0,0,0,1,0,0
7,-1.723281,9.69,1,0,0,0,0,1,0,0,0,0,1,0
8,-1.723281,2.38,1,0,0,0,0,0,0,1,1,0,0,0
9,-1.723281,1.73,1,0,0,0,0,0,0,1,0,0,0,1


In [212]:
df_prediction_cultivar = pd.concat([df_prediction_cultivar_original, df_prediction_cultivar['sum(sessions)']], axis=1)

In [213]:
df_prediction_cultivar

Unnamed: 0,day_of_year,site,channel_grouping,device,sum(sessions)
0,169,Cultivar,Direct,desktop,15.25
1,169,Cultivar,Direct,tablet,1.93
2,169,Cultivar,Direct,mobile,15.46
3,169,Cultivar,Direct,smart tv,15.19
4,169,Cultivar,Organic Search,desktop,46.29
...,...,...,...,...,...
5511,365,Cultivar,Googles News,smart tv,2.24
5512,365,Cultivar,Autopromotion,desktop,1.14
5513,365,Cultivar,Autopromotion,tablet,1.33
5514,365,Cultivar,Autopromotion,mobile,1.81


In [214]:
import datetime

In [215]:
df_prediction_cultivar['date'] = df_prediction_cultivar['day_of_year'].apply(lambda x : datetime.date.fromordinal(x).strftime('2023-%m-%d'))

In [216]:
df_prediction_cultivar = df_prediction_cultivar.drop('day_of_year', axis = 1)

In [217]:
df_prediction_cultivar

Unnamed: 0,site,channel_grouping,device,sum(sessions),date
0,Cultivar,Direct,desktop,15.25,2023-06-18
1,Cultivar,Direct,tablet,1.93,2023-06-18
2,Cultivar,Direct,mobile,15.46,2023-06-18
3,Cultivar,Direct,smart tv,15.19,2023-06-18
4,Cultivar,Organic Search,desktop,46.29,2023-06-18
...,...,...,...,...,...
5511,Cultivar,Googles News,smart tv,2.24,2023-12-31
5512,Cultivar,Autopromotion,desktop,1.14,2023-12-31
5513,Cultivar,Autopromotion,tablet,1.33,2023-12-31
5514,Cultivar,Autopromotion,mobile,1.81,2023-12-31


In [219]:
df_prediction_cultivar.to_csv('session_prediction_cultivar.csv')

In [220]:
df_prediction_cultivar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5516 entries, 0 to 5515
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   site              5516 non-null   object 
 1   channel_grouping  5516 non-null   object 
 2   device            5516 non-null   object 
 3   sum(sessions)     5516 non-null   float64
 4   date              5516 non-null   object 
dtypes: float64(1), object(4)
memory usage: 215.6+ KB


In [221]:
df_session_ML = pd.read_csv(r"C:\Users\Sarah\Desktop\Formation Databird\Projet\session_complete_ML.csv")

In [222]:
df_session_ML.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7749 entries, 0 to 7748
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   site              7749 non-null   object
 1   date              7749 non-null   object
 2   channel_grouping  7749 non-null   object
 3   device            7749 non-null   object
 4   sum(sessions)     7749 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 302.8+ KB


In [223]:
df_session_ML['sum(sessions)'] = df_session_ML['sum(sessions)'].astype(float)

In [224]:
df_cultivar = df_session_ML[df_session_ML['site'] == 'Cultivar']

In [225]:
df_cultivar

Unnamed: 0,site,date,channel_grouping,device,sum(sessions)
1029,Cultivar,2023-01-01,Direct,desktop,4.0
1030,Cultivar,2023-01-01,Direct,mobile,20.0
1031,Cultivar,2023-01-01,Email,desktop,2.0
1032,Cultivar,2023-01-01,Email,mobile,14.0
1033,Cultivar,2023-01-01,Email,tablet,6.0
...,...,...,...,...,...
3030,Cultivar,2023-06-17,Organic Search,mobile,61.0
3031,Cultivar,2023-06-17,Organic Search,tablet,1.0
3032,Cultivar,2023-06-17,Referral,desktop,1.0
3033,Cultivar,2023-06-17,Social,desktop,1.0


In [226]:
df_concat_cultivar = pd.concat([df_cultivar, df_prediction_cultivar])

In [227]:
df_concat_cultivar

Unnamed: 0,site,date,channel_grouping,device,sum(sessions)
1029,Cultivar,2023-01-01,Direct,desktop,4.00
1030,Cultivar,2023-01-01,Direct,mobile,20.00
1031,Cultivar,2023-01-01,Email,desktop,2.00
1032,Cultivar,2023-01-01,Email,mobile,14.00
1033,Cultivar,2023-01-01,Email,tablet,6.00
...,...,...,...,...,...
5511,Cultivar,2023-12-31,Googles News,smart tv,2.24
5512,Cultivar,2023-12-31,Autopromotion,desktop,1.14
5513,Cultivar,2023-12-31,Autopromotion,tablet,1.33
5514,Cultivar,2023-12-31,Autopromotion,mobile,1.81


In [232]:
df_concat_cultivar.nunique()

site                   1
date                 365
channel_grouping       7
device                 4
sum(sessions)       2452
dtype: int64

In [229]:
df_concat_cultivar.to_csv('session_prediction_cultivar_complete.csv', index=False)