In [13]:
import numpy as np
import pandas as pd

# pour afficher toutes les colonnes:
pd.options.display.max_columns = None

import matplotlib.pyplot as plt# Pie chart
import matplotlib.cm as cm

#nlp
from sklearn.feature_extraction.text import CountVectorizer

# modélisation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# évaluation modèle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [33]:
df = pd.read_csv("../dataset/data_modelisation.csv", sep=',')
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

data = pd.read_csv("../dataset/data_gp.csv", sep=',')
data = data.drop(['Unnamed: 0'], axis=1)
data.head()

Unnamed: 0,date,nos,ind,jour,semaine,mois,menu,gaspillage_volume,ferie,vacances,prevision,reel,site_nom,site_type,annee_scolaire,effectif,Longitude_Latitude
0,2011-01-03,1.0,0,Lundi,1,Janvier,"['Pamplemousse', 'Hachis Parmentier', 'Salade ...",1160,0,0,131.0,122.0,CHENE D'ARON,E,2010-2011,145.0,"[-1.56090982193118,47.2122056379041]"
1,2011-01-03,1.0,0,Lundi,1,Janvier,"['Pamplemousse', 'Hachis Parmentier', 'Salade ...",1160,0,0,73.0,58.0,CHENE D'ARON,M,2010-2011,78.0,"[-1.56031985366696,47.2120061437076]"
2,2011-01-03,1.0,0,Lundi,1,Janvier,"['Pamplemousse', 'Hachis Parmentier', 'Salade ...",1160,0,0,49.0,49.0,COTE D'OR,M,2010-2011,66.0,"[-1.57259938808227,47.2472636471014]"
3,2011-01-03,1.0,0,Lundi,1,Janvier,"['Pamplemousse', 'Hachis Parmentier', 'Salade ...",1160,0,0,265.0,241.0,AGENETS,M/E,2010-2011,295.0,"[-1.53043977895119,47.2276390419726]"
4,2011-01-03,1.0,0,Lundi,1,Janvier,"['Pamplemousse', 'Hachis Parmentier', 'Salade ...",1160,0,0,57.0,53.0,ALAIN FOURNIER,M,2010-2011,114.0,"[-1.60177038499681,47.2041450203042]"


## Construction des datasets

In [28]:
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

train_data = df['2015-09':'2018-07']
X_train = train_data[['effectif','porc','viande','poisson','bio','noel','an_chinois','plat_prepare','maison','laitage']]
y_train = train_data[['reel']]

test_data = df['2018-09':'2019-07']
X_test = test_data[['effectif','porc','viande','poisson','bio','noel','an_chinois','plat_prepare','maison','laitage']]
y_test = test_data[['reel']]

data=df.drop(['effectif'], axis=1)
X = data['2015-09':'2020-07']
y = X[['reel']]

In [29]:
# baseline
### RMSE de référence pour les prévisions, elles doivent être en dessous de ce 
### score pour que le modèle soit considéré commec correct

avg_fare = round(np.mean(y_train), 2)
baseline_pred = np.repeat(avg_fare, y_test.shape[0])
baseline_MSE = (mean_squared_error(baseline_pred, y_test))
baseline_RMSE = baseline_MSE **0.5
print("RMSE de référence sur les données de validation :", baseline_RMSE)

RMSE de référence sur les données de validation : 81.9106402048608


In [30]:
#On commence à entrainer le modèle à partir de l'année scolaire 2015-2016
#On test sur l'année 2018-2019

import statsmodels.api as sm

# train set
N_START = len(df) - len(df['2015-09':'2016-07'])- len(df['2016-09':'2017-07'])- len(df['2017-09':'2018-07'])- len(df['2018-09':'2019-07'])- len(df['2019-09':'2020-07'])
# test set
N_INT = len(df) - len(df['2018-09':'2019-07'])- len(df['2019-09':'2020-07'])-1
N_END = len(df) - len(df['2019-09':'2020-07'])-1

# construction du modèle de régression linéaire multiple de moindre carré ordinaire
mod = sm.regression.linear_model.OLS(y.iloc[N_START:N_INT], X.iloc[N_START:N_INT])

# lancement de l'entraînement
res = mod.fit()


print(res.summary())

dfm = df

# prédiction du modèle
y_pred = res.predict(X.iloc[N_INT:N_END]) 
y_true = y["reel"].iloc[N_INT:N_END]

                                 OLS Regression Results                                
Dep. Variable:                   reel   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          1.745e+33
Date:                Mon, 23 Sep 2024   Prob (F-statistic):                        0.00
Time:                        17:16:24   Log-Likelihood:                      1.6282e+05
No. Observations:                5746   AIC:                                 -3.256e+05
Df Residuals:                    5738   BIC:                                 -3.256e+05
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------

In [35]:
df.head(1)

Unnamed: 0,date,reel,effectif,porc,viande,poisson,bio,noel,an_chinois,plat_prepare,maison,laitage
0,2011-01-03,122.0,145.0,0,1,0,0,0,0,1,0,0


In [34]:
# Préparation des données pour afficher la comparaison entre les valeurs prédites et estimées
X_pred = X.iloc[N_INT:N_END].join(data[["prevision", "reel", "date",'nos','ind']].set_index('date').groupby("date").sum())
X_pred["pred"] = y_pred

# temp.porc[(temp['porc'] > 1)] = 1

X_pred.nos[(X_pred['nos'] > 0 )] = X_pred.reel
X_pred.ind[(X_pred['ind'] >0)] = X_pred.reel
X_pred["gaspillage"] = (X_pred["prevision"] - X_pred["reel"]) / X_pred["prevision"]
X_pred["gaspi_pred"] = (X_pred.pred - X_pred.reel) / X_pred.pred

ValueError: columns overlap but no suffix specified: Index(['reel'], dtype='object')