## Paramètres en dur

### Dictionnaires Métriques

In [None]:
# Aggregation settings per granularity key: [shift in days applied to DATE, pandas frequency alias].
dico_tempo = {'w':[7, 'W-MON'], 'm': [30, 'M'], 'y': [365, 'Y']}
# Metric functions keyed by the short name used in the result tables.
dico_metrics = {'explained_var':metrics.explained_variance_score,
                'max_error': metrics.max_error,
                'mae' : mean_absolute_error,
                'geometric_absolute_error' : geometric_mean_absolute_error,
                'geometric_mean_squared_error' : geometric_mean_squared_error,
                'mape' : mean_absolute_percentage_error,
                # 'mse' used to point at the MAPE function (copy/paste slip), so the
                # 'mse' rows were a duplicate of the 'mape' rows.
                'mse' : metrics.mean_squared_error}


### Dictionnaires de paramètres fixes des modèles

In [None]:
def get_liste_jferies(annee_debut=2017, annee_fin=2030):
  """Gather the dates returned by JoursFeries.for_year for every year in range.

  Both bounds are inclusive. The dates are returned as one flat list, year
  after year, in the order each per-year mapping yields its values.
  """
  return [
      jour
      for annee in range(annee_debut, annee_fin + 1)
      for jour in JoursFeries.for_year(annee).values()
  ]

In [None]:
from datetime import datetime

# Bank-holiday frame for Prophet: one row per date, a constant label and the
# window bounds (same day up to one day after).
jferies = pd.DataFrame(dict(
    holiday='jferies',
    ds=pd.to_datetime(get_liste_jferies(2017, 2040)),
    lower_window=0,
    upper_window=1,
))

# Fixed parameters / search grids, one entry per model family.
dico_params = {
    'Prophet': {
        'growth': ['linear'],
        'yearly_seasonality': [True],
        'weekly_seasonality': [True],
        'daily_seasonality': [False],
        'seasonality_mode': ['additive', 'multiplicative'],
        'seasonality_prior_scale': [10, 20, 30, 50],
        'changepoint_prior_scale': [0.02, 0.03, 0.04, 0.05],
    },
    'Prophet_opti': {
        'seasonality_mode': ['additive', 'multiplicative'],
        'changepoint_prior_scale': [0.05, 0.01, 0.005, 0.001],
        'holidays_prior_scale': [10, 1, 0.1],
        'changepoint_range': [0.7, 0.8],
        'n_changepoints': [10],
        'yearly_seasonality': [True],
        'weekly_seasonality': [True],
        'holidays': [jferies],
        'growth': ['linear'],
    },
    'SARIMAX': {
        'start_p': 0,
        'start_q': 0,
        'max_p': 3,
        'max_q': 3,
        'start_P': 0,
        'start_Q': 0,
        'max_P': 2,
        'max_D': 2,
        'max_Q': 2,
        'sp': 52,
    },
    'MSTL': {
        'level': ['dtrend'],
        'stochastic_trend': [False],
        'stochastic_level': [False],
        'seasonal': [7],
        'irregular': [False],
        'freq_seasonal': [[{'period': 31, 'harmonics': 4}, {'period': 365, 'harmonics': 8}]],
    },
    'Expo': {
        'trend': ['additive', None],
        'seasonal': ['additive'],
        'sp': [364],
    },
    'Theta': {
        'sp': [364],
    },
    'RL': {
        'fourier': [4, 6, 8, 10, 12, 14, 16, 20, 24],
        'start_date': [
            datetime(2017, 6, 1),
            datetime(2018, 1, 1),
            datetime(2018, 7, 1),
            datetime(2019, 1, 1),
            datetime(2019, 7, 1),
        ],
    },
}

## Fonctions création base ou features

In [None]:
def abs_tempo(df_temp, tempo='w'):
  """Return a frame indexed by DATE whose single column `y` is the absenteeism rate.

  With tempo == 'd' the headcount columns are dropped, the remaining column
  is put on a daily grid and renamed `y`. For any other value, `tempo` must
  be a key of `dico_tempo`: DATE is shifted back by the configured number of
  days, the headcounts are summed per period and `y` is recomputed as
  NB_INACTIFS / NB_ACTIFS.
  """
  ordered = df_temp.copy().sort_values('DATE')

  if tempo == 'd':
    daily = ordered.set_index('DATE')
    daily = daily.drop(['NB_ACTIFS', 'NB_INACTIFS'], axis=1)
    daily = daily.asfreq('d')
    daily.columns = ['y']
    return(daily)

  counts = ordered.drop(columns=['TX_ABS'])
  shift_days = dico_tempo[tempo][0]
  freq_alias = dico_tempo[tempo][1]
  counts['DATE'] = pd.to_datetime(counts['DATE']) - pd.to_timedelta(shift_days, unit='d')

  grouped = counts.groupby([pd.Grouper(key='DATE', freq=freq_alias)]).sum()
  grouped = grouped.reset_index().sort_values('DATE')
  grouped['y'] = grouped['NB_INACTIFS']/grouped['NB_ACTIFS']
  grouped = grouped.set_index('DATE')
  grouped.index.freq = freq_alias
  return(grouped.drop(columns=['NB_ACTIFS', 'NB_INACTIFS']))

In [None]:
from jours_feries_france import JoursFeries

def _ferie_flag(jour):
  """Return 1 when `jour` is reported as a bank holiday for zone "Métropole", else 0."""
  return int(JoursFeries.is_bank_holiday(jour, zone="Métropole"))

# Each function below reads the date from the row label (`row.name`), moves it
# by a fixed number of days and returns the bank-holiday flag of that day.

def is_ferie(row):
  """Flag for the row's own date."""
  return _ferie_flag(row.name)

def is_ferie_demain(row):
  """Flag for the day after the row's date."""
  return _ferie_flag(row.name + timedelta(days=1))

def is_ferie_apresdemain(row):
  """Flag for two days after the row's date."""
  return _ferie_flag(row.name + timedelta(days=2))

def is_ferie_hier(row):
  """Flag for the day before the row's date."""
  return _ferie_flag(row.name - timedelta(days=1))

def is_ferie_avanthier(row):
  """Flag for two days before the row's date."""
  return _ferie_flag(row.name - timedelta(days=2))

def is_ferie_apresapresdemain(row):
  """Flag for three days after the row's date."""
  return _ferie_flag(row.name + timedelta(days=3))

def is_ferie_avantavanthier(row):
  """Flag for three days before the row's date."""
  return _ferie_flag(row.name - timedelta(days=3))
  
def period_to_datetime(period):
  """Build a datetime (time part left at its default) from the year, month and day of `period`."""
  return datetime(year=period.year, month=period.month, day=period.day)

def _dans_fenetre(jour, debut, fin):
  """True when debut <= jour < fin (the end bound is exclusive)."""
  return jour >= debut and jour < fin

def _avancement_fenetre(jour, debut, fin):
  """Fraction of the window elapsed at `jour`, or 0 when `jour` is outside it."""
  if not _dans_fenetre(jour, debut, fin):
    return 0
  return (jour - debut).days / (fin - debut).days

def confinement(row):
  """Return 1 when the row's date falls inside any of the three lockdown windows, else 0."""
  jour = period_to_datetime(row.name)
  premier = _dans_fenetre(jour, datetime(2020,3,17), datetime(2020,5,11))
  deuxieme = _dans_fenetre(jour, datetime(2020,10,30), datetime(2020,12,15))
  troisieme = _dans_fenetre(jour, datetime(2021,4,3), datetime(2021,5,3))
  return int(premier or deuxieme or troisieme)

def confinement1(row):
  """Elapsed fraction of the first window (2020-03-17 to 2020-05-11), 0 outside it."""
  return _avancement_fenetre(period_to_datetime(row.name), datetime(2020,3,17), datetime(2020,5,11))

def confinement2(row):
  """Elapsed fraction of the second window (2020-10-30 to 2020-12-15), 0 outside it."""
  return _avancement_fenetre(period_to_datetime(row.name), datetime(2020,10,30), datetime(2020,12,15))

def confinement3(row):
  """Elapsed fraction of the third window (2021-04-03 to 2021-05-03), 0 outside it."""
  return _avancement_fenetre(period_to_datetime(row.name), datetime(2021,4,3), datetime(2021,5,3))

def confinement1_intercept(row):
  """1 inside the first window (2020-03-17 to 2020-05-11), else 0."""
  return int(_dans_fenetre(period_to_datetime(row.name), datetime(2020,3,17), datetime(2020,5,11)))

def confinement2_intercept(row):
  """1 inside the second window (2020-10-30 to 2020-12-15), else 0."""
  return int(_dans_fenetre(period_to_datetime(row.name), datetime(2020,10,30), datetime(2020,12,15)))

def confinement3_intercept(row):
  """1 inside the third window (2021-04-03 to 2021-05-03), else 0."""
  return int(_dans_fenetre(period_to_datetime(row.name), datetime(2021,4,3), datetime(2021,5,3)))
  
def confinement123(row):
  """Elapsed fraction of whichever lockdown window contains the row label, else 0.

  The row label (`row.name`) is compared to the window bounds directly,
  without going through period_to_datetime.
  """
  fenetres = (
      (datetime(2020,3,17), datetime(2020,5,11)),
      (datetime(2020,10,30), datetime(2020,12,15)),
      (datetime(2021,4,3), datetime(2021,5,3)),
  )
  for debut, fin in fenetres:
    if row.name >= debut and row.name < fin:
      ecoule = (row.name - debut).days
      duree = (fin - debut).days
      return ecoule/duree
  return 0

def covid_changepoint(row):
  """Return 1 when the row label is on or after 2020-03-17, else 0."""
  bascule = datetime(2020,3,17)
  return int(row.name >= bascule)

from vacances_scolaires_france import SchoolHolidayDates
d = SchoolHolidayDates()
def is_holidays(row):
  """Return 1 when the row's date is a holiday for zones 'A', 'B' and 'C' at once, else 0.

  Zones are checked in order and the check stops at the first zone that is
  not on holiday.
  """
  jour = datetime.date(row.name)
  for zone in ('A', 'B', 'C'):
    verdict = d.is_holiday_for_zone(jour, zone)
    if not verdict:
      break
  return int(verdict)

## Find Start Date

In [None]:
def create_split_dates(ts_, real_split_date):
  """Return the candidate split dates between the first DATE of `ts_` and `real_split_date`.

  The range is generated with the 'm' frequency alias. `ts_` must contain
  the DATE and NB_SALARIE columns; it is not modified.
  """
  frame = ts_.copy().sort_values('DATE').set_index('DATE')
  frame = frame.drop(['NB_SALARIE'], axis=1)
  first_day = frame.index.min()
  return pd.date_range(first_day, real_split_date, freq='m')

def find_start_date_trend(ts_, real_split_date):
  """
  Pick the candidate split date at which the linear trend changes the most.

  For every date returned by create_split_dates, one linear trend is fitted
  on the data up to that date and another on the data after it. The date
  with the largest absolute difference between the two slopes is returned.

  Parameters:

  ts_ : dataframe
    Must contain DATE, NB_SALARIE and exactly one other column, which is used as the series.
  real_split_date :
    Upper bound of the candidate split dates.

  Raises IndexError when create_split_dates yields no candidate.
  """

  # Daily series with a single column named 'y'
  ts = ts_.copy()
  ts = ts.sort_values('DATE')
  ts.index = ts['DATE']
  ts = ts.drop(['DATE', 'NB_SALARIE'], axis=1)
  ts = ts.asfreq("D")
  ts.columns = ['y']

  # Blank out the lockdown windows, then fill them backwards from the next valid value.
  # A boolean mask is used instead of label lookup so that a series which does
  # not cover every lockdown day is still accepted.
  fenetres = (
      ('2020-03-10', '2020-05-10'),
      ('2020-10-30', '2020-12-14'),
      ('2021-04-03', '2021-05-02'),
  )
  for debut, fin in fenetres:
    ts.loc[ts.index.isin(pd.date_range(debut, fin))] = np.nan
  ts = ts.bfill()

  split_dates = create_split_dates(ts_, real_split_date)
  best_split_date = split_dates[0]
  max_rupture = 0

  # Everything below depends only on the whole series, so it is built once
  # rather than once per candidate date.
  train_start = ts.index.min()
  val_end = ts.index.max()
  y = ts[['y']].copy()
  y.index = pd.period_range(start=train_start, end=val_end, freq="d")
  dp = DeterministicProcess(
      index=y.index,
      constant=False,              # no constant column; the regressions fit their own intercept
      order=1,                     # linear trend
      seasonal=False,              # no seasonal indicators
  )
  X = dp.in_sample()

  for split_date in split_dates:
    val_start = split_date + timedelta(days=1)

    # One slope before the candidate date, one after it
    model_train = LinearRegression(fit_intercept=True)
    model_train.fit(X[train_start:split_date], y[train_start:split_date]['y'])
    model_val = LinearRegression(fit_intercept=True)
    model_val.fit(X[val_start:val_end], y[val_start:val_end]['y'])
    rupture = np.abs(model_train.coef_[0] - model_val.coef_[0])

    if rupture > max_rupture:
      max_rupture = rupture
      best_split_date = split_date

  return best_split_date

In [None]:
def find_start_date_variance(y, tune=0.5, cut=31):
  """
  Return the first date at which the short-term deviation of `y['y']` reaches `tune` times its maximum.

  The deviation is the absolute gap between a centred 3-step rolling mean
  and a centred 182-step rolling mean (at least 91 points), with gaps filled
  forwards and then backwards.

  Parameters:

  y : dataframe
    Needs a `y` column. The index is passed to pd.period_range, so it is
    expected to hold periods. The frame is not modified.
  tune : float
    Between 0 and 1. The higher it is, the later the returned date.
  cut : int
    Not used by this function.

  A warning is printed when fewer than 365 steps remain from the returned date.
  """
  serie = y['y']
  lissage_court = serie.rolling(window=3, center=True).mean()
  lissage_long = serie.rolling(window=182, center=True, min_periods=91).mean()
  # ffill()/bfill() replace the deprecated fillna(method=...) calls
  variance = (lissage_court - lissage_long).abs().ffill().bfill()

  # argmax on the boolean mask gives the position of the first True
  seuil = variance.max() * tune
  start_date = variance.index[(variance >= seuil).argmax()]
  total_length = len(pd.period_range(start_date, variance.index.max()))
  if total_length < 365:
    print(f"WARNING : La longueur du dataset d'entrainement est inférieur à un an : {total_length} jours")
    print("start_date = ", start_date)

  return start_date

In [None]:
def preprocess_data(ts_, omicron = True, tune=0.5, cut=31):
  """
  Smooth out the first covid lockdown (and optionally the Omicron peak), then trim the series.

  The window 2020-03-10 to 2020-05-10 is replaced by the prediction of a
  linear regression (trend, seasonal indicators, 6 Fourier pairs and
  bank-holiday flags) trained on 2020-01-10 to 2020-03-09 together with
  2020-05-11 to 2020-07-10. When `omicron` is True the window 2021-12-24 to
  2022-02-14 is replaced the same way, trained on 2021-10-01 to 2021-12-23
  together with 2022-02-15 to 2022-03-14. If either replacement raises, the
  error is swallowed and the corresponding window is left as it is.

  The last `cut` steps are then removed and the series starts at the date
  returned by find_start_date_variance.

  Parameters :

  ts_ : dataframe
    Must contain DATE, NB_ACTIFS, NB_INACTIFS and exactly one other column, which is used as the series.

  omicron : bool
    If True, also replace the Omicron absenteeism peak of early 2022.

  tune : float
    Forwarded to find_start_date_variance.

  cut : int
    Number of trailing time steps to drop (end of the declaration period).

  Returns :

  y, y_nan, y_real
    y is the corrected series as a one-column frame on a daily period index.
    y_nan is a copy of y (the code that would blank the windows is commented out).
    y_real is read back from `ts` after the corrections.
    NOTE(review): whether y_real still holds the untouched values, and which
    index it carries, depends on whether `y` shares memory with `ts` -- TODO confirm.
    NOTE(review): the previous docstring described a `method` argument
    ('replace', 'remove', 'nothing'); the signature has no such parameter.
  """
  
  # Daily series with a single column named 'y'
  ts = ts_.copy()
  ts = ts.sort_values('DATE')
  ts.index = ts['DATE']
  ts = ts.drop(['DATE', 'NB_ACTIFS', 'NB_INACTIFS'], axis=1)
  ts = ts.asfreq("D")
  ts.columns = ['y']
  
  # Date bounds
  start = ts.index.min()
  
  # First window: the period to replace and the training ranges on both sides of it
  confinement1_deb = pd.to_datetime('2020-03-10')
  confinement1_fin = pd.to_datetime('2020-05-10')
  train1_start = pd.to_datetime('2020-01-10')
  train1_end = confinement1_deb - timedelta(days=1)
  train1_after_start = confinement1_fin + timedelta(days=1)
  train1_after_end = pd.to_datetime('2020-07-10')
  val1_start = confinement1_deb
  val1_end = confinement1_fin
  
  # Second window: despite the variable names, these dates are the Omicron peak handled under `omicron`
  confinement2_deb = pd.to_datetime('2021-12-24')
  confinement2_fin = pd.to_datetime('2022-02-14')
  train2_start = pd.to_datetime('2021-10-01')
  train2_end = confinement2_deb - timedelta(days=1)
  train2_after_start = confinement2_fin + timedelta(days=1)
  train2_after_end = pd.to_datetime('2022-03-14')
  val2_start = confinement2_deb
  val2_end = confinement2_fin
  
  end = ts.index.max()

  # Re-index the target on daily periods
  y = ts['y']
  y.index = pd.period_range(start=ts.index.min(),end=ts.index.max(), freq="d")
  real = y.copy()  # not used further down
  X_idx = pd.period_range(start=start, end=end , freq="d")
  
  train1_idx = pd.period_range(start=train1_start, end=train1_end, freq="d").union(pd.period_range(start=train1_after_start, end=train1_after_end, freq="d"))
  val1_idx = pd.period_range(start=val1_start, end=val1_end, freq="d")
  
  train2_idx = pd.period_range(start=train2_start, end=train2_end, freq="d").union(pd.period_range(start=train2_after_start, end=train2_after_end, freq="d"))
  val2_idx = pd.period_range(start=val2_start, end=val2_end, freq="d")
  
  # Feature matrix shared by both replacement models
  fourier = CalendarFourier(freq="M", order=6)  # 6 sin/cos pairs; NOTE(review): freq is "M" although the sibling feature builders use "A" -- confirm
  dp = DeterministicProcess(
      index=X_idx,
      constant=False,               # no constant column; LinearRegression fits its own intercept
      order=1,                     # trend (order 1 means linear)
      seasonal=True,               # seasonal indicators
      additional_terms=[fourier],  # Fourier terms defined above
      drop=True,                   # drop terms to avoid collinearity
  )
  X = dp.in_sample() 
  # Bank-holiday flags for the day itself and up to three days before/after
  X['isferie'] = X.apply(is_ferie, axis=1)
  X['isferie_demain'] = X.apply(is_ferie_demain, axis=1)
  X['isferie_hier'] = X.apply(is_ferie_hier, axis=1)
  X['isferie_apresdemain'] = X.apply(is_ferie_apresdemain, axis=1)
  X['isferie_avanthier'] = X.apply(is_ferie_avanthier, axis=1)
  X['isferie_apresapresdemain'] = X.apply(is_ferie_apresapresdemain, axis=1)
  X['isferie_avantavanthier'] = X.apply(is_ferie_avantavanthier, axis=1)

  # First window: fit around it, predict inside it, overwrite y
  try:
    X_train1 = X.loc[train1_idx]
    X_val1 = X[val1_start: val1_end]
    model = LinearRegression(fit_intercept=True)
    model.fit(X_train1, y.loc[train1_idx])
    y_pred1 = model.predict(X_val1)
    y_pred1 = pd.Series(y_pred1, name='y_pred1', index=X_val1.index)
    y.loc[val1_idx] = y_pred1
  except Exception:
    # Any failure is ignored: the window is then left as it is
    pass
  # Second window (Omicron), same procedure
  if omicron:
    try:
      X_train2 = X.loc[train2_idx]
      X_val2 = X[val2_start: val2_end]
      model = LinearRegression(fit_intercept=True)
      model.fit(X_train2, y.loc[train2_idx])
      y_pred2 = model.predict(X_val2)
      y_pred2 = pd.Series(y_pred2, name='y_pred2', index=X_val2.index)
      y.loc[val2_idx] = y_pred2
    except Exception:
      # Any failure is ignored: the window is then left as it is
      pass

  # Uncomment the line below to compare with the initial time series
#     y['real'] = real

  # Drop the last `cut` steps, then start at the detected start date
  y = y.to_frame('y')
  y = y.iloc[:-cut]
  start_date = find_start_date_variance(y, tune=tune, cut=cut)
  y = y[start_date:]

  # "remove" variant: the masking below is disabled, so y_nan is a plain copy of y
  y_nan = y.copy()
#   val1_idx = pd.period_range(start=val1_start, end=val1_end, freq="d")
#   y_nan.loc[val1_idx] = np.nan
#   if omicron:
#     val2_idx = pd.period_range(start=val2_start, end=val2_end, freq="d")
#     y_nan.loc[val2_idx] = np.nan

  # "nothing" variant: series read back from ts, trimmed like y
#   print("real_start_date = ", ts.index.min())
  y_real = ts['y']
  y_real = y_real.to_frame('y')
  y_real = y_real[start_date:]
  y_real = y_real.iloc[:-cut]

  return y, y_nan, y_real

## SARIMAX

In [None]:
import re

In [None]:
def create_features_sarimax(idx):
  """Build the exogenous feature frame for the SARIMAX model from a period index.

  Bank-holiday flags are computed per day, then turned into a centred 7-step
  rolling sum (rolling mean times 7). They are joined with the columns of a
  DeterministicProcess from position 8 onwards (presumably the Fourier terms;
  TODO confirm the column layout). Missing values are replaced by 0 and the
  result is indexed by timestamps.
  """
  ferie_builders = (
      ('isferie', is_ferie),
      ('isferie_demain', is_ferie_demain),
      ('isferie_hier', is_ferie_hier),
      ('isferie_apresdemain', is_ferie_apresdemain),
      ('isferie_avanthier', is_ferie_avanthier),
      ('isferie_apresapresdemain', is_ferie_apresapresdemain),
      ('isferie_avantavanthier', is_ferie_avantavanthier),
  )

  # Holiday part
  feries = pd.DataFrame({'DATE' : idx})
  feries.index = feries.DATE
  for colonne, constructeur in ferie_builders:
    feries[colonne] = feries.apply(constructeur, axis=1)
  feries = feries.drop(columns=['DATE'])
  feries.index = feries.index.to_timestamp()
  feries = feries.rolling(window=7, center=True).mean() * 7

  # Deterministic part
  processus = DeterministicProcess(
      index=idx,
      constant=False,
      order=1,
      seasonal=True,
      additional_terms=[CalendarFourier(freq="A", order=10)],
      drop=False,
  )
  deterministe = processus.in_sample().iloc[:,8:]
  deterministe.index = deterministe.index.to_timestamp()

  fusion = feries.merge(deterministe, how='inner', left_index=True, right_index=True)
  return fusion.fillna(0)

In [None]:
def _parse_sarimax_orders(summary_string):
  """Extract (order, seasonal_order) from an AutoARIMA summary; fall back to zeros when absent."""
  order_match = re.search(r'SARIMAX\(([0-9]+), ([0-9]+), ([0-9]+)', summary_string)
  ar_order = tuple(int(v) for v in order_match.groups()) if order_match else (0,0,0)

  season_order = (0,0,0,0)
  # The seasonal MA term is printed either as a plain number or between brackets
  seasonal_patterns = (
      r'x\(([0-9]+), ([0-9]+), ([0-9]+), ([0-9]+)',
      r'x\(([0-9]+), ([0-9]+), \[([0-9]+)\], ([0-9]+)',
  )
  for pattern in seasonal_patterns:
    season_match = re.search(pattern, summary_string)
    if season_match:
      season_order = tuple(int(v) for v in season_match.groups())
      break
  return ar_order, season_order


def model_sarimax(timeseries,val_size, steps=365): 
  """
  Fit a weekly SARIMAX model with exogenous features and return its predictions.

  The series is cleaned with preprocess_data, aggregated per week, and split
  into train and validation. AutoARIMA is run on the training part to find
  the orders, which are read from its summary text. A SARIMAX with those
  orders is fitted on the training part for the train/val predictions and
  refitted on the whole series for the future predictions.

  Parameters :

  timeseries : dataframe
    Must contain DATE, NB_ACTIFS, NB_INACTIFS and TX_ABS.
  val_size :
    Passed as test_size to temporal_train_test_split.
  steps : int
    Horizon in days; int(steps/7) weekly steps are predicted.

  Returns (best_params, train, val, test): best_params holds 'order' and
  'seasonal_order'; the three frames have a `pred` column.
  """
  param = dico_params['SARIMAX']
    
  # Clean the daily series, rebuild the absent headcount, then aggregate per week
  pred_dico = {'val_size' : val_size, 'steps' : int(steps/7)}
  ts,_,_ = preprocess_data(timeseries)
  ts.index = ts.index.to_timestamp()
  ts_merged = ts.merge(timeseries, how='inner', left_index=True, right_on = 'DATE')
  ts_merged['NB_INACTIFS'] = ts_merged['y']*ts_merged['NB_ACTIFS']
  ts_merged = ts_merged.drop(columns=['y', 'TX_ABS']).reset_index(drop=True)
  ts_merged['DATE'] = pd.to_datetime(ts_merged['DATE']) - pd.to_timedelta(7, unit='d')
  # Columns are selected with a list: selecting with a bare tuple is rejected by recent pandas
  ts = ts_merged.groupby([pd.Grouper(key='DATE', freq='W')])[['NB_ACTIFS', 'NB_INACTIFS']].sum()
  ts['y'] = ts['NB_INACTIFS']/ts['NB_ACTIFS']
  ts = ts.drop(columns=['NB_ACTIFS', 'NB_INACTIFS'])
  y = ts[['y']]
  y_train, y_val = temporal_train_test_split(y, test_size = pred_dico['val_size'])
  
  # Forecasting horizons
  fh_train = ForecastingHorizon(y_train.index, is_relative=False)
  fh_val = ForecastingHorizon(y_val.index, is_relative=False)
  test_dates = pd.date_range(start=y_val.index.max() + timedelta(7) ,periods=pred_dico['steps'] , freq="W-SUN")
  fh_test = ForecastingHorizon(test_dates, is_relative=False)
  
  # AutoARIMA is only used to search for the orders
  forecaster = AutoARIMA(start_p=param['start_p'], start_q = param['start_q'],max_p = param['max_p'], max_q=param['max_q'], start_P = param['start_P'],start_Q = param['start_Q'],
                                    max_P=param['max_P'], max_D = param['max_D'],max_Q = param['max_Q'],sp = param['sp'], n_jobs=-1)
  forecaster.fit(y_train)
  
  ar_order, season_order = _parse_sarimax_orders(str(forecaster.summary()))
  dict_bp = {'order' : ar_order, 'seasonal_order' : season_order}
  
  # Daily features from the first training date to the end of the forecast horizon
  train_start = y_train.index.min()
  val_end = y_val.index.max()
  test_end = val_end + timedelta(days=7) + timedelta(days=steps)
  X_idx = pd.period_range(start=train_start,end=test_end , freq="d")
  
  features = create_features_sarimax(X_idx)
  y_features = y.merge(features, how='inner', left_index=True, right_index = True).drop(columns=['y'])
  train_features = y_train.merge(features, how='inner', left_index=True, right_index = True).drop(columns=['y'])
  val_features = y_val.merge(features, how='inner', left_index=True, right_index = True).drop(columns=['y'])
  test_features = pd.DataFrame(index=test_dates).merge(features, how='inner', left_index=True, right_index = True)
  
  # Model fitted on train only: train and validation predictions
  forecaster = SARIMAX(order=dict_bp['order'], seasonal_order=dict_bp['seasonal_order'])
  forecaster.fit(y_train, train_features)
  train = forecaster.predict(fh_train, X =train_features).rename(columns = {'y' : 'pred'})
  val = forecaster.predict(fh=fh_val, X=val_features).rename(columns = {'y' : 'pred'})
  # Model refitted on the whole series: future predictions
  forecaster = SARIMAX(order=dict_bp['order'], seasonal_order=dict_bp['seasonal_order'])
  forecaster.fit(y, y_features)
  test = forecaster.predict(fh_test, X =test_features).rename(columns = {'y' : 'pred'})

  best_params = dict_bp
 
  train.index = train.index.to_period(freq='d')
  val.index = val.index.to_period(freq='d')
  test.index = test.index.to_period(freq='d')
  
  return(best_params, train, val, test)

## Modèle RL

In [None]:
def create_features(idx, covid_=False, drop=True, mois_=False, order=10):
  """Build the regression feature frame for the period index `idx`.

  The frame holds a linear trend, seasonal indicators and `order` Fourier
  pairs (CalendarFourier with freq "A"), plus bank-holiday flags for the day
  itself and up to three days before/after.

  covid_ : bool
    When True, add a `covid` column computed by the `covid` function.
  drop : bool
    Forwarded to DeterministicProcess (drop terms to avoid collinearity).
  mois_ : bool
    When True, add `mois` and `day` columns computed by the functions of the same names.
  order : int
    Number of sin/cos pairs.
  """
  processus = DeterministicProcess(
      index=idx,
      constant=False,
      order=1,
      seasonal=True,
      additional_terms=[CalendarFourier(freq="A", order=order)],
      drop=drop,
  )
  X = processus.in_sample()

  ferie_builders = (
      ('isferie', is_ferie),
      ('isferie_demain', is_ferie_demain),
      ('isferie_hier', is_ferie_hier),
      ('isferie_apresdemain', is_ferie_apresdemain),
      ('isferie_avanthier', is_ferie_avanthier),
      ('isferie_apresapresdemain', is_ferie_apresapresdemain),
      ('isferie_avantavanthier', is_ferie_avantavanthier),
  )
  for colonne, constructeur in ferie_builders:
    X[colonne] = X.apply(constructeur, axis=1)

  if covid_:
    X['covid'] = X.apply(covid, axis=1)
  if mois_:
    X['mois'] = X.apply(mois, axis=1)
    X['day'] = X.apply(day, axis=1)

  # The confinement*, covid_changepoint and is_holidays features are
  # deliberately left out: they were found to possibly worsen the prediction.
  return X

In [None]:
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.linear_model import LinearRegression, Lasso

def PiecewiseLinearRegression(y, steps, window=365, n_changepoints=25, alpha=0.000005, verbose=False):
  """
  Fit a linear model with a piecewise-linear trend and predict `steps` steps ahead.

  The trend is a Lasso fit of the centred rolling mean of the series on a
  linear trend plus one ramp per candidate changepoint. The remainder
  (series minus trend) is then fitted by a linear regression on the
  features of create_features.

  Parameters :

  y : dataframe
    Preprocessed series (output of preprocess_data), with a `y` column.
  steps : int
    Number of future time steps to predict.
  window : int
    Size of the rolling-mean window used to model the trend.
  n_changepoints : int
    Number of candidate changepoints spread evenly over the smoothed series.
  alpha : float
    Lasso alpha, which selects the changepoints actually used.
  verbose : bool
    Plot the train prediction against the real values, together with the future prediction.

  Returns (train_pred, test_pred), two frames with a single `pred` column.
  """
  y = y['y']
  y_smooth = y.rolling(window=window, center=True, min_periods=100).mean().dropna()
  idx = pd.period_range(start=y.index.min(), periods=len(y) + steps, freq="d")
  test_idx = pd.period_range(start=y.index.max() + timedelta(days=1), periods=steps, freq="d")

  dp = DeterministicProcess(
      index=idx,
      constant=False,               # no constant column
      order=1,                     # linear trend
      seasonal=False,               # no seasonal indicators
  )
  X = dp.in_sample()

  # Candidate changepoints: evenly spaced, half a spacing away from both ends.
  # The step is kept at 1 or more so that a short series does not produce a zero slice step.
  n_smooth = len(y_smooth.index)
  pas = max(1, n_smooth//(n_changepoints))
  changepoints = y_smooth.index[n_smooth//(2*n_changepoints) : -n_smooth//(2*n_changepoints) : pas]
  for i, changepoint in enumerate(changepoints):
    colonne = 'changepoint_trend_' + str(i)
    # Ramp that is 0 before the changepoint and grows by 1 per step from it
    X[colonne] = 0
    n = len(pd.period_range(start=changepoint, end=idx.max(), freq='d'))
    X.loc[changepoint:, colonne] = np.arange(n)
  X = X / len(y.index)
  X['trend'] *= 1000

  # Trend model, trained on the smoothed part only
  model = Lasso(alpha=alpha)
  model.fit(X.loc[y_smooth.index], y_smooth)
  train_trend = model.predict(X.loc[y.index])
  test_trend = model.predict(X.loc[test_idx])
  train_trend = pd.DataFrame(data=train_trend, index=y.index, columns=['pred'])
  test_trend = pd.DataFrame(data=test_trend, index=test_idx, columns=['pred'])
  
  # Remainder model on the calendar features
  X_feats = create_features(idx)
  model = LinearRegression()
  model.fit(X_feats.loc[y.index], y - train_trend['pred'])
  train_pred = train_trend + pd.DataFrame(data=model.predict(X_feats.loc[y.index]), index=y.index, columns=['pred'])
  test_pred = test_trend + pd.DataFrame(data=model.predict(X_feats.loc[test_idx]), index=test_idx, columns=['pred'])
  
  if verbose:
    # The plot used to reference undefined names (df, df_test). The real values
    # are added to a copy so that the returned frame does not depend on `verbose`.
    ax = train_pred.assign(real=y).plot()
    test_pred.plot(ax=ax)
    plt.show()
  
  return train_pred, test_pred


def model_rl(y, y_train, y_val, dico_pred):
  """
  Linear-regression model with a piecewise-linear trend.
  No hyperparameter is tuned, so the returned best_params is an empty dict.

  The train/val predictions come from a fit on `y_train` over len(y_val)
  steps; the future prediction comes from a fit on `y` over
  dico_pred['steps'] steps.
  """
  horizon_futur = dico_pred['steps']
  horizon_val = len(y_val)

  train, val = PiecewiseLinearRegression(y_train, horizon_val)
  test = PiecewiseLinearRegression(y, horizon_futur)[1]

  return({}, train, val, test)

## Modèle Prophet

In [None]:
def model_prophet_opti(y, y_train, y_val, dico_pred): 
  """
  Grid-search a Prophet model, then predict on train, validation and the future.

  The grid is dico_params['Prophet'] (NOTE(review): a 'Prophet_opti' grid also
  exists -- confirm which one is intended here). The best parameters are used
  for a model fitted on `y_train` (train/val predictions) and for a model
  fitted on `y` (dico_pred['steps'] future steps).

  The indexes of `y`, `y_train` and `y_val` are switched to timestamps while
  the function runs and switched back to periods before it exits, including
  when an error is raised.

  Returns (best_params, train, val, test); the three frames have a `pred`
  column and a period index. Prediction intervals are not computed.
  """
  frames = (y, y_val, y_train)
  for frame in frames:
    frame.index = frame.index.to_timestamp()

  try:
    cv = ExpandingWindowSplitter(initial_window = 2*365, step_length = int((len(y)-(200))/5),fh = 200)
    param_grid = dico_params['Prophet']
    gscv = ForecastingGridSearchCV(Prophet(), cv=cv, param_grid=param_grid,verbose=True, scoring = MeanSquaredError(square_root=True), n_jobs = -1)

    try:
      gscv_train = gscv.fit(y, fh = y_train.index)
    except Exception:
      # Keep the hint, but let the real error surface instead of failing
      # just after on an unbound variable
      print("Il faut réduire horizon de prédiction")
      raise

    best_params = gscv_train.best_params_

    # Model fitted on train only: train and validation predictions
    prophet_mdl = Prophet().set_params(**best_params).fit(y_train, fh=y_val.index)
    val = prophet_mdl.predict(fh=y_val.index).rename(columns={'y' : 'pred'})
    train = prophet_mdl.predict(fh=y_train.index).rename(columns={'y' : 'pred'})

    # Model refitted on the whole series: future predictions
    prophet_final = Prophet().set_params(**best_params).fit(y)
    futur = pd.period_range(start=pd.to_datetime(y.index.max()) + timedelta(days=1), periods = dico_pred['steps'],freq='d')
    data_range = pd.DatetimeIndex(data = futur.to_timestamp())
    test = prophet_final.predict(pd.DatetimeIndex(data = data_range, freq="d"))
    test = test.rename(columns = {'y' : 'pred'})

    train.index = y_train.index.to_period()
    val.index = y_val.index.to_period()
    test.index = futur
  finally:
    # Give the caller's frames their period index back
    for frame in frames:
      frame.index = frame.index.to_period()

  return(best_params, train, val, test)

## Plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 

In [None]:
def plot_ts_df(train, val, test, name, figsize=(20,20), wdw_1 = 35, wdw_2 = 1):
  """
  Draw observed versus predicted absenteeism on two stacked axes.

  The top axis shows rolling means (window `wdw_1`) over train, validation
  and future; the bottom axis zooms on the first 182 days of validation
  (window `wdw_2`).

  train, val, test : dataframes with a period index. `train` and `val` need
    a `y` column (observed values); their other columns are plotted as predictions.
  name : string appended to the title of the top axis.
  figsize : size of the figure.
  wdw_1, wdw_2 : rolling-window sizes for the top and bottom axes.
  """
  
  # Columns whose name contains '_min_' or 'max_' are left out of the plots
  col_train = [x for x in train.columns if ('_min_' not in x) and ('max_' not in x)] 
  col_val = [x for x in val.columns if ('_min_' not in x) and ('max_' not in x)] 
  col_test = [x for x in test.columns if ('_min_' not in x) and ('max_' not in x)] 
  
  # Convert the period indexes to timestamps
  df_train = train[col_train]
  df_train.index = df_train.index.to_timestamp()
  
  df_val = val[col_val]
  df_val.index = df_val.index.to_timestamp()
  
  df_test = test[col_test]
  df_test.index = df_test.index.to_timestamp()
  
  list_colors = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", 'g', 'r', 'y']
  
  fig, ax = plt.subplots(nrows=2, ncols=1, figsize=figsize)
  ax[0].set_title('Taux absentéisme : observé vs prédit pour le client : ' + name,fontsize=30, pad=20)
  ax[0].set_ylabel('Taux absentéisme')
  
  # The colour cycle has as many colours as prediction columns in train, so
  # the cycle restarts for each group of prediction lines drawn below
  ax[0].set_prop_cycle('color',list_colors[:(df_train.drop(columns='y')).shape[1]])
  ax[1].set_prop_cycle('color',list_colors[:(df_train.drop(columns='y')).shape[1]])
      
  # Top axis: train, then future, then validation
  ax[0].plot(df_train.y.rolling(window=wdw_1).mean(), label='y', linewidth=3, linestyle=':', color='darkblue')
  ax[0].plot(df_train.drop(columns='y').rolling(window=wdw_1).mean(),label=list(df_train.drop(columns='y').columns))
  ax[0].plot(df_test.rolling(window=wdw_1).mean())
  # The legend is built here, before the validation lines are added
  ax[0].legend(loc='best')
  ax[0].plot(df_val.y.rolling(window=wdw_1).mean(), label='y', linewidth=3, linestyle=':', color='darkblue')
  ax[0].plot(df_val.drop(columns='y').rolling(window=wdw_1).mean(),
              label=list(df_val.drop(columns='y').columns))
  # Vertical separators at the start of validation and of the future, shifted by half the rolling window
  min_y_values = min([df_val.min().min(),df_train.min().min(),df_test.min().min()])
  max_y_values = max([df_val.max().max(),df_train.max().max(),df_test.max().max()])
  ax[0].vlines(df_val.index.min() + pd.Timedelta(wdw_1/2,'D'), min_y_values, max_y_values,color='darkblue')
  ax[0].vlines(df_test.index.min() + pd.Timedelta(wdw_1/2,'D'), min_y_values, max_y_values,color='darkblue')
  # Section labels placed at the mean date of each part
  ax[0].text(df_train.index.mean(),max_y_values,'Train',size=20,color='darkblue')
  ax[0].text(df_val.index.mean(),max_y_values,'Val',size=20,color='darkblue')
  ax[0].text(df_test.index.mean(),max_y_values,'Futur',size=20,color='darkblue')


  # Bottom axis: zoom on the first 182 days of validation
  ax[1].plot(df_val.y.rolling(window=wdw_2).mean(), label='y', linewidth=3, linestyle=':', color='darkblue')
  ax[1].plot(df_val.drop(columns='y').rolling(window=wdw_2).mean(),
              label=list(df_val.drop(columns='y').columns))
  ax[1].set_xlim([df_val.index.min(),df_val.index.min() + pd.Timedelta(182,'D')])
  ax[1].legend(loc='best')
  ax[1].set_title('Zoom sur le Val',fontsize=30, pad=20)

## Calcul métriques

In [None]:
# NOTE: compute_all_metrics is disabled. The whole definition below is wrapped in a
# string literal, so it is never executed and the name is not defined.
"""def compute_all_metrics(df_res_train, df_res_val,dico_metrics,dico_models, temporalite=['d','w','m']):
  dico_errors = {}
  for t in temporalite:
    list_models = list(dico_models.keys())      
    variables_train = ['pred_' + key for key in list_models] + ['y'] 
    variables_train = [x for x in variables_train if x not in ['y_train_pred_max_TBATS', 'y_train_pred_min_TBATS']]
    variables = ['pred_' + key for key in list_models] + ['y_min_' + key for key in list_models] + ['y_max_' + key for key in list_models] + ['y']
    variables_val = [x for x in variables if x not in ['y_min_blending', 'y_max_blending', 'y_min_SARIMAX', 'y_max_SARIMAX']]
    agg_var_train = {key:'mean' for key in variables_train}

    #variables_val.add('y')
    agg_var_val = {key:'mean' for key in variables_val}

    df_train = df_res_train.resample(t).agg(agg_var_train)
    df_val = df_res_val.resample(t).agg(agg_var_val)
    dico_errors[t] = get_metrics_final(df_train,df_val,dico_metrics,dico_models)
  return dico_errors"""

In [None]:
def max_error_normalized(row, model):
  """Return the absolute error of `model` on one row, divided by the observed value.

  `row` must expose the keys 'pred_<model>' (prediction) and 'y' (observation).
  The result is not guarded against a zero or negative 'y'.
  """
  predicted = row['pred_' + model]
  observed = row['y']
  gap = abs(predicted - observed)
  return gap / observed
    
def indicatrice(row, model):
  """Indicator of interval coverage for one row.

  Returns 1 when the observed 'y' lies inside [y_min_<model>, y_max_<model>]
  (bounds included), 0 otherwise.
  """
  # Upper bound is checked first; the lower bound is only read when it passes.
  if not row['y'] <= row['y_max_' + model]:
    return 0
  if not row['y'] >= row['y_min_' + model]:
    return 0
  return 1

In [None]:
def get_metrics_final(df_res_train, df_res_val,dico_metrics, list_models):
  """Build the table of train/validation metrics for every model.

  Parameters
  ----------
  df_res_train, df_res_val : DataFrames holding the observed column 'y' and one
      'pred_<model>' column per model (plus 'y_min_<model>' / 'y_max_<model>'
      in the validation frame for models that have an interval).
  dico_metrics : mapping metric name -> callable(y_true, y_pred).
  list_models : iterable of model names.

  Returns
  -------
  DataFrame with one column per model and one row per '<metric>_train',
  '<metric>_val' and 'IC_ind_val'.
  """
  splits = (('_train', df_res_train), ('_val', df_res_val))
  observed = {suffix: frame.y for suffix, frame in splits}
  report = {}

  for model in list_models:
    pred_col = 'pred_' + model
    predicted = {suffix: frame.loc[:, pred_col] for suffix, frame in splits}
    scores = {}

    for metric, metric_fun in dico_metrics.items():
      for suffix, frame in splits:
        if metric == 'max_error':
          # 'max_error' is replaced by the largest relative error over the rows
          # instead of calling the function registered in dico_metrics.
          scores[metric + suffix] = max(frame.apply(lambda row : max_error_normalized(row, model), axis=1))
        else:
          scores[metric + suffix] = metric_fun(observed[suffix], predicted[suffix])

    # Percentage of validation points inside the interval; TBATS and blending
    # are given 0 instead of being evaluated.
    if model in ['TBATS', 'blending']:
      scores['IC_ind_val'] = 0
    else:
      scores['IC_ind_val'] = sum(df_res_val.apply(lambda row : indicatrice(row, model), axis=1))*100/len(df_res_val)

    report[model] = scores

  return pd.DataFrame(report)

## Sélection du meilleur modèle

In [None]:
def rank_values(df):
  """Replace every metric value by its rank among the models of the same row.

  Rows are metrics, columns are models. Rank 0 goes to the smallest value,
  except for the rows 'IC_ind_train' / 'IC_ind_val' where rank 0 goes to the
  largest value. Tied values share the lowest rank of the tie.
  Returns a DataFrame with the same index and columns as `df`.
  """
  ranks_by_metric = {}
  for position, label in enumerate(df.index):
    values = list(df.iloc[position, :])
    ordered = list(np.sort(values))
    if label in ('IC_ind_train', 'IC_ind_val'):
      # Higher coverage is better, so the ordering is flipped for these rows.
      ordered.reverse()
    ranks_by_metric[label] = [ordered.index(value) for value in values]

  ranked = pd.DataFrame(ranks_by_metric).T
  ranked.columns = df.columns
  return ranked

In [None]:
def model_selection(df_metrique_val):
  """Return the name of the model with the best (lowest) average rank.

  The metrics table is first converted to ranks with `rank_values`; the score
  of a model is the equally weighted mean of its ranks on 'mae_val',
  'max_error_val' and 'IC_ind_val'. On ties the first column wins.
  """
  df_rank = rank_values(df_metrique_val)
  scores = []
  for col in range(df_rank.shape[1]):
    labels = list(df_rank.index)
    mae_rank = df_rank.iloc[labels.index('mae_val'), col]
    max_err_rank = df_rank.iloc[labels.index('max_error_val'), col]
    ic_rank = df_rank.iloc[labels.index('IC_ind_val'), col]
    scores.append(1/3*mae_rank + 1/3*max_err_rank + 1/3*ic_rank)

  winner = scores.index(min(scores))
  return list(df_rank.columns)[winner]

In [None]:
def best_model(df_res_train, df_res_val, df_res_test, model, client):
  """Extract the columns of one model from the three result frames and plot them.

  A column is kept when `model` is a substring of its name; 'y' is appended for
  the train and validation frames only. The three sub-frames are passed to
  `plot_ts_df` together with `client`, then returned as (train, val, test).
  """
  def _columns_of(frame):
    # Substring match on the column name, as in the other result helpers.
    return [name for name in frame.columns if model in name]

  train = df_res_train[_columns_of(df_res_train) + ['y']]
  val = df_res_val[_columns_of(df_res_val) + ['y']]
  test = df_res_test[_columns_of(df_res_test)]

  plot_ts_df(train, val, test,client)
  return(train, val, test)

## Transformation df mensuel/annuel

In [None]:
def construct_df_tempo(timeseries, df_, tempo='m'):
  """Aggregate a daily result frame to weekly ('w'), monthly ('m') or yearly ('y') level.

  `timeseries` is the initial series with the columns DATE, NB_ACTIFS,
  NB_INACTIFS and TX_ABS; `df_` is a result frame indexed by periods, holding
  'y' and the prediction columns. Every rate is weighted by NB_ACTIFS, summed
  per period and divided back by the summed NB_ACTIFS; 'y' is recomputed as
  NB_INACTIFS / NB_ACTIFS. Any other `tempo` leaves DATE untouched before
  grouping. Returns the aggregated frame indexed by the formatted DATE.
  """
  raw = timeseries.copy()
  results = df_.copy()

  # Keep only the rows of the raw series whose date appears in the result frame
  raw.index = raw.DATE
  wanted_dates = results.index.to_timestamp().to_list()
  matched = raw.loc[raw.index.isin(wanted_dates)]
  results.index = matched.index
  merged = pd.concat([matched, results], axis=1).reset_index(drop=True)

  # Rates -> head counts, so that they can be summed over a period
  rate_columns = [name for name in merged.columns if name not in ['NB_ACTIFS', 'DATE', 'NB_INACTIFS']]
  for name in rate_columns:
    merged[name] = merged[name]*merged['NB_ACTIFS']

  if tempo == "m":
    date_format = '%m-%Y'
  elif tempo == "y":
    date_format = '%Y'
  elif tempo == "w":
    date_format = '%Y-%W'
  else:
    date_format = None
  if date_format is not None:
    merged['DATE'] = pd.to_datetime(merged['DATE']).dt.strftime(date_format)

  grouped = merged.drop(columns=['TX_ABS', 'y']).groupby(['DATE']).sum()

  # Head counts -> rates again, now at the aggregated level
  aggregated_columns = [name for name in grouped.columns if name not in ['DATE', 'NB_ACTIFS', 'NB_INACTIFS']]
  for name in aggregated_columns:
    grouped[name] = grouped[name] / grouped['NB_ACTIFS']

  grouped['y'] = grouped['NB_INACTIFS'] / grouped['NB_ACTIFS']
  grouped = grouped.drop(columns=['NB_INACTIFS', 'NB_ACTIFS'])

  return(grouped)

## Assembling

In [None]:
def n_best_model(df, n):
  """Return the names of the `n` best models, best first.

  `df` is a table of ranks (rows = metrics, columns = models). The score of a
  model is the equally weighted mean of its values on the rows 'mae_val',
  'max_error_val' and 'IC_ind_val'; the lowest score is the best.

  Models with equal scores are all kept, in column order: each model appears
  at most once in the result. If `n` exceeds the number of models, every model
  is returned.

  Raises ValueError when one of the three rows is missing from `df.index`.
  """
  labels = list(df.index)
  row_mae = labels.index('mae_val')
  row_max = labels.index('max_error_val')
  row_ic = labels.index('IC_ind_val')

  classement = [
    1/3*df.iloc[row_mae, x] + 1/3*df.iloc[row_max, x] + 1/3*df.iloc[row_ic, x]
    for x in range(df.shape[1])
  ]

  # Sort column positions rather than looking scores up with list.index():
  # the lookup always returns the first tied model and would list it twice.
  # sorted() is stable, so tied models keep their column order.
  ordre = sorted(range(len(classement)), key=classement.__getitem__)
  colonnes = list(df.columns)
  return [colonnes[i] for i in ordre[:max(n, 0)]]

In [None]:
def assembling_models(timeseries, train, val, test, dico_pred, dico_metrics, dico_models):
  """Blend the individual model predictions with a linear regression.

  The models are ranked on their monthly metrics; for k = 1..len(dico_models)
  a LinearRegression is fitted on the train period using the features returned
  by `create_features` plus the 'pred_<model>' columns of the k best models,
  and scored (MAE) on the validation period. The best k is then refitted on
  train + validation and used to predict the test period.

  Parameters
  ----------
  timeseries : raw series, passed to `preprocess_data` and `construct_df_tempo`.
  train, val, test : frames of per-model predictions ('pred_<model>' columns).
  dico_pred : prediction settings; only 'val_size' is read here.
  dico_metrics : mapping metric name -> callable, forwarded to `get_metrics_final`.
  dico_models : mapping whose keys are the model names to consider.

  Returns
  -------
  (train_, val_, test_) : three one-column frames named 'pred_blending'.
  """
  ts,_,_ = preprocess_data(timeseries)
  y = ts[['y']]
  y_train, y_val = temporal_train_test_split(y, test_size = dico_pred['val_size'])
  X_idx = pd.period_range(start=train.index.min(),end=test.index.max(), freq="d")
  test_idx = pd.period_range(start=test.index.min(),end=test.index.max(), freq="d")

  # Base features only; prediction columns are appended per candidate blend
  X = create_features(X_idx)
  X_train_base = X[train.index.min(): train.index.max()]
  X_val_base = X[val.index.min():val.index.max()]
  X_trainval = X[train.index.min(): val.index.max()]
  X_test = X[test.index.min(): test.index.max()]

  # Rank the models on their monthly metrics
  df_train_mens = construct_df_tempo(timeseries, train.copy(), 'm')
  df_val_mens = construct_df_tempo(timeseries, val.copy(), 'm')
  metrics_mens = get_metrics_final(df_train_mens,df_val_mens,dico_metrics,list(dico_models.keys()))
  df = rank_values(metrics_mens)
  mae=[]

  for n_models in range(1,len(dico_models)+1):
    list_models = n_best_model(df, n_models)
    col = ['pred_' + x for x in list_models]
    # Rebuild the design matrices from the base features on every pass, so the
    # prediction columns of the previous, smaller blends do not pile up.
    X_train = pd.concat([X_train_base, train[col]], axis=1)
    X_val = pd.concat([X_val_base, val[col]], axis=1)

    # Evaluation on the validation period
    model = LinearRegression(fit_intercept=True)
    model.fit(X_train, y_train['y']) # y_train is a DataFrame
    y_val_pred = pd.DataFrame({"pred_blending": model.predict(X_val)})
    y_val_pred.index = y_val.index
    y_train_pred =  pd.DataFrame({"pred_blending": model.predict(X_train)})
    y_train_pred.index = y_train.index
    error = mean_absolute_error(y_val['y'], y_val_pred)
    mae.append(error)
    # Keep the blend with the lowest validation MAE (on ties, the larger blend)
    if error == min(mae):
      val_ = y_val_pred
      train_ = y_train_pred
      nb_models = n_models

  # Refit the selected blend on train + validation and predict the test period
  list_models = n_best_model(df, nb_models)
  col = ['pred_' + x for x in list_models]
  X_trainval = pd.concat([X_trainval, pd.concat([train[col], val[col]], axis=0)], axis=1)
  X_test = pd.concat([X_test, test[col]], axis=1)

  model = LinearRegression(fit_intercept=True)
  model.fit(X_trainval, y['y'])

  test_ = pd.DataFrame({"pred_blending": model.predict(X_test)})
  test_.index = test_idx

  return(train_, val_, test_)

## Transformation df pour PowerBI

In [None]:
def transform_final_2(df):
  """Reshape the wide result frame into a long one.

  For each of the columns 'reel', 'y_min', 'y_max' and 'pred' (in that order)
  the rows where the column is not missing are kept, tagged with the column
  name in 'type' and their value copied into 'y'. The four source columns are
  dropped and the four pieces stacked; the original index labels are kept.
  """
  source_columns = ['reel', 'pred','y_min','y_max']
  pieces = []
  for kind in ('reel', 'y_min', 'y_max', 'pred'):
    piece = df[df[kind].notna()].copy()
    piece['type'] = kind
    piece['y'] = piece[kind]
    pieces.append(piece.drop(columns=source_columns))
  return(pd.concat(pieces))

# Fonction boucle finale

In [None]:
def df_models(timeseries, dico_pred, dico_models):
  """Fit every model of `dico_models` and gather their predictions.

  Parameters
  ----------
  timeseries : raw series, passed to `preprocess_data` (and to `model_sarimax`).
  dico_pred : prediction settings; 'val_size' (size of the validation split)
      and 'steps' (number of future days to predict) are read here.
  dico_models : mapping model name -> forecaster. The hyper-parameter grid of
      each model is read from the module-level `dico_params`.

  Returns
  -------
  (dict_bp, df_train, df_val, df_test) : the best parameters per model and three
  frames (train, validation, future) holding the concatenated predictions of
  all models, suffixed with the model name, with the 'y_min' / 'y_max' interval
  bounds on validation and future.
  """
  ts,_,_ = preprocess_data(timeseries)
  y = ts[['y']]
  y_train, y_val = temporal_train_test_split(y, test_size = dico_pred['val_size'])
  # Forecasting horizons
  fh_train = ForecastingHorizon(y_train.index, is_relative=False)
  fh_val = ForecastingHorizon(y_val.index, is_relative=False)
  deb_date_test = y_val.index[-1] + timedelta(days=1)
  fh_test = ForecastingHorizon(pd.period_range(start=deb_date_test, periods=dico_pred['steps'], freq="d"), is_relative=False)

  df_train = y_train.copy()
  df_val = y_val.copy()
  df_test = pd.DataFrame({'DATE' : pd.period_range(start=deb_date_test, periods=dico_pred['steps'], freq="d")})
  df_test.index = df_test.DATE

  dict_bp = {}
  # Loop over all the models
  for model_name in dico_models.keys() :

    param = dico_params[model_name]
    model = dico_models[model_name]

    if model_name == 'Prophet':
      best_params, train, val, test = model_prophet_opti(y, y_train, y_val, dico_pred)

    elif model_name == 'RL':
      best_params, train, val, test = model_rl(y, y_train, y_val, dico_pred)

    elif model_name =='SARIMAX':
      # Use the same validation size as the split above, so that the SARIMAX
      # output lines up with y_train / y_val.
      best_params, train, val, test = model_sarimax(timeseries, dico_pred['val_size'], dico_pred['steps'])
      # Re-index on the daily indexes and fill the gaps (backward, then forward)
      train = pd.DataFrame(index=y_train.index).merge(train, how='left', right_index=True, left_index=True)
      train = train.bfill().ffill()
      val = pd.DataFrame(index=y_val.index).merge(val, how='left', right_index=True, left_index=True)
      val = val.bfill().ffill()
      test = pd.DataFrame(index=df_test.index).merge(test, how='left', right_index=True, left_index=True)
      test = test.bfill().ffill()

    else:
      cv = ExpandingWindowSplitter(initial_window = 750,step_length = int((len(y)-(840))/3), fh = 90)
      try:
        gscv = ForecastingGridSearchCV(model, cv=cv, param_grid=param, verbose=True, scoring = MeanSquaredError(square_root=True), n_jobs=-1)
        best_params = gscv.fit(y).best_params_
      except Exception as err:
        # Same search without n_jobs when the parallel run fails
        print(f"Grid search with n_jobs=-1 failed for {model_name} ({err!r}); retrying without n_jobs")
        gscv = ForecastingGridSearchCV(model, cv=cv, param_grid=param, verbose=True, scoring = MeanSquaredError(square_root=True))
        best_params = gscv.fit(y).best_params_

      # Fit on the train split, then get the fitted values and the validation forecast
      forecaster = model
      train_model = forecaster.set_params(**best_params).fit(y_train)
      train = train_model.predict(fh_train).rename(columns={'y' : 'pred'})
      val = train_model.predict(fh_val).rename(columns={'y' : 'pred'})

      # Fit on the whole series and get the future forecast
      forecaster = model
      final_model = forecaster.set_params(**best_params).fit(y)
      test = final_model.predict(fh_test).rename(columns={'y' : 'pred'})

    # Interval construction: sigma_t = std*(1+alpha*sqrt(t - t_final)).
    # Caution: when the base predictions are poor on the validation split the
    # interval can get very wide (notably when validation starts on January 1st).
    coverage = 0.90 # size of the interval
    std = mean_squared_error(y_val['y'], val['pred'], square_root=True)
    val['time'] = np.arange(len(val))
    test['time'] = np.arange(len(test))
    # The larger alpha, the wider the interval. 'time' is 0 on the first
    # validation row, so the ratio is infinite/NaN there before the quantile.
    alpha = ((((np.abs(y_val['y'] - val['pred'])/std) - 1)) / (np.sqrt(val['time']))).quantile(coverage)

    def pred_min(row):
      return row['pred'] - std*(1+alpha*np.sqrt(row['time']))
    def pred_max(row):
      return row['pred'] + std*(1+alpha*np.sqrt(row['time']))

    test['y_min'] = test.apply(pred_min, axis=1)
    test['y_max'] = test.apply(pred_max, axis=1)
    val['y_min'] = val.apply(pred_min, axis=1)
    val['y_max'] = val.apply(pred_max, axis=1)

    val = val.drop('time', axis=1)
    test = test.drop('time', axis=1)

    # Suffix every column with the model name before concatenating
    for x in [train, val, test]:
      x.columns = [str(col) + '_' + model_name for col in x.columns]

    dict_bp[model_name] = best_params

    df_train = pd.concat([train, df_train], axis=1)
    df_val = pd.concat([val, df_val], axis=1)
    df_test = pd.concat([test, df_test], axis=1)

  df_test = df_test.drop(columns=['DATE'])

  return (dict_bp, df_train, df_val, df_test)

In [None]:
"""      try : 
        cv = ExpandingWindowSplitter(initial_window = 750,step_length = int((len(y_train)-870)/1), fh = 120)
        gscv = ForecastingGridSearchCV(model, cv=cv, param_grid=param, verbose=True, scoring = MeanSquaredError(square_root=True), n_jobs=-1)
        gscv_train = gscv.fit(y_train)
        best_params = gscv_train.best_params_
        pass
      except : 
        try : 
          cv = ExpandingWindowSplitter(initial_window = 750,step_length = int((len(y)-(870))/4), fh = 120)
          gscv = ForecastingGridSearchCV(model, cv=cv, param_grid=param, verbose=True, scoring = MeanSquaredError(square_root=True), n_jobs=-1)
          gscv_train = gscv.fit(y)
          best_params = gscv_train.best_params_
          pass
        except : 
          try : 
            cv = ExpandingWindowSplitter(initial_window = 750,step_length =50000, fh = 120)
            gscv = ForecastingGridSearchCV(model, cv=cv, param_grid=param, verbose=True, scoring = MeanSquaredError(square_root=True), n_jobs=-1)
            gscv_train = gscv.fit(y, fh=fh_train)
            best_params = gscv_train.best_params_
            pass
          except:
            best_params = {}
            print('best_params vides')
            pass"""