# TIME SERIES APPROCHE MACHINE LEARNING

## Objets perdus uniquement (pas de features extérieures)

### Daily model

In [68]:
import pandas as pd
import  numpy as np

In [69]:
first_data = pd.read_csv('../data_cleaned.csv')

In [70]:
first_data.drop(['restitution_date','nom', 'gare', 'type', 'nature'], axis=1, inplace=True)

In [71]:
first_data = first_data.loc[first_data["year"]<=2022]

In [72]:
data_grouped = first_data.groupby(['date', 'year', 'month', 'week', 'day', 'day_of_week']).sum().reset_index()

In [73]:
# Obtention des objets perdus des dates antérieures

In [74]:
import datetime

In [75]:
data_grouped["date"] = pd.to_datetime(data_grouped["date"])

In [76]:
def create_t_x(data):
    last_data = data.copy()
    for i in range(1,4):
        data_t = data.copy()
        data_t["date"] = data_t['date'].apply(lambda x: x + datetime.timedelta(days=i)) 
        data_t.drop(["year", "month", "day","week", "day_of_week"], axis=1, inplace=True)
        data_t.rename(columns={"number": f"T-{i}"}, inplace=True)
        last_data = last_data.merge(data_t, how='left', on='date')
    return last_data

In [77]:
# Jointure

In [78]:
last_df = create_t_x(data_grouped)

In [79]:
last_df.head(5)

Unnamed: 0,date,year,month,week,day,day_of_week,number,T-1,T-2,T-3
0,2016-01-01,2016,1,53,1,Vendredi,1,,,
1,2016-01-02,2016,1,53,2,Samedi,8,1.0,,
2,2016-01-03,2016,1,53,3,Dimanche,8,8.0,1.0,
3,2016-01-04,2016,1,1,4,Lundi,12,8.0,8.0,1.0
4,2016-01-05,2016,1,1,5,Mardi,4,12.0,8.0,8.0


In [80]:
# Splitting the datas

In [81]:
from sklearn.model_selection import TimeSeriesSplit

In [82]:
splits = TimeSeriesSplit(n_splits=5, gap=3)

In [83]:
# Model XGBOOST

In [84]:
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [85]:
int_columns = ["year", "T-1", "T-2", "T-3"]
categoricals_columns = ["month", "week", "day_of_week"]

int_transformer = MinMaxScaler()
categoricals_transformers = OneHotEncoder(handle_unknown='ignore')

transformers = make_column_transformer((int_transformer, int_columns), (categoricals_transformers, categoricals_columns))

In [86]:
# Evaluation of the model

In [87]:
def evaluate(model, X, y, cv):
    pipeline = make_pipeline(transformers, model)
    cv_results = cross_validate(
        pipeline,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        f"Mean Absolute Error:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )
    return mae.mean(), rmse.mean()

In [88]:
model = XGBRegressor(
    n_estimators = 10000,
    max_depth=16,
    max_leaves=0,
    learning_rate=0.0001,
    random_state=1
)
mae, rmse = evaluate(model, last_df.drop(["number"], axis=1), last_df["number"], cv=splits)

Mean Absolute Error:     4.331 +/- 1.169
Root Mean Squared Error: 5.622 +/- 1.400


In [89]:
# Fit du modèle

In [90]:
pipeline = make_pipeline(transformers, model)
pipeline.fit(last_df.drop(["number"], axis=1), last_df["number"])

In [91]:
# Sauvegarde du modèle

In [92]:
import pickle

In [94]:
with open('xgboost_daily_score.txt', 'r') as f:
    value = f.readline()
    liste_score = [float(x) for x in value.split(', ')]
    if mae < liste_score[0] and rmse < liste_score[1]:
        print('sauvegarde')
        with open('xgboost_daily_score.txt', 'w') as f2:
            f2.write(f'{mae}, {rmse}')
        with open('xgboost_daily.pkl', "wb") as m:
            pickle.dump(model, m)

In [95]:
# Prediction

In [96]:
def prepare_data(init, data, number_of_days):
    if init:
        data = pd.read_csv('../data_cleaned.csv')
        data = data.loc[data["year"]<=2022]
        data.drop(["gare", "restitution_date", "nature", "type", "nom"], axis=1, inplace=True)
        data["date"] = pd.to_datetime(data["date"])
        min_date = data["date"].max() + datetime.timedelta(days=1)
        max_date = data["date"].max() + datetime.timedelta(days=number_of_days)
        
        new_df = pd.DataFrame()
        new_df["date"] = pd.date_range(min_date, max_date)
        new_df["year"] = new_df["date"].dt.year
        new_df["month"] = new_df["date"].dt.month
        new_df["week"] = new_df["date"].dt.isocalendar().week
        new_df["day"] = new_df["date"].dt.day
        new_df["number"] = np.nan
        new_df["day_of_week"] = new_df["date"].dt.day_of_week.map({0:"Lundi", 1: "Mardi", 2: "Mercredi",
                                                                    3: "Jeudi", 4: "Vendredi",
                                                                    5: "Samedi", 6: "Dimanche"})
        data = pd.concat([data,new_df], ignore_index=True)
        data = data.groupby(["date", "year", "month", "week", "day_of_week", "day"]).sum().reset_index()
    else:
        data.drop(["T-1", "T-2", "T-3"], axis=1, inplace=True)
    initial_data = create_t_x(data)
    return initial_data

In [97]:
import math

In [98]:
def make_a_prediction(number_of_days):
    with open('xgboost.pkl', "rb") as f:
        model=pickle.load(f)
    
    for i in range(number_of_days):
        if i==0: 
            data = prepare_data(init=True, data=None, number_of_days=number_of_days)
        else:
            data = prepare_data(init=False, data=data, number_of_days=number_of_days)
        number = math.floor(model.predict(data.iloc[[-number_of_days+i]]))
        data["number"].iloc[[-number_of_days+i]] = number
    
    return data

In [100]:
data = make_a_prediction(7)

  new_df["week"] = new_df["date"].dt.week
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["number"].iloc[[-number_of_days+i]] = number
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["number"].iloc[[-number_of_days+i]] = number
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["number"].iloc[[-number_of_days+i]] = number
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

In [101]:
data

Unnamed: 0,date,year,month,week,day_of_week,day,number,T-1,T-2,T-3
0,2016-01-01,2016,1,53,Vendredi,1,1.0,,,
1,2016-01-02,2016,1,53,Samedi,2,8.0,1.0,,
2,2016-01-03,2016,1,53,Dimanche,3,8.0,8.0,1.0,
3,2016-01-04,2016,1,1,Lundi,4,12.0,8.0,8.0,1.0
4,2016-01-05,2016,1,1,Mardi,5,4.0,12.0,8.0,8.0
...,...,...,...,...,...,...,...,...,...,...
2449,2023-01-03,2023,1,1,Mardi,3,3.0,7.0,0.0,5.0
2450,2023-01-04,2023,1,1,Mercredi,4,3.0,3.0,7.0,0.0
2451,2023-01-05,2023,1,1,Jeudi,5,3.0,3.0,3.0,7.0
2452,2023-01-06,2023,1,1,Vendredi,6,3.0,3.0,3.0,3.0


### Weekly model

In [137]:
week_data = pd.read_csv('../data_cleaned.csv')

In [138]:
week_data.drop(['restitution_date','nom', 'gare', 'type', 'nature', 'day', 'month'], axis=1, inplace=True)
week_data = week_data.loc[week_data["year"]<=2022]

In [139]:
week_data = week_data.groupby(["year", "week"]).sum("number").reset_index()

In [140]:
week_data

Unnamed: 0,year,week,number
0,2016,1,60
1,2016,2,76
2,2016,3,77
3,2016,4,80
4,2016,5,74
...,...,...,...
355,2022,48,57
356,2022,49,65
357,2022,50,45
358,2022,51,45


In [141]:
def create_week_t_x(data):
    last_data = data.copy()
    for i in range(1,4):
        data_copy = data.copy()
        data_copy["week"] = data_copy["week"].apply(lambda x: x+i if x+i<=52 else 0+(x+i))
        data_copy.rename(columns={"number": f"T-{i}"}, inplace=True)
        last_data = last_data.merge(data_copy, how='left', on=['year', 'week'])
    return last_data

In [142]:
week_data = create_week_t_x(week_data)

In [143]:
week_data

Unnamed: 0,year,week,number,T-1,T-2,T-3
0,2016,1,60,,,
1,2016,2,76,60.0,,
2,2016,3,77,76.0,60.0,
3,2016,4,80,77.0,76.0,60.0
4,2016,5,74,80.0,77.0,76.0
...,...,...,...,...,...,...
355,2022,48,57,62.0,52.0,53.0
356,2022,49,65,57.0,62.0,52.0
357,2022,50,45,65.0,57.0,62.0
358,2022,51,45,45.0,65.0,57.0


In [144]:
int_columns = ["year", "T-1", "T-2", "T-3"]
categoricals_columns = ["week"]

int_transformer = MinMaxScaler()
categoricals_transformers = OneHotEncoder(handle_unknown='ignore')

transformers = make_column_transformer((int_transformer, int_columns), (categoricals_transformers, categoricals_columns))

In [145]:
mae, rmse = evaluate(model, week_data.drop("number", axis=1), week_data["number"], cv=splits)

Mean Absolute Error:     22.600 +/- 9.468
Root Mean Squared Error: 26.068 +/- 9.797


In [146]:
pipeline = make_pipeline(transformers, model)
pipeline.fit(week_data.drop("number", axis=1), week_data["number"])

In [153]:
with open('xgboost_weekly_score.txt', 'r') as f:
    value = f.readline()
    liste_score = [float(x) for x in value.split(', ')]
    if mae < liste_score[0] and rmse < liste_score[1]:
        print('sauvegarde')
        with open('xgboost_weekly_score.txt', 'w') as f2:
            f2.write(f'{mae}, {rmse}')
        with open('xgboost_weekly.pkl', "wb") as m:
            pickle.dump(pipeline, m)

### Monthly model

In [185]:
monthly_data = pd.read_csv('../data_cleaned.csv')
monthly_data.drop(['restitution_date','nom', 'gare', 'type', 'nature', 'day','day_of_week', 'week'], axis=1, inplace=True)
monthly_data = monthly_data.loc[monthly_data["year"]<=2022]

In [186]:
monthly_data = monthly_data.groupby(["year", "month"]).sum("number").reset_index()

In [187]:
monthly_data

Unnamed: 0,year,month,number
0,2016,1,310
1,2016,2,291
2,2016,3,321
3,2016,4,342
4,2016,5,321
...,...,...,...
78,2022,8,195
79,2022,9,251
80,2022,10,267
81,2022,11,228


In [188]:
def create_month_t_x(data):
    last_data = data.copy()
    for i in range(1,4):
        data_copy = data.copy()
        data_copy["month"] = data_copy["month"].apply(lambda x: x+i)
        data_copy["year"].loc[data_copy["month"]>12] += 1
        data_copy["month"] = data_copy["month"].apply(lambda x: x-12 if x>12 else x)
        data_copy.rename(columns={"number": f"T-{i}"}, inplace=True)
        last_data = last_data.merge(data_copy, how='left', on=['year', 'month'])
    return last_data

In [189]:
monthly_data = create_month_t_x(monthly_data)

In [190]:
monthly_data

Unnamed: 0,year,month,number,T-1,T-2,T-3
0,2016,1,310,,,
1,2016,2,291,310.0,,
2,2016,3,321,291.0,310.0,
3,2016,4,342,321.0,291.0,310.0
4,2016,5,321,342.0,321.0,291.0
...,...,...,...,...,...,...
78,2022,8,195,240.0,255.0,251.0
79,2022,9,251,195.0,240.0,255.0
80,2022,10,267,251.0,195.0,240.0
81,2022,11,228,267.0,251.0,195.0


In [191]:
int_columns = ["year", "T-1", "T-2", "T-3"]
categoricals_columns = ["month"]

int_transformer = MinMaxScaler()
categoricals_transformers = OneHotEncoder(handle_unknown='ignore')

transformers = make_column_transformer((int_transformer, int_columns), (categoricals_transformers, categoricals_columns))

In [192]:
mae, rmse = evaluate(model, monthly_data.drop("number", axis=1), monthly_data["number"], cv=splits)

Mean Absolute Error:     105.375 +/- 46.558
Root Mean Squared Error: 117.513 +/- 41.383


In [194]:
pipeline = make_pipeline(transformers, model)
pipeline.fit(monthly_data.drop("number", axis=1), monthly_data["number"])

In [None]:
with open('xgboost_monthly_score.txt', 'r') as f:
    value = f.readline()
    liste_score = [float(x) for x in value.split(', ')]
    if mae < liste_score[0] and rmse < liste_score[1]:
        print('sauvegarde')
        with open('xgboost_monthly_score.txt', 'w') as f2:
            f2.write(f'{mae}, {rmse}')
        with open('xgboost_monthly.pkl', "wb") as m:
            pickle.dump(pipeline, m)