# TIME SERIES APPROCHE MACHINE LEARNING

## Objets perdus uniquement (pas de features extérieures)

In [506]:
import pandas as pd
import  numpy as np

In [507]:
first_data = pd.read_csv('../data_cleaned.csv')

In [508]:
first_data.drop(['restitution_date','nom', 'gare', 'type', 'nature'], axis=1, inplace=True)

In [509]:
first_data = first_data.loc[first_data["year"]<=2022]

In [510]:
data_grouped = first_data.groupby(['date', 'year', 'month', 'week', 'day', 'day_of_week']).sum().reset_index()

In [511]:
# Obtention des objets perdus des dates antérieures

In [512]:
import datetime

In [513]:
data_grouped["date"] = pd.to_datetime(data_grouped["date"])

In [514]:
def create_t_x(data):
    last_data = data.copy()
    for i in range(1,4):
        data_t = data.copy()
        data_t["date"] = data_t['date'].apply(lambda x: x + datetime.timedelta(days=i)) 
        data_t.drop(["year", "month", "day","week", "day_of_week"], axis=1, inplace=True)
        data_t.rename(columns={"number": f"T-{i}"}, inplace=True)
        last_data = last_data.merge(data_t, how='left', on='date')
    return last_data

In [515]:
# Jointure

In [516]:
last_df = create_t_x(data_grouped)

In [517]:
last_df.head(5)

Unnamed: 0,date,year,month,week,day,day_of_week,number,T-1,T-2,T-3
0,2016-01-01,2016,1,53,1,Vendredi,1,,,
1,2016-01-02,2016,1,53,2,Samedi,8,1.0,,
2,2016-01-03,2016,1,53,3,Dimanche,8,8.0,1.0,
3,2016-01-04,2016,1,1,4,Lundi,12,8.0,8.0,1.0
4,2016-01-05,2016,1,1,5,Mardi,4,12.0,8.0,8.0


In [518]:
# Splitting the datas

In [519]:
from sklearn.model_selection import TimeSeriesSplit

In [520]:
splits = TimeSeriesSplit(n_splits=5, gap=3)

In [521]:
# Model XGBOOST

In [522]:
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [523]:
int_columns = ["year", "T-1", "T-2", "T-3"]
categoricals_columns = ["month", "week", "day_of_week"]

int_transformer = MinMaxScaler()
categoricals_transformers = OneHotEncoder(handle_unknown='ignore')

transformers = make_column_transformer((int_transformer, int_columns), (categoricals_transformers, categoricals_columns))

In [524]:
# Evaluation of the model

In [525]:
def evaluate(model, X, y, cv):
    pipeline = make_pipeline(transformers, model)
    cv_results = cross_validate(
        pipeline,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        f"Mean Absolute Error:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )
    return mae.mean(), rmse.mean()

In [526]:
model = XGBRegressor(
    n_estimators = 10000,
    max_depth=16,
    max_leaves=0,
    learning_rate=0.0001,
    random_state=1
)
mae, rmse = evaluate(model, last_df.drop(["number"], axis=1), last_df["number"], cv=splits)

KeyboardInterrupt: 

In [None]:
# Sauvegarde du modèle

In [None]:
import pickle

In [None]:
with open('xgboost_score.txt', 'r') as f:
    value = f.readline()
    liste_score = [float(x) for x in value.split(', ')]
    if mae < liste_score[0] and rmse < liste_score[1]:
        print('sauvegarde')
        with open('xgboost_score.txt', 'w') as f2:
            f2.write(f'{mae}, {rmse}')
        with open('xgboost.pkl', "wb") as m:
            pickle.dump(model, m)

In [None]:
# Prediction

In [527]:
def prepare_data(init, data):
    if init:
        data = pd.read_csv('../data_cleaned.csv')
        data = data.loc[data["year"]<=2022]
        data.drop(["gare", "restitution_date", "nature", "type", "nom"], axis=1, inplace=True)
        data["date"] = pd.to_datetime(data["date"])
        min_date = data["date"].max() + datetime.timedelta(days=1)
        max_date = data["date"].max() + datetime.timedelta(days=7)
        
        new_df = pd.DataFrame()
        new_df["date"] = pd.date_range(min_date, max_date)
        new_df["year"] = new_df["date"].dt.year
        new_df["month"] = new_df["date"].dt.month
        new_df["week"] = new_df["date"].dt.week
        new_df["day"] = new_df["date"].dt.day
        new_df["number"] = np.nan
        new_df["day_of_week"] = new_df["date"].dt.day_of_week.map({0:"Lundi", 1: "Mardi", 2: "Mercredi",
                                                                    3: "Jeudi", 4: "Vendredi",
                                                                    5: "Samedi", 6: "Dimanche"})
        data = pd.concat([data,new_df], ignore_index=True)
        data = data.groupby(["date", "year", "month", "week", "day_of_week", "day"]).sum().reset_index()
    else:
        data.drop(["T-1", "T-2", "T-3"], axis=1, inplace=True)
    initial_data = create_t_x(data)
    return initial_data

In [531]:
def make_a_prediction(date):
    with open('xgboost.pkl', "rb") as f:
        model=pickle.load(f)
    
    for i in range(7):
        if i==0: 
            data = prepare_data(init=True, data=None)
        else:
            data = prepare_data(data)
    # for i in range(len(new_df))
    
    return data

In [532]:
data = make_a_prediction("")

  new_df["week"] = new_df["date"].dt.week


In [533]:
data

Unnamed: 0,date,year,month,week,day_of_week,day,number,T-1,T-2,T-3
0,2016-01-01,2016,1,53,Vendredi,1,1.0,,,
1,2016-01-02,2016,1,53,Samedi,2,8.0,1.0,,
2,2016-01-03,2016,1,53,Dimanche,3,8.0,8.0,1.0,
3,2016-01-04,2016,1,1,Lundi,4,12.0,8.0,8.0,1.0
4,2016-01-05,2016,1,1,Mardi,5,4.0,12.0,8.0,8.0
...,...,...,...,...,...,...,...,...,...,...
2449,2023-01-03,2023,1,1,Mardi,3,0.0,0.0,0.0,5.0
2450,2023-01-04,2023,1,1,Mercredi,4,0.0,0.0,0.0,0.0
2451,2023-01-05,2023,1,1,Jeudi,5,0.0,0.0,0.0,0.0
2452,2023-01-06,2023,1,1,Vendredi,6,0.0,0.0,0.0,0.0
