# In [6]:
import pandas as pd
import pickle
import yaml

# Locations of the retraining config, the featurized dataset, the pickled
# model, and the pickle that records the date of the last retraining.
config_path = 'config/retrain_config.yml'
data_path = 'data/featurized_dataset.csv'
model_path = 'model/lstm_model.pkl'
last_retrained_date_path = 'savedata/last_retrained_date.pkl'


# Parse the YAML config; the file only needs to stay open while it is read.
with open(config_path) as config_file:
    config_contents = yaml.safe_load(config_file)

# Minimum amount of new data (in days) that justifies retraining the model.
retraining_window_days = config_contents['retraining_window_days']
# Name of the date column used as the index of the featurized dataset.
date_index_column_name = config_contents['date_index_column_name']
# Name of the target column in the featurized dataset.
target_column_name = config_contents['target_column_name']

# Load the featurized dataset with the date column parsed into the index.
# Without parse_dates the index holds plain strings, so the value later saved
# as the last retrained date is a string as well and the next run fails on
# `last_retrained_date + pd.DateOffset(days=1)`.
data = pd.read_csv(data_path, index_col=date_index_column_name, parse_dates=True)
# The retraining step takes the tail of the new data as "the latest window",
# which only holds when the rows are in chronological order.
data = data.sort_index()

# Load the model
# NOTE: pickle.load can execute arbitrary code from the file; only point
# model_path (and the date pickle below) at files this pipeline wrote itself.
with open(model_path, 'rb') as file:
    model = pickle.load(file)

# assume that data is delivered daily
# when the model is retrained it is done up to the last day
# therefore all new data will begin after the last retrain date

# Load the last retrained date
last_retrained_date = None
try:
    with open(last_retrained_date_path, 'rb') as file:
        last_retrained_date = pickle.load(file)
except FileNotFoundError:
    # No state file yet: treat this as the very first run.
    pass

if last_retrained_date is None:
    # This is the first time, so start from the first date
    analysis_start_date = data.index.min()
else:
    # A state file written by an earlier run may hold a plain string rather
    # than a Timestamp; normalise it so the date arithmetic and the index
    # comparisons below work either way.
    last_retrained_date = pd.Timestamp(last_retrained_date)
    # Select new data starting from the day after the last retrained date
    analysis_start_date = last_retrained_date + pd.DateOffset(days=1)

# split data into already trained and new batch
new_data = data[data.index >= analysis_start_date]
old_data = data[data.index < analysis_start_date]

# Retrain only when the new batch has at least `retraining_window_days` rows
# (data is assumed to arrive as one row per day).
has_enough_new_data = len(new_data.index) >= retraining_window_days

if not has_enough_new_data:
    print("Not enough new data for retraining.")
else:
    # Keep just the most recent window of the new batch for retraining.
    retrain_data = new_data[-retraining_window_days:]

    # TODO: add split into train, validation, oot
    # TODO: assess old model performance on new data

    # Incrementally update the model via .partial_fit: every column except
    # the target is a feature.
    features = retrain_data.drop(columns=target_column_name)
    target = retrain_data[target_column_name]
    model.partial_fit(X=features, y=target)

    # TODO: notify about feature stability, new model performance on validation/oot

    # Overwrite the stored model with the retrained one.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Record the newest date of the new batch so the next run starts after it.
    new_last_retrained_date = new_data.index.max()
    with open(last_retrained_date_path, 'wb') as date_file:
        pickle.dump(new_last_retrained_date, date_file)

7
