### Code to prepare the dataset for fine-tuning in jsonl format

### Code Description

This code prepares the datasets for fine-tuning the TimeMOE model. For each option, a specific dataset will be generated with a different configuration. The output format is JSON Lines (jsonl), which is ideal for training models with large datasets while keeping the data structure lightweight and easy to process.

Each dataset configuration can be adjusted according to the task type and the parameters defined in its cell. Preparation consists of organizing the data into a format that is compatible with the training pipeline of models such as TimeMOE.

### Imports

In [11]:
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os

In [2]:
def rolling_window(series, window):
    """
    Build overlapping fixed-length windows from a time series.

    Window ``i`` holds ``series[i:i + window]``. Start positions run over
    ``range(len(series) - window)``, so the window that would end on the
    final element of ``series`` is not produced.

    Parameters:
    - series: array-like, time series data
    - window: int, size of the rolling window

    Returns:
    - df: pandas DataFrame with one window per row; rows that contain a
      missing value are dropped. The frame is empty when
      ``len(series) <= window``. No scaling is applied and no scaler is
      returned.
    """
    n_windows = len(series) - window
    data = [np.array(series[i:i + window]) for i in range(n_windows)]

    # Windows that include a NaN are discarded.
    return pd.DataFrame(data).dropna()

### Fine Tuning Global 5 Years

In [6]:
# Build one global JSONL file per cutoff date: the history up to the latest
# timestamp, then the same history truncated 1, 2, 3 and 4 years earlier.
# Every (state, product) series contributes one line per file.
all_data = pd.read_csv('../database/combined_data.csv', sep=";")

# Unparseable timestamps become NaT; NaT never satisfies the cutoff
# comparison below, so such rows are absent from every yearly subset.
all_data['timestamp'] = pd.to_datetime(all_data['timestamp'], errors='coerce')

output_path = 'dataset_global_5_years/'
os.makedirs(output_path, exist_ok=True)

end_date = all_data['timestamp'].max()

for years_back in range(5):
    # Computed once per iteration: used for both the row filter and the file name.
    cutoff = end_date - pd.DateOffset(years=years_back)
    yearly_data = all_data[all_data['timestamp'] <= cutoff]

    output_file = os.path.join(output_path, f'dataset_global_{cutoff.year}.jsonl')

    with open(output_file, 'w') as file:
        for state in yearly_data['state'].unique():
            # Filter by state once, then split that subset by product.
            state_data = yearly_data[yearly_data['state'] == state]

            for product in state_data['product'].unique():
                data_filtered = state_data[state_data['product'] == product]

                # The last 12 rows of each series are left out of the training sequence.
                # NOTE(review): rows are taken in CSV order; nothing here sorts by
                # timestamp, so the CSV is assumed to be chronological - confirm.
                sequence = data_filtered['m3'][:-12].tolist()

                # Series with 12 rows or fewer yield nothing and are skipped.
                if sequence:
                    file.write(json.dumps({'sequence': sequence}) + '\n')

print("Data has been successfully saved by year.")

### Fine Tuning Global

In [18]:
# NOTE: Uncomment and comment out the code below as needed.
# Exactly one of the three variants (raw data, normalized, rolling window)
# should be active at a time; all of them write to the same output file.

# Load the combined dataset
all_data = pd.read_csv('../database/combined_data.csv', sep=";")

# Map each state to the products observed for it (used by every variant below).
state_product_dict = {}

for state in all_data['state'].unique():
    products = all_data[all_data['state'] == state]['product'].unique()
    state_product_dict[state] = list(products)

output_file = 'dataset_global/dataset_global.jsonl'

# open() does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Open the output file in write mode
with open(output_file, 'w') as file:

    # INFO: ======== Raw Data ========
    for state, products in state_product_dict.items():
        for product in products:

            # Filter data for the current state and product
            data_filtered = all_data[(all_data['state'] == state) & (all_data['product'] == product)]

            # The last 12 rows of each series are left out of the training sequence.
            sequence = data_filtered['m3'][:-12].tolist()

            # Series with 12 rows or fewer produce an empty sequence; skip them
            # instead of writing {"sequence": []} lines.
            if sequence:
                json_line = {'sequence': sequence}
                file.write(json.dumps(json_line) + '\n')

    # INFO: ======== Normalized ========
    # for state, products in state_product_dict.items():
    #     for product in products:
    #         data_filtered = all_data[(all_data['state'] == state) & (all_data['product'] == product)]

    #         data = rolling_window(data_filtered['m3'][:-12], 12)
    #         print(data)

    #         sequence = data.values

    #         print(sequence)

    #         scaler = MinMaxScaler(feature_range=(-1, 1))
    #         sequence_scaled = scaler.fit_transform(sequence.reshape(-1, 1)).flatten()
    #         print(sequence_scaled)

    #         json_line = {"sequence": sequence_scaled.tolist()}

    #         file.write(json.dumps(json_line) + '\n')

    # INFO: ======== Rolling Window ========
    # for state, products in state_product_dict.items():
    #     for product in products:
    #         data_filtered = all_data[(all_data['state'] == state) & (all_data['product'] == product)]

    #         m3_values = data_filtered['m3'][:-12].values

    #         scaler = MinMaxScaler(feature_range=(-1, 1))
    #         m3_scaled = scaler.fit_transform(m3_values.reshape(-1, 1)).flatten()

    #         data = rolling_window(m3_scaled, 12)

    #         for _, row in data.iterrows():
    #             print(row.values.tolist())
    #             json_line = {"sequence": row.values.tolist()}
    #             file.write(json.dumps(json_line) + '\n')


print(f"Filtered data has been saved to {output_file}")

Filtered data has been saved to dataset_global/dataset_global.jsonl


### Fine Tuning Product 5 Years

In [None]:
# Build one JSONL file per (product, cutoff date): the history up to the
# latest timestamp, then truncated 1, 2, 3 and 4 years earlier. Each file
# holds one line per state that has data for that product.
database_path = '../database/combined_data.csv'
all_data = pd.read_csv(database_path, sep=';')
# Unparseable timestamps become NaT and never pass the cutoff filter below.
all_data['timestamp'] = pd.to_datetime(all_data['timestamp'], errors='coerce')

output_path = 'dataset_product_5_years/'
os.makedirs(output_path, exist_ok=True)

end_date = all_data['timestamp'].max()

for years_back in range(5):
    # Computed once per iteration: used for both the row filter and the file name.
    cutoff = end_date - pd.DateOffset(years=years_back)
    yearly_data = all_data[all_data['timestamp'] <= cutoff]

    for product in yearly_data['product'].unique():
        product_data = yearly_data[yearly_data['product'] == product]

        # Build the file name from the product label, replacing spaces and
        # path separators so the label cannot break the output path.
        product_clean = product.replace(" ", "_").replace("/", "_").replace("\\", "_")
        output_file = os.path.join(output_path, f'dataset_product_{product_clean}_{cutoff.year}.jsonl')

        with open(output_file, 'w') as file:
            for state in product_data['state'].unique():
                data_filtered = product_data[product_data['state'] == state]

                # The last 12 rows of each series are left out of the training sequence.
                sequence = data_filtered['m3'][:-12].tolist()

                # Series with 12 rows or fewer yield nothing and are skipped.
                if sequence:
                    file.write(json.dumps({'sequence': sequence}) + '\n')

print("Data has been successfully saved by product and year.")

Data has been successfully saved by product and year.


### Fine Tuning Product 

In [17]:
# Build one JSONL file per product over the full history; each file holds
# one line per state that has data for that product.
database_path = '../database/combined_data.csv'
all_data = pd.read_csv(database_path, sep=';')
all_data['timestamp'] = pd.to_datetime(all_data['timestamp'], errors='coerce')

output_path = 'dataset_product/'
os.makedirs(output_path, exist_ok=True)

for product in all_data['product'].unique():
    product_data = all_data[all_data['product'] == product]

    # Build the file name from the product label, replacing spaces and
    # path separators so the label cannot break the output path.
    product_clean = product.replace(" ", "_").replace("/", "_").replace("\\", "_")
    output_file = os.path.join(output_path, f'dataset_product_{product_clean}.jsonl')

    with open(output_file, 'w') as file:
        for state in product_data['state'].unique():
            data_filtered = product_data[product_data['state'] == state]

            # The last 12 rows of each series are left out of the training sequence.
            sequence = data_filtered['m3'][:-12].tolist()

            # Series with 12 rows or fewer yield nothing and are skipped.
            if sequence:
                file.write(json.dumps({'sequence': sequence}) + '\n')

# NOTE(review): this cell does not split by year, although the message says so.
print("Data has been successfully saved by product and year.")

Data has been successfully saved by product and year.


### Fine Tuning Indiv 5 Years

In [None]:
# Build one JSONL file per (state, product) series for each cutoff date:
# the history up to the latest timestamp, then truncated 1, 2, 3 and 4
# years earlier. Files for one cutoff are grouped in a per-year directory.
all_data = pd.read_csv('../database/combined_data.csv', sep=";")

# Unparseable timestamps become NaT and never pass the cutoff filter below.
all_data['timestamp'] = pd.to_datetime(all_data['timestamp'], errors='coerce')

output_base_path = 'dataset_individual_5_anos/'
os.makedirs(output_base_path, exist_ok=True)

end_date = all_data['timestamp'].max()

for years_back in range(5):
    cutoff = end_date - pd.DateOffset(years=years_back)
    yearly_data = all_data[all_data['timestamp'] <= cutoff]

    # The target directory depends only on the cutoff year, so it is
    # resolved and created once per cutoff rather than once per series.
    output_path = os.path.join(output_base_path, f'dataset_individual_{cutoff.year}')
    os.makedirs(output_path, exist_ok=True)

    for state in yearly_data['state'].unique():
        # Filter by state once, then split that subset by product.
        state_data = yearly_data[yearly_data['state'] == state]

        for product in state_data['product'].unique():
            data_filtered = state_data[state_data['product'] == product]

            # The last 12 rows of each series are left out of the training sequence.
            sequence = data_filtered['m3'][:-12].tolist()

            # Series with 12 rows or fewer yield nothing; no file is written for them.
            if not sequence:
                continue

            # NOTE(review): state and product are used verbatim in the file
            # name; a label containing a path separator would make open() fail.
            output_file = os.path.join(output_path, f'dataset_{state}_{product}.jsonl')

            with open(output_file, 'w') as file:
                file.write(json.dumps({'sequence': sequence}) + '\n')

print(f"Individual files have been successfully saved in {output_base_path}")

### Fine Tuning Indiv

In [16]:
# Write one single-line JSONL file per (state, product) series, using the
# full history of the combined dataset.
all_data = pd.read_csv('../database/combined_data.csv', sep=";")

all_data['timestamp'] = pd.to_datetime(all_data['timestamp'], errors='coerce')

output_base_path = 'dataset_individual/'
os.makedirs(output_base_path, exist_ok=True)

# Map each state to the list of products observed for it.
state_product_dict = {
    state: list(all_data.loc[all_data['state'] == state, 'product'].unique())
    for state in all_data['state'].unique()
}

for state, products in state_product_dict.items():
    for product in products:
        is_series = (all_data['state'] == state) & (all_data['product'] == product)
        data_filtered = all_data[is_series]

        # The last 12 rows of each series are left out of the training sequence.
        sequence = data_filtered['m3'][:-12].tolist()

        # Nothing is written for a series that has no rows left.
        if not sequence:
            continue

        output_file = os.path.join(output_base_path, f'dataset_{state}_{product}.jsonl')

        with open(output_file, 'w') as file:
            file.write(json.dumps({'sequence': sequence}) + '\n')

print(f"Individual files have been successfully")

Individual files have been successfully
