# Code to prepare the dataset for fine-tuning in jsonl format

In [1]:
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
def rolling_window(series, window):
    """
    Build a table of overlapping fixed-size windows from a time series.

    Row ``k`` of the result holds ``series[k:k + window]``. Window start
    positions run from ``0`` to ``len(series) - window - 1``, so
    ``len(series) - window`` windows are produced and the very last element
    of ``series`` is never included in any window.
    NOTE(review): if the final window ``series[-window:]`` is also wanted,
    the loop bound would need ``+ 1`` -- confirm the intended behaviour
    before changing it.

    Parameters:
    - series: array-like, time series data (must support ``len`` and slicing)
    - window: int, size of the rolling window

    Returns:
    - df: pandas DataFrame with one row per window and ``window`` columns;
      rows containing any NaN are dropped. The DataFrame is empty when
      ``len(series) <= window``.

    No scaling is applied here and no scaler is returned; callers that need
    normalized data must scale it themselves.
    """
    # One NumPy array per window start position.
    windows = [
        np.array(series[start:start + window])
        for start in range(len(series) - window)
    ]

    # Stack the windows as rows and discard any window with missing values.
    return pd.DataFrame(windows).dropna()

In [3]:
# Load the combined dataset (semicolon-separated). The code below relies on
# its 'state', 'product' and 'm3' columns.
all_data = pd.read_csv('../database/combined_data.csv', sep=";")

# Map every state to the list of products that occur with it. States and
# products are both kept in order of first appearance in the data.
state_product_dict = {
    state: list(all_data[all_data['state'] == state]['product'].unique())
    for state in all_data['state'].unique()
}

# Define the output filename
output_file = 'dataset_test_name.jsonl'

# Write one JSON object per line, shaped as {"<state>_<product>": [m3, ...]}.
# The encoding is given explicitly so the result does not depend on the
# platform default.
with open(output_file, 'w', encoding='utf-8') as file:
    for state, products in state_product_dict.items():
        # Select this state's rows once, instead of re-masking the whole
        # frame on both columns for every (state, product) pair.
        state_rows = all_data[all_data['state'] == state]

        for product in products:
            # Filter data for the current state and product
            data_filtered = state_rows[state_rows['product'] == product]

            # Keep everything except the last 12 observations of the series.
            # NOTE(review): presumably the last 12 points are held out for
            # evaluation -- confirm.
            sequence = data_filtered['m3'][:-12].tolist()

            # Create a key for the JSON entry combining state and product
            json_line = {f'{state}_{product}': sequence}

            # Write the formatted JSON line to the .jsonl file
            file.write(json.dumps(json_line) + '\n')

# NOTE: two earlier export variants were removed from this cell. Both used
# the same series (last 12 points dropped) and wrote {"sequence": [...]}
# lines instead of the keyed format above:
#   1. build 12-step windows with rolling_window(), scale all values
#      together to (-1, 1) with MinMaxScaler, and write one flattened line
#      per (state, product) series;
#   2. scale each series to (-1, 1) first, then write every 12-step window
#      from rolling_window() as its own line.

print(f"Filtered data has been saved to {output_file}")


Filtered data has been saved to dataset_test_name.jsonl
