In [2]:
import pandas as pd
import numpy as np

# Load the source table. create_message (defined in a later cell) reads these
# columns from each row: Timestamp, Title, Summary, last_price, volatility_annual,
# "T+3 return_normalized", volatility_annual_normalized and duration_estimated.
data = pd.read_csv('../../../data/Fine-Tuning/Updated_Pretraining_Data.csv')

## Generating JSONL file from table

In [3]:
import json

# Define a function to create the message format for each row
def create_message(row):
    """Build one chat-format training example from a single table row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the column
            names "Timestamp", "Title", "Summary", "last_price",
            "volatility_annual", "T+3 return_normalized",
            "volatility_annual_normalized" and "duration_estimated".

    Returns:
        dict: ``{"messages": [...]}`` holding a system, a user and an assistant
        turn. The user and assistant contents are JSON-encoded strings; the
        assistant's "comment" field is always an empty string.
    """
    system_content = "This model is trained to analyze the sentiment of news articles concerning the LNG market and predict their impact on the LNG index's opening price and volatility for the following day. Please provide estimates for the expected return (ranging from -10.0 to +10.0), the volatility effect (ranging from -10.0 to +10.0), and the duration of the impact (scaled from 0 to 10, where 0 represents no impact and 10 represents a permanent impact). Include a comment explaining the rationale behind your predictions. Consider long-term market trends, seasonal variations, global supply-demand dynamics, and macroeconomic factors that might influence LNG prices and market behavior."

    # (output key, source column) pairs; the order fixes the key order in the JSON.
    user_payload = {
        out_key: row[column]
        for out_key, column in (
            ("date", "Timestamp"),
            ("title", "Title"),
            ("summary", "Summary"),
            ("price", "last_price"),
            ("vol_annual", "volatility_annual"),
        )
    }

    assistant_payload = {
        out_key: row[column]
        for out_key, column in (
            ("return", "T+3 return_normalized"),
            ("vol", "volatility_annual_normalized"),
            ("duration", "duration_estimated"),
        )
    }
    # No rationale text is available in the table, so the comment is left blank.
    assistant_payload["comment"] = ""

    turns = (
        ("system", system_content),
        ("user", json.dumps(user_payload)),
        ("assistant", json.dumps(assistant_payload)),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

# Turn every dataframe row into one chat-format training example
messages = data.apply(create_message, axis=1).tolist()

# Serialize the examples as JSON Lines: one JSON object per line
jsonl_file_path = './LNG_unsupervised_full.jsonl'
with open(jsonl_file_path, 'w') as outfile:
    for message in messages:
        outfile.write(json.dumps(message) + '\n')

# Bare last expression so the notebook displays the path that was written
jsonl_file_path

'./test.jsonl'

## Uploading JSONL file to OpenAI for fine tuning

In [13]:
from dotenv import load_dotenv
# Presumably reads a local .env file into the process environment so that the
# os.getenv(...) lookups in the following cells find their values — TODO confirm
# the .env location. The recorded output `True` is this call's return value.
load_dotenv()

True

In [10]:
import os
from openai import OpenAI

# The project stores the key under a mixed-case variable name; it is copied to
# OPENAI_API_KEY because the client below is constructed without arguments.
api_key = os.getenv("OpenAI_API_KEY")
if not api_key:
    # Fail with a clear message instead of the TypeError that assigning None
    # to os.environ would raise.
    raise RuntimeError(
        "Environment variable 'OpenAI_API_KEY' is not set; cannot create the OpenAI client."
    )
os.environ["OPENAI_API_KEY"] = api_key
client = OpenAI()

# NOTE(review): the JSONL-generation cell writes './LNG_unsupervised_full.jsonl',
# but this cell uploads 'LNG_unsupervised.jsonl' — confirm which file is intended.
# The context manager closes the file handle once the upload call returns.
with open("LNG_unsupervised.jsonl", "rb") as training_file:
    uploaded_file = client.files.create(
        file=training_file,
        purpose="fine-tune"
    )

# Bare last expression so the notebook still displays the upload response
uploaded_file

## Starting the fine tuning job

In [None]:
# ID of the uploaded training file, read from the environment.
file_key = os.getenv("Unsupervised_Training_File")
if not file_key:
    # Stop before calling the API with an empty training file reference.
    raise RuntimeError(
        "Environment variable 'Unsupervised_Training_File' is not set; "
        "it must hold the ID of the uploaded training file."
    )

# Pass the looked-up file ID (previously an empty string was sent and
# file_key was never used).
client.fine_tuning.jobs.create(
  training_file=file_key,
  model="gpt-3.5-turbo"
)

## Checking job status

In [None]:
# List fine-tuning jobs, capped at 10 results (limit=10). The call is the
# cell's last expression, so the notebook displays its return value.
client.fine_tuning.jobs.list(limit=10)

In [None]:
# The job ID is read from the environment rather than written into the notebook.
# NOTE(review): os.getenv returns None when the variable is unset — confirm it is
# defined before running this cell.
job_key = os.getenv("Unsupervised_Fine_Tuning_Job")
# Retrieve the state of a fine-tune
client.fine_tuning.jobs.retrieve(job_key)

# The commented-out calls below are reference snippets for other job-management
# operations; the IDs in them are placeholders.

# Cancel a job
# client.fine_tuning.jobs.cancel("ftjob-abc123")

# List up to 10 events from a fine-tuning job
# client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-abc123", limit=10)

# Delete a fine-tuned model (must be an owner of the org the model was created in)
# client.models.delete("ft:gpt-3.5-turbo:acemeco:suffix:abc123")

## Start to generate output!

In [15]:
# ID of the fine-tuned model, read from the environment
model_id = os.getenv("Unsupervised_Fine_Tuning_Model")

# System prompt for the request (same wording as the training examples' system turn)
demo_system_prompt = "This model is trained to analyze the sentiment of news articles concerning the LNG market and predict their impact on the LNG index's opening price and volatility for the following day. Please provide estimates for the expected return (ranging from -10.0 to +10.0), the volatility effect (ranging from -10.0 to +10.0), and the duration of the impact (scaled from 0 to 10, where 0 represents no impact and 10 represents a permanent impact). Include a comment explaining the rationale behind your predictions. Consider long-term market trends, seasonal variations, global supply-demand dynamics, and macroeconomic factors that might influence LNG prices and market behavior."

# One hand-written article, already serialized as the JSON string the user turn carries
demo_user_payload = '{"date": "2022-07-19", "title": "Russian Gas Supplies to Europe Aren\\u2019t Expected to Restart", "summary": "Europe is working on contingency plans for the possibility that the Nord Stream pipeline won\\u2019t return to operation.", "price": 38.372, "vol_annual": 9.020759401473269}'

completion = client.chat.completions.create(
    model=model_id,
    messages=[
        dict(role="system", content=demo_system_prompt),
        dict(role="user", content=demo_user_payload),
    ],
)
print(completion.choices[0].message)

ChatCompletionMessage(content='{"return": -3.194664175893563, "vol": -9.367349168765628, "duration": 7.555806724871983, "comment": ""}', role='assistant', function_call=None, tool_calls=None)


```text
This model is trained to analyze the sentiment of news articles concerning the LNG market and predict the impact on the LNG index's price. Predictions are made based on the news content, current price, and current volatility on the provided date. The model outputs include the direction of the price movement (true for higher, false for lower), the magnitude of this change (ranging from 0.0 to 10.0), and a comment explaining the rationale behind these predictions. The results are provided in the following JSON format:
{
  "direction": "boolean",
  "magnitude": "float",
  "comment": "string"
}
```

In [10]:
import os
import json
from openai import OpenAI

def main():
    """Send every user message in the JSONL file to gpt-4-turbo and log the replies.

    Reads './LNG_unsupervised_full.jsonl' line by line, takes the user turn
    (``messages[1]``) of each record, sends it together with the
    direction/magnitude system prompt, and appends one line per record to the
    output file in the form ``<date>: <str(ChatCompletionMessage)>``. That
    line format is kept as-is because the parsing cells later in the notebook
    split on it.

    Raises:
        RuntimeError: if the 'OpenAI_API_KEY' environment variable is not set.
    """
    api_key = os.getenv('OpenAI_API_KEY')
    if not api_key:
        # Clear failure instead of the TypeError that assigning None to
        # os.environ would raise.
        raise RuntimeError(
            "Environment variable 'OpenAI_API_KEY' is not set; cannot create the OpenAI client."
        )
    # The client is constructed without arguments, so the key is exposed
    # under OPENAI_API_KEY.
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()

    # Path to the JSONL input file
    file_path = './LNG_unsupervised_full.jsonl'

    # Path for the output file; create its directory so open(..., 'w') cannot
    # fail on a fresh checkout.
    output_path = './output/unsupervised_outputs_gp4turbo_05010120.txt'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # System message (constant part of the prompt). This run asks the base
    # gpt-4-turbo model for direction/magnitude/comment, not the
    # return/vol/duration schema used for the fine-tuning examples.
    system_message_content = "This model is trained to analyze the sentiment of news articles concerning the LNG market and predict the impact on the LNG index's price. Predictions are made based on the news content, current price, and current volatility on the provided date. The model outputs include the direction of the price movement (true for higher, false for lower), the magnitude of this change (ranging from 0.0 to 10.0), and a comment explaining the rationale behind these predictions. The results are provided in the following JSON format: {\"direction\": \"boolean\", \"magnitude\": \"float\",\"comment\": \"string\"}"

    with open(file_path, 'r') as infile, open(output_path, 'w') as outfile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing in json.loads.
            if not line.strip():
                continue

            record = json.loads(line)
            # messages[1] is the user turn; its content is itself a JSON string.
            user_message_content = record['messages'][1]['content']
            date = json.loads(user_message_content)['date']

            completion = client.chat.completions.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": system_message_content},
                    {"role": "user", "content": user_message_content}
                ],
                temperature=0.1,
                top_p=0.5,
            )

            # Write the model's response to the output file, prefixed by the date
            outfile.write(f"{date}: {str(completion.choices[0].message)}\n")


if __name__ == "__main__":
    main()


In [7]:
import json
import pandas as pd

# Raw responses of the fine-tuned model, one "<date>: ChatCompletionMessage(...)" line each
file_path = './output/unsupervised_outputs_04302300.txt'

# Collect one record per line, then build the frame in a single step
data = []
with open(file_path, 'r') as file:
    for line in file:
        # Everything before the first ": " is the date; the rest is the message repr
        date_part, message_part = line.split(': ', 1)

        # Cut the JSON payload out from between the repr's content=' ... ', role= markers
        after_prefix = message_part.split("ChatCompletionMessage(content='")[1]
        payload_text = after_prefix.split("', role='assistant'")[0]
        message_json = json.loads(payload_text)

        record = {'Date': date_part}
        record['Return'] = message_json['return']
        record['Vol'] = message_json['vol']
        record['Duration'] = message_json['duration']
        data.append(record)

# Convert to DataFrame and persist next to the raw text file
df = pd.DataFrame(data)
df.to_csv('./output/unsupervised_outputs_04302300.csv', index=False)

In [None]:
import json
import pandas as pd

# Path to the file
file_path = './output/unsupervised_outputs_gp4turbo_05010120.txt'


def extract_json(line):
    """Recover the JSON object embedded in a ``ChatCompletionMessage(content=...)`` repr.

    Args:
        line: Text containing ``content=<quoted Python string literal>``, as
            produced by ``str(completion.choices[0].message)``.

    Returns:
        The decoded JSON object found inside the content string.

    Raises:
        ValueError: if no quoted ``content=`` literal or no ``{...}`` span is
            found, or the span is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
        SyntaxError: if the captured literal cannot be evaluated.
    """
    import ast
    import re

    # Match a complete single- or double-quoted Python string literal,
    # honouring backslash escapes, so quotes inside the text do not end it early.
    single_quoted = r"'(?:[^'\\]|\\.)*'"
    double_quoted = r'"(?:[^"\\]|\\.)*"'
    literal = re.search(f"content=({single_quoted}|{double_quoted})", line)
    if literal is None:
        raise ValueError("no quoted content=... field found")

    # literal_eval undoes the repr escaping (\', \n, \\ ...) without executing code.
    text = ast.literal_eval(literal.group(1))

    # Keep only the outermost {...} span so surrounding prose or markdown
    # code fences do not break decoding.
    start, end = text.find('{'), text.rfind('}')
    if start == -1 or end < start:
        raise ValueError("response text contains no JSON object")
    return json.loads(text[start:end + 1])


data = []
with open(file_path, 'r') as file:
    for line in file:
        if not line.strip():  # Ensuring the line is not empty
            continue
        # partition never raises, even if the ": " separator is missing
        date_part, _, rest = line.partition(': ')
        try:
            message_json = extract_json(rest)
            data.append({
                'Date': date_part,
                'Direction': message_json['direction'],
                'Magnitude': message_json['magnitude'],
                'Comment': message_json['comment']
            })
        except (ValueError, SyntaxError, KeyError, TypeError) as e:
            # Report and skip unparseable responses instead of aborting the run.
            print(f"Failed to decode JSON for {date_part}: {e}")

# Explicit columns keep the CSV header even when no line could be parsed.
df = pd.DataFrame(data, columns=['Date', 'Direction', 'Magnitude', 'Comment'])

csv_file_path = './output/unsupervised_outputs_gp4turbo_05010120.csv'
df.to_csv(csv_file_path, index=False)

In [28]:
import pandas as pd

def _between(text, opening, closing):
    """Return the piece of ``text`` that follows the first ``opening``, cut at the first ``closing``.

    Only the segment between the first and second occurrence of ``opening`` is
    considered. Raises IndexError when ``opening`` does not occur in ``text``.
    """
    return text.split(opening)[1].split(closing)[0]


# Raw gpt-4-turbo responses, one "<date>: ChatCompletionMessage(...)" line each
file_path = './output/unsupervised_outputs_gp4turbo_05010120.txt'

data = []
with open(file_path, 'r') as file:
    for line in file:
        # Fields are cut out by plain string matching, not JSON decoding;
        # double quotes are stripped from the magnitude and comment values.
        data.append({
            'Date': line.partition(':')[0],
            'Direction': _between(line, '"direction": ', ','),
            'Magnitude': _between(line, '"magnitude": ', ',').replace('"', ''),
            'Comment': _between(line, '"comment": ', '}').replace('"', ''),
        })

df = pd.DataFrame(data)

# Written next to the raw text file, without the index column
csv_file_path = './output/unsupervised_outputs_gp4turbo_05010120.csv'
df.to_csv(csv_file_path, index=False)

: 