In [19]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the NYT dataset and hold out 10% of the (abstract, title) rows for validation.
df = pd.read_csv('./data/NYT_dataset.csv')
x_tr, x_val, y_tr, y_val = train_test_split(
    df['abstract'].to_numpy(),
    df['title'].to_numpy(),
    test_size=0.1,
    random_state=26,  # fixed seed so the split is reproducible
    shuffle=True,
)

# System message placed at the start of every chat-formatted example.
system_prompt: str = "Given the abstract, generate the most likely New York Times title."


def create_jsonl_file(data: list[tuple[str, str]], filename: str) -> None:
    """
    Appends chat-formatted examples to ./data/<filename>.jsonl.

    Each (abstract, title) pair is written as one JSON line holding three
    messages: the module-level ``system_prompt`` (system), the abstract (user)
    and the title (assistant). Pairs in which either value is missing
    (NaN/None according to ``pd.isna``) are skipped.

    The file is opened in append mode: it is created if it does not exist, and
    calling this function again with the same data writes the same lines again.

    Parameters:
    - data: A list of tuples containing pairs of abstracts and titles.
    - filename: The name of the file (without extension) inside ./data/.
    """
    file_path = f'./data/{filename}.jsonl'
    with open(file_path, 'a', encoding='utf-8') as file:
        for abstract, title in data:
            # Rows with a missing abstract or title cannot form a valid example.
            if pd.isna(abstract) or pd.isna(title):
                continue
            entry = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": abstract},
                    {"role": "assistant", "content": title}
                ]
            }
            # ensure_ascii=False keeps non-ASCII text as-is in the UTF-8 file,
            # matching what create_synthetic_jsonl_file writes.
            file.write(json.dumps(entry, ensure_ascii=False) + "\n")

# Pair each abstract with its title for the training and held-out splits
train_data = list(zip(x_tr, y_tr))
test_data = list(zip(x_val, y_val))

# Append the examples to ./data/train.jsonl and ./data/test.jsonl
# NOTE: the files are opened in append mode, so re-running this cell adds the same rows again.
create_jsonl_file(data=train_data, filename='train')
create_jsonl_file(data=test_data, filename='test')


In [7]:
def create_synthetic_jsonl_file(data: list[tuple[str, str]], filename: str) -> None:
    """
    Appends chat-formatted examples to ./synthetic/<filename>.jsonl.

    Every (abstract, title) pair becomes one JSON line with a system message
    (the module-level ``system_prompt``), a user message (the abstract) and an
    assistant message (the title). Non-ASCII characters are written unescaped.
    The file is opened in append mode, so it is created when missing and
    extended when it already exists.

    Parameters:
    - data: A list of tuples containing pairs of abstracts and titles.
    - filename: The name of the file (without extension) inside ./synthetic/.
    """
    def _to_json_line(pair):
        # Build the three-message chat record for one (abstract, title) pair.
        user_text, assistant_text = pair
        record = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text},
                {"role": "assistant", "content": assistant_text},
            ]
        }
        return json.dumps(record, ensure_ascii=False) + "\n"

    with open(f'./synthetic/{filename}.jsonl', 'a', encoding='utf-8') as out:
        out.writelines(_to_json_line(pair) for pair in data)

# Load synthetic dataset from .jsonl
def load_synthetic_data(file_path: str) -> list[tuple[str, str]]:
    """
    Loads (abstract, title) pairs from a .jsonl file.

    Every non-blank line must be a JSON object with 'abstract' and 'title'
    keys. Blank lines (for example a trailing empty line at the end of the
    file) are skipped instead of failing JSON parsing.

    Parameters:
    - file_path: The path to the .jsonl file to read.

    Returns:
    - A list of (abstract, title) tuples in file order.

    Raises:
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    - KeyError: If an entry lacks the 'abstract' or 'title' key.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # json.loads rejects empty/whitespace-only input, so skip such lines.
            if not line.strip():
                continue
            entry = json.loads(line)
            data.append((entry['abstract'], entry['title']))
    return data

# Read the synthetic (abstract, title) pairs from the source .jsonl file
synthetic_data = load_synthetic_data(file_path='./synthetic/wikinews_synthetic_data.jsonl')

# No hold-out split for the synthetic data: the entire dataset is used for training
# (as per the original code's test_size=0.0)
synthetic_train_data = synthetic_data

# Append the chat-formatted examples to ./synthetic/synthetic_train.jsonl
create_synthetic_jsonl_file(data=synthetic_train_data, filename='synthetic_train')


In [22]:
def create_limited_jsonl_file(source_file_path: str, target_file_path: str, limit: int = 1000) -> None:
    """
    Copies at most 'limit' leading lines of one .jsonl file into another file.

    The target file is overwritten if it already exists. When the source has
    fewer than 'limit' lines, all of them are copied.

    Parameters:
    - source_file_path: The path to the source .jsonl file.
    - target_file_path: The path where the new limited .jsonl file will be created.
    - limit: The maximum number of entries to include in the new file.
    """
    # The source is opened first so a missing source never truncates the target.
    with open(source_file_path, 'r', encoding='utf-8') as reader:
        with open(target_file_path, 'w', encoding='utf-8') as writer:
            copied = 0
            for row in reader:
                if copied >= limit:
                    break
                writer.write(row)
                copied += 1

# Creating a limited version of train.jsonl with only the first 1000 rows
create_limited_jsonl_file(
    source_file_path='./data/train.jsonl',
    target_file_path='./data/train_limited.jsonl',
    limit=1000,
)

# Creating a limited version of test.jsonl with only the first 100 rows
create_limited_jsonl_file(
    source_file_path='./data/test.jsonl',
    target_file_path='./data/test_limited.jsonl',
    limit=100,
)


In [20]:
import json

def calculate_total_tokens(file_path: str) -> int:
    """
    Calculates the total number of tokens for all messages in a .jsonl file.

    Every string value of every message is encoded with the "cl100k_base"
    encoding and counted; this includes the 'role' strings as well as the
    'content' strings. Non-string values and blank lines are ignored.

    Parameters:
    - file_path: The path to the .jsonl file containing the messages.

    Returns:
    - The total number of tokens for all messages in the file.
    """
    # Imported locally: no visible cell of this notebook imports tiktoken, so
    # relying on a module-level name fails with NameError on a fresh kernel.
    import tiktoken

    encoding = tiktoken.get_encoding("cl100k_base")
    total_tokens = 0

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # json.loads rejects empty/whitespace-only input, so skip such lines.
            if not line.strip():
                continue
            entry = json.loads(line)
            for message in entry["messages"]:
                for value in message.values():
                    if isinstance(value, str):
                        total_tokens += len(encoding.encode(value))

    return total_tokens

# Token count of the full training file (displayed as the cell output)
calculate_total_tokens(file_path='./data/train.jsonl')

5522143

In [2]:
# Creates a csv file with predictions from the fine-tuned GPT-3.5 model for the examples in test_limited.jsonl
# (built above with a limit of 100 entries), with the headers 'Model', 'Abstract', 'True Title', and 'Predicted Title'
# ft:gpt-3.5-turbo-0125:personal:nyt-dataset:98g0EH1s

import csv
import json
from dotenv import load_dotenv
from openai import OpenAI

def generate_titles(model: str, dataset_path: str, output_csv_path: str) -> None:
    """
    Generates a predicted title for every example in a chat-formatted .jsonl
    dataset and appends the results to a CSV file.

    For each line, the system and user messages are sent to the given model and
    the assistant message is taken as the true title. One CSV row is written
    per example: model, abstract (user message), true title, predicted title
    (with newlines replaced by spaces).

    The CSV is opened in append mode, so rows from earlier runs are kept. The
    header row is written only when the file is empty.

    Parameters:
    - model: The model identifier for the fine-tuned GPT-3.5 model.
    - dataset_path: The path to the dataset file containing abstracts.
    - output_csv_path: The path where the output CSV file will be saved.
    """
    # Presumably loads the API credentials for OpenAI() from a local .env file.
    load_dotenv()

    client = OpenAI()

    with open(dataset_path, 'r', encoding='utf-8') as dataset_file, \
         open(output_csv_path, 'a', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.writer(output_file)

        # Writes in append mode always land at the end of the file, so the
        # header must only be emitted when nothing has been written yet.
        output_file.seek(0, 2)  # whence=2: end of file
        if output_file.tell() == 0:
            csv_writer.writerow(['Model', 'Abstract', 'True Title', 'Predicted Title'])

        for line in dataset_file:
            # json.loads rejects empty/whitespace-only input, so skip such lines.
            if not line.strip():
                continue
            entry = json.loads(line)
            system_message = ""
            user_message = ""
            true_title = ""
            for message in entry["messages"]:
                if message["role"] == "system":
                    system_message = message["content"]
                elif message["role"] == "user":
                    user_message = message["content"]
                elif message["role"] == "assistant":
                    true_title = message["content"]

            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ]
            )
            # Guard against a completion without text content (None) before
            # flattening the title onto a single line.
            predicted_title = (response.choices[0].message.content or "").replace('\n', ' ')
            csv_writer.writerow([model, user_message, true_title, predicted_title])

generate_titles(
    model='ft:gpt-3.5-turbo-0125:personal:nyt-dataset:98g0EH1s',
    dataset_path='./data/test_limited.jsonl',
    output_csv_path='./data/predicted_titles.csv',
)
