<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/src/convert_tsv_to_jsonl_for_gpt4_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime so that the input/output paths
# under /content/drive used later in this notebook are accessible.
from google.colab import drive
drive.mount('/content/drive')

# Upgrade the OpenAI client library in the runtime.
# NOTE(review): `openai` is not imported in the cells visible here; presumably
# it is used by a later step of the fine-tuning workflow - confirm.
!pip install --upgrade openai

# Make sure pandas is available; it is used below to load the input data.
!pip install pandas

In [None]:
import json
import pandas as pd

# ☆ (presumably marks a value to edit for each run - confirm)
# Path to the input CSV file on Google Drive.
# NOTE: despite the `tsv_` prefix in the variable name, this points to a
# comma-separated (.csv) file and is parsed as comma-separated data further down.
tsv_file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/validation-data-65.csv'

# ☆ (presumably marks a value to edit for each run - confirm)
# Path where the converted JSONL file is written (one JSON object per line).
jsonl_file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/validation-data-65.jsonl'

def convert_to_new_format(prompt, completion, system_prompt=None):
    """Build one chat-style training example from a prompt/label pair.

    Args:
        prompt: Text of the post; stored unchanged as the ``user`` message.
        completion: Label for the post; stored as the ``assistant`` message
            after conversion with ``str()``. Integral floats (e.g. ``2.0``)
            are written without the fractional part (``"2"``).
        system_prompt: Optional replacement for the ``system`` message. When
            ``None`` (the default), the built-in inflation-classification
            instruction is used.

    Returns:
        A dict of the form ``{"messages": [system, user, assistant]}`` where
        each message is ``{"role": ..., "content": ...}``.
    """
    if system_prompt is None:
        # Default instruction, kept verbatim (including its original wording
        # and spacing) so generated examples stay identical to earlier runs.
        system_prompt = "You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., “the prices are not bad”), affordable services (e.g., “this champagne is cheap and delicious”), sales information (e.g., “you can get it for only 10 dollars.”), or a declining and buyer’s  market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., “it’s not cheap”), the unreasonable cost of goods or services (e.g., “the food is overpriced and cold”), consumers struggling to afford necessities (e.g., “items are too expensive to buy”), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., “a gorgeous and costly dinner” or “an affordable Civic”), website promotion, authors’ wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1."

    # pandas turns an integer column into float64 as soon as it contains a
    # missing value, which would make the label "2.0" instead of "2" and no
    # longer match the 0/1/2 categories named in the system prompt.
    if isinstance(completion, float) and completion.is_integer():
        completion = int(completion)

    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
            # str() only converts the label to text; the surrounding double
            # quotes appear when the example is serialised to JSON.
            {"role": "assistant", "content": str(completion)},
        ]
    }

# Load the comma-separated input, keeping only the post text ('body') and its
# label ('inflation'). read_csv is the direct equivalent of read_table(sep=',').
df = pd.read_csv(tsv_file_path, usecols=['body', 'inflation'])

# Rename columns to the names used in the rest of this cell
df = df.rename(columns={'body': 'Prompt', 'inflation': 'Completion'})

# Drop rows with a missing post body or label. A missing body would be
# serialised as the bare token NaN (not valid JSON) and a missing label as the
# text "nan", so such rows cannot be used as training examples.
raw_count = len(df)
df = df.dropna(subset=['Prompt', 'Completion'])
missing_count = raw_count - len(df)
if missing_count:
    print(f"Dropped rows with missing Prompt/Completion: {missing_count}")

# Report the number of rows before deduplication
before_dedup_count = len(df)
print(f"Input data count (before deduplication): {before_dedup_count}")
print(f"Count before drop_duplicates: {before_dedup_count}")

# Remove rows that repeat the same (Prompt, Completion) pair
df = df.drop_duplicates(subset=['Prompt', 'Completion'])

# Report the number of rows after deduplication
after_dedup_count = len(df)
print(f"Count after drop_duplicates: {after_dedup_count}")
print(f"Output data count (after deduplication): {after_dedup_count}")

# Shuffle the rows; the fixed seed keeps the order reproducible between runs
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Convert to JSONL format and save.
# The encoding is set explicitly because ensure_ascii=False writes non-ASCII
# characters as-is; relying on the platform default encoding could fail or
# produce a file that is not UTF-8.
with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
    for prompt_text, completion_text in zip(df['Prompt'], df['Completion']):
        new_data = convert_to_new_format(prompt_text, completion_text)

        # One JSON object per line
        jsonl_file.write(json.dumps(new_data, ensure_ascii=False) + '\n')

print(f"Conversion completed. JSONL file saved at: {jsonl_file_path}")

# Display the top 10 data. Only the first ten rows are materialised instead of
# collecting every row while writing.
preview = df.head(10)
top_10_data = list(zip(preview['Prompt'], preview['Completion']))
print("Top 10 Data:")
for i, (prompt, completion) in enumerate(top_10_data):
    print(f"{i + 1}. Prompt: {prompt}\n   Completion: {completion}\n")