<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/src/convert_tsv_to_jsonl_for_gemini_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that the paths under
# /content/drive/MyDrive used by the following cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import os

# --- Configuration ---
# Google Drive folder that holds the source CSV files. EDIT THIS PATH to match
# your own Drive layout; the generated JSONL files are written to the same
# folder.
DRIVE_FOLDER_PATH = '/content/drive/MyDrive/world-inflation/data/reddit/production/'

# --- Derived file paths ---
# Each pair is (source CSV, JSONL generated from it), both inside the folder
# configured above.
TRAINING_CSV_FILE, OUTPUT_TRAINING_JSONL = (
    os.path.join(DRIVE_FOLDER_PATH, file_name)
    for file_name in ('training-data-1040.csv', 'training-data-1040-for-gemini.jsonl')
)
VALIDATION_CSV_FILE, OUTPUT_VALIDATION_JSONL = (
    os.path.join(DRIVE_FOLDER_PATH, file_name)
    for file_name in ('validation-data-1040.csv', 'validation-data-1040-for-gemini.jsonl')
)


# --- The Prompt Template ---
# This is the detailed prompt you want to bake into your data.
def apply_economist_prompt_template(text):
    """Build the IMF-economist classification prompt around one Reddit post.

    The prompt describes the three labels (0 = deflation, 1 = neither,
    2 = inflation), states the tie-break rule, embeds ``text`` in single
    quotes under a "Reddit Post:" heading, and ends with the
    "Classification (0, 1, or 2):" cue.

    Args:
        text: The post body. It is interpolated as-is, with no escaping.

    Returns:
        The complete prompt as one newline-separated string.
    """
    role_and_task = "You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories:"
    # Kept in the order 0, 2, 1: the "neither" rule refers back to the other two.
    label_rules = [
        "0: The post indicates deflation, such as the lower price of goods or services (e.g., “the prices are not bad”), affordable services (e.g., “this champagne is cheap and delicious”), sales information (e.g., “you can get it for only 10 dollars.”), or a declining and buyer’s market.",
        "2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., “it’s not cheap”), the unreasonable cost of goods or services (e.g., “the food is overpriced and cold”), consumers struggling to afford necessities (e.g., “items are too expensive to buy”), shortage of goods of services, or mention about an asset bubble.",
        "1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., “a gorgeous and costly dinner” or “an affordable Civic”), website promotion, authors’ wishes, or illogical text.",
    ]
    tie_break_rule = "Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1."
    quoted_post = f"'{text}'"

    # Empty strings produce the blank separator lines of the prompt.
    prompt_lines = [
        role_and_task,
        *label_rules,
        "",
        tie_break_rule,
        "",
        "Reddit Post:",
        quoted_post,
        "",
        "Classification (0, 1, or 2):",
    ]
    return "\n".join(prompt_lines)

# --- Main Conversion Logic ---
def convert_csv_to_prompted_jsonl(source_csv_path, output_jsonl_path):
    """Convert a labelled CSV into a prompt/label JSONL file.

    The CSV must contain a 'body' column (post text) and an 'inflation'
    column (label). Each 'body' value is wrapped with
    ``apply_economist_prompt_template``; the two columns are then renamed to
    'input_text' and 'output' and the frame is written as one JSON object per
    line. Any other CSV columns are written through unchanged.

    Errors are reported with ``print`` rather than raised, so a failure on one
    file does not stop the notebook cell. No output file is written when the
    source is missing or lacks a required column.

    Args:
        source_csv_path: Path of the CSV file to read.
        output_jsonl_path: Path of the JSONL file to create or overwrite.

    Returns:
        None.
    """
    required_columns = ('body', 'inflation')
    try:
        # Read the source CSV from Google Drive
        df = pd.read_csv(source_csv_path)
        print(f"Reading '{source_csv_path}' with {len(df)} rows...")

        # Fail loudly on a wrong schema: without this check a missing
        # 'inflation' column would silently yield records with no 'output'
        # field, and a missing 'body' column would surface only as "'body'".
        missing_columns = [name for name in required_columns if name not in df.columns]
        if missing_columns:
            print(
                f"ERROR: '{source_csv_path}' is missing required column(s): "
                f"{', '.join(missing_columns)}"
            )
            return

        # Apply the prompt template to the 'body' column
        df['body'] = df['body'].apply(apply_economist_prompt_template)

        # Rename columns to what the API expects
        df_renamed = df.rename(columns={'body': 'input_text', 'inflation': 'output'})

        # Save the result to a JSONL file in Google Drive
        df_renamed.to_json(output_jsonl_path, orient='records', lines=True, force_ascii=False)
        print(f"-> Successfully created '{output_jsonl_path}'")

    except FileNotFoundError:
        print(f"ERROR: File not found -> '{source_csv_path}'. Please make sure the path and filename are correct.")
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller move
        # on to the next file.
        print(f"An error occurred: {e}")

# --- Run the conversion for both files ---
# Training and validation go through the same converter; each pair is
# (source CSV, JSONL to generate).
print("\nStarting conversion process...")
for csv_path, jsonl_path in (
    (TRAINING_CSV_FILE, OUTPUT_TRAINING_JSONL),
    (VALIDATION_CSV_FILE, OUTPUT_VALIDATION_JSONL),
):
    convert_csv_to_prompted_jsonl(csv_path, jsonl_path)
print("\nConversion complete!")
print("The final .jsonl files have been saved to your specified Google Drive folder.")

In [None]:
import pandas as pd
import os
import json # <-- Added the json library for correct file writing

# --- Configuration ---
# Google Drive folder that contains the source CSV files; the generated JSONL
# files are written to this same folder.
DRIVE_FOLDER_PATH = '/content/drive/MyDrive/world-inflation/data/reddit/production/'

# --- Input CSV files ---
TRAINING_CSV_FILE, VALIDATION_CSV_FILE = (
    os.path.join(DRIVE_FOLDER_PATH, csv_name)
    for csv_name in ('training-data-65.csv', 'validation-data-65.csv')
)

# --- Output JSONL files (same order: training, then validation) ---
OUTPUT_TRAINING_JSONL, OUTPUT_VALIDATION_JSONL = (
    os.path.join(DRIVE_FOLDER_PATH, jsonl_name)
    for jsonl_name in ('training-data-65-for-gemini.jsonl', 'validation-data-65-for-gemini.jsonl')
)


# --- The Prompt Template (No changes here) ---
# This is the detailed prompt you want to bake into your data.
def apply_economist_prompt_template(text):
    """Return the IMF-economist classification prompt for one Reddit post.

    The fixed instructions define the labels 0 (deflation), 2 (inflation) and
    1 (neither) plus the tie-break rule. ``text`` is inserted, unescaped,
    between single quotes below the "Reddit Post:" heading, and the prompt
    closes with the "Classification (0, 1, or 2):" cue.

    Args:
        text: The post body to classify.

    Returns:
        The full prompt string.
    """
    # Plain (non-f) template whose only replacement field is {text}; its
    # continuation lines start at column 0 so no indentation leaks into the
    # prompt.
    prompt_template = """You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories:
0: The post indicates deflation, such as the lower price of goods or services (e.g., “the prices are not bad”), affordable services (e.g., “this champagne is cheap and delicious”), sales information (e.g., “you can get it for only 10 dollars.”), or a declining and buyer’s market.
2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., “it’s not cheap”), the unreasonable cost of goods or services (e.g., “the food is overpriced and cold”), consumers struggling to afford necessities (e.g., “items are too expensive to buy”), shortage of goods of services, or mention about an asset bubble.
1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., “a gorgeous and costly dinner” or “an affordable Civic”), website promotion, authors’ wishes, or illogical text.

Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1.

Reddit Post:
'{text}'

Classification (0, 1, or 2):"""
    return prompt_template.format(text=text)

# --- MODIFIED Main Conversion Logic ---
# This function now creates the required 'contents' field structure.
def convert_csv_to_gemini_jsonl(source_csv_path, output_jsonl_path):
    """Convert a labelled CSV into Gemini conversational JSONL.

    The CSV must contain a 'body' column (post text) and an 'inflation'
    column (label). Every usable row becomes one line of the form
    ``{"contents": [{"role": "user", ...}, {"role": "model", ...}]}``, where
    the user part is the text wrapped by ``apply_economist_prompt_template``
    and the model part is the label as a string.

    Rows whose 'body' or 'inflation' value is missing are skipped and counted
    in a printed note, instead of being exported as the literal text 'nan'.
    Whole-number float labels (pandas loads the column as floats as soon as
    one label is missing) are written as "1", not "1.0".

    All records are built before the output file is opened, so a failure
    never leaves an empty or truncated JSONL file behind. Errors are reported
    with ``print`` rather than raised, so a failure on one file does not stop
    the notebook cell.

    Args:
        source_csv_path: Path of the CSV file to read.
        output_jsonl_path: Path of the JSONL file to create or overwrite.

    Returns:
        None.
    """
    required_columns = ('body', 'inflation')
    try:
        df = pd.read_csv(source_csv_path)
        print(f"Reading '{source_csv_path}' with {len(df)} rows...")

        # Refuse a wrong schema instead of emitting empty prompts and labels.
        missing_columns = [name for name in required_columns if name not in df.columns]
        if missing_columns:
            print(
                f"ERROR: '{source_csv_path}' is missing required column(s): "
                f"{', '.join(missing_columns)}"
            )
            return

        jsonl_lines = []
        skipped_rows = 0
        # Iterating the two columns directly keeps each column's own dtype,
        # which row-wise iteration does not guarantee.
        for raw_text, raw_label in zip(df['body'], df['inflation']):
            if pd.isna(raw_text) or pd.isna(raw_label):
                skipped_rows += 1
                continue

            # Undo the int -> float promotion caused by missing labels
            # elsewhere in the column, so the label matches "0, 1, or 2".
            if isinstance(raw_label, float) and raw_label.is_integer():
                raw_label = int(raw_label)

            # Build the required dictionary structure with the 'contents' field
            record = {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": apply_economist_prompt_template(raw_text)}]
                    },
                    {
                        "role": "model",
                        "parts": [{"text": str(raw_label)}]
                    }
                ]
            }
            jsonl_lines.append(json.dumps(record, ensure_ascii=False) + '\n')

        # Only touch the output once every record has been built successfully.
        with open(output_jsonl_path, 'w', encoding='utf-8') as f:
            f.writelines(jsonl_lines)

        if skipped_rows:
            print(f"Skipped {skipped_rows} row(s) with a missing 'body' or 'inflation' value.")
        print(f"-> Successfully created '{output_jsonl_path}'")

    except FileNotFoundError:
        print(f"ERROR: File not found -> '{source_csv_path}'. Please make sure the path and filename are correct.")
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller move
        # on to the next file.
        print(f"An error occurred: {e}")

# --- Run the conversion for both files ---
# Both splits use the Gemini-format converter; each entry is
# (source CSV, JSONL to generate).
print("\nStarting conversion process to the correct Gemini format...")
for split_paths in [
    (TRAINING_CSV_FILE, OUTPUT_TRAINING_JSONL),
    (VALIDATION_CSV_FILE, OUTPUT_VALIDATION_JSONL),
]:
    convert_csv_to_gemini_jsonl(*split_paths)
print("\nConversion complete!")
print("The final .jsonl files have been saved to your specified Google Drive folder.")
print("You can now upload these files to Google Cloud Storage for tuning.")