<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/notebooks/gpt_4_1_mini_performances.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the evaluation CSVs read by the later
# cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Authenticate the Colab user.
# NOTE(review): nothing visible in this notebook uses these credentials
# directly - confirm this step is still required.
from google.colab import auth
auth.authenticate_user()

# IPython shell escape (notebook only): install/upgrade the OpenAI client
# library that the evaluation cells import.
!pip install --upgrade openai

In [None]:
import os
from getpass import getpass

# https://community.openai.com/t/google-colab-fine-tuning-error/5917
# The OpenAI client created in the cells below is constructed without an
# explicit key, so it relies on the OPENAI_API_KEY environment variable.
# Prompt for the key at runtime instead of hard-coding it, so the secret is
# never stored in the notebook source or its version history. A key that is
# already present in the environment is left untouched.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Training + validation: 1243

In [None]:
from time import sleep

import pandas as pd
from openai import OpenAI
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# Shared API client used by classify_sentiment(). No key is passed here, so
# it depends on the OPENAI_API_KEY environment variable set earlier.
client = OpenAI()

def read_csv_data(file_path):
    """Load the post text and label columns from a CSV file.

    Only the ``body`` and ``inflation`` columns are read, every value is kept
    as a Python object (``dtype='object'``), and the columns are renamed to
    ``content`` and ``label`` respectively.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns ``content`` and ``label``.
    """
    column_aliases = {'body': 'content', 'inflation': 'label'}
    return pd.read_csv(
        file_path,
        usecols=list(column_aliases),
        dtype='object',
        engine='python',
    ).rename(columns=column_aliases)

# Location of the evaluation CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-1243.csv'

# Load the data, drop rows that repeat the same (content, label) pair, then
# shuffle the row order and renumber the index from 0.
test_data_raw = read_csv_data(file_path)
test_data_deduplicated = test_data_raw.drop_duplicates(subset=['content', 'label'])
test_data_shuffled = test_data_deduplicated.sample(frac=1).reset_index(drop=True)

# Report the row count after each stage.
print(f"Input data count: {len(test_data_raw)}")
print(f"Deduplicated data count: {len(test_data_deduplicated)}")
print(f"Shuffled data count: {len(test_data_shuffled)}")

# Function to classify sentiment using OpenAI API
def classify_sentiment(text):
    """Classify one Reddit post with the fine-tuned model.

    Sends ``text`` as the user message together with the fixed system prompt
    that defines the categories 0 (deflation), 1 (neither) and 2 (inflation),
    prints the text and the raw reply, and parses the reply as a label.

    Args:
        text: Body of the Reddit post to classify.

    Returns:
        The predicted label (0, 1 or 2) as an int, or None when the reply is
        empty, is not an integer, or is not one of the three categories.
    """
    response = client.chat.completions.create(
        model="ft:gpt-4.1-mini-2025-04-14:university-of-tsukuba:2025-05-16:BfRHWjKq",
        messages=[
            {"role": "system", "content": "You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., “the prices are not bad”), affordable services (e.g., “this champagne is cheap and delicious”), sales information (e.g., “you can get it for only 10 dollars.”), or a declining and buyer’s  market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., “it’s not cheap”), the unreasonable cost of goods or services (e.g., “the food is overpriced and cold”), consumers struggling to afford necessities (e.g., “items are too expensive to buy”), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., “a gorgeous and costly dinner” or “an affordable Civic”), website promotion, authors’ wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1."},
            {"role": "user", "content": text},
        ],
    )

    # The message content is optional in the API response (it can be None),
    # so format with f-strings rather than "+" to avoid a TypeError here.
    raw_prediction = response.choices[0].message.content

    print(f"Text being evaluated: {text}")
    print(f"Prediction: {raw_prediction}")
    print("")
    try:
        label = int(raw_prediction)
    except (TypeError, ValueError):
        # TypeError covers a missing (None) reply, ValueError a non-numeric one.
        print(f"Error: Unable to convert '{raw_prediction}' to int. Skipping this prediction.")
        return None

    # The prompt only defines categories 0, 1 and 2; any other integer is not
    # a usable prediction and would otherwise appear as an extra class.
    if label not in (0, 1, 2):
        print(f"Error: Prediction '{raw_prediction}' is not one of the labels 0, 1, 2. Skipping this prediction.")
        return None
    return label

# Run the classifier over every test example, then score the predictions.
true_labels = []
predicted_labels = []
# Examples left out of the metrics (unusable row or unparseable prediction).
skipped_examples = 0

for _, example in test_data_shuffled.iterrows():
    text = example['content']
    raw_label = example['label']

    print("\n--- New Evaluation ---")
    print(f"Text: {text}")

    # The CSV columns are read as objects, so an empty cell arrives as NaN
    # (a float). Skip such rows instead of aborting the whole run part-way
    # through the API calls.
    try:
        true_label = int(raw_label)
    except (TypeError, ValueError):
        print(f"Error: Invalid ground-truth label '{raw_label}'. Skipping this example.")
        skipped_examples += 1
        continue
    if not isinstance(text, str):
        print("Error: Post text is missing. Skipping this example.")
        skipped_examples += 1
        continue

    print(f"Ground Truth: {raw_label}")

    # Pause between requests to stay under the API rate limit.
    sleep(2)
    predicted_label = classify_sentiment(text)

    if predicted_label is None:
        skipped_examples += 1
        continue
    predicted_labels.append(predicted_label)
    true_labels.append(true_label)

# Calculate and display accuracy, recall, precision, and F1 score
if true_labels and predicted_labels:
    # Per-class scores are ordered by the sorted set of labels that occur in
    # either list. Keep that list so each table row shows the real class
    # label rather than a positional index (they differ when a class is absent).
    class_labels = sorted(set(true_labels) | set(predicted_labels))

    accuracy = accuracy_score(true_labels, predicted_labels)
    # Added zero_division=0 to handle cases where a class might not be predicted
    recall = recall_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)

    print("\n=== EVALUATION RESULTS ===")
    # Skipped examples are excluded from every metric below, so report them.
    print(f"Examples scored: {len(true_labels)} (skipped: {skipped_examples})")
    print("Classification Report:")
    print(classification_report(true_labels, predicted_labels, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels))

    macro_avg_precision = precision.mean()
    macro_avg_recall = recall.mean()
    macro_avg_f1 = f1.mean()

    # Micro average for precision, recall, F1 is equivalent to accuracy in multiclass
    micro_avg_precision = precision_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_avg_recall = recall_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_avg_f1 = f1_score(true_labels, predicted_labels, average='micro', zero_division=0)

    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for label, class_recall, class_precision, class_f1 in zip(class_labels, recall, precision, f1):
        print(f"| Class {label}      |    {accuracy:.2f}   |   {class_recall:.2f}   |   {class_precision:.2f}   |   {class_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {macro_avg_recall:.2f}   |   {macro_avg_precision:.2f}   |   {macro_avg_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_avg_recall:.2f}   |   {micro_avg_precision:.2f}   |   {micro_avg_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
else:
    print("\nNo predictions were made, or no valid labels were found. Skipping evaluation metrics.")

In [None]:
from time import sleep

import pandas as pd
from openai import OpenAI
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# Shared API client used by classify_sentiment(). No key is passed here, so
# it depends on the OPENAI_API_KEY environment variable set earlier.
client = OpenAI()

def read_csv_data(file_path):
    """Load the post text and label columns from a CSV file.

    Only the ``body`` and ``inflation`` columns are read, every value is kept
    as a Python object (``dtype='object'``), and the columns are renamed to
    ``content`` and ``label`` respectively.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns ``content`` and ``label``.
    """
    column_aliases = {'body': 'content', 'inflation': 'label'}
    return pd.read_csv(
        file_path,
        usecols=list(column_aliases),
        dtype='object',
        engine='python',
    ).rename(columns=column_aliases)

# Location of the evaluation CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Load the data, drop rows that repeat the same (content, label) pair, then
# shuffle the row order and renumber the index from 0.
test_data_raw = read_csv_data(file_path)
test_data_deduplicated = test_data_raw.drop_duplicates(subset=['content', 'label'])
test_data_shuffled = test_data_deduplicated.sample(frac=1).reset_index(drop=True)

# Report the row count after each stage.
print(f"Input data count: {len(test_data_raw)}")
print(f"Deduplicated data count: {len(test_data_deduplicated)}")
print(f"Shuffled data count: {len(test_data_shuffled)}")

# Function to classify sentiment using OpenAI API
def classify_sentiment(text):
    """Classify one Reddit post with the fine-tuned model.

    Sends ``text`` as the user message together with the fixed system prompt
    that defines the categories 0 (deflation), 1 (neither) and 2 (inflation),
    prints the text and the raw reply, and parses the reply as a label.

    Args:
        text: Body of the Reddit post to classify.

    Returns:
        The predicted label (0, 1 or 2) as an int, or None when the reply is
        empty, is not an integer, or is not one of the three categories.
    """
    response = client.chat.completions.create(
        model="ft:gpt-4.1-mini-2025-04-14:university-of-tsukuba:2025-05-16:BfRHWjKq",
        messages=[
            {"role": "system", "content": "You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., “the prices are not bad”), affordable services (e.g., “this champagne is cheap and delicious”), sales information (e.g., “you can get it for only 10 dollars.”), or a declining and buyer’s  market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., “it’s not cheap”), the unreasonable cost of goods or services (e.g., “the food is overpriced and cold”), consumers struggling to afford necessities (e.g., “items are too expensive to buy”), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., “a gorgeous and costly dinner” or “an affordable Civic”), website promotion, authors’ wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1."},
            {"role": "user", "content": text},
        ],
    )

    # The message content is optional in the API response (it can be None),
    # so format with f-strings rather than "+" to avoid a TypeError here.
    raw_prediction = response.choices[0].message.content

    print(f"Text being evaluated: {text}")
    print(f"Prediction: {raw_prediction}")
    print("")
    try:
        label = int(raw_prediction)
    except (TypeError, ValueError):
        # TypeError covers a missing (None) reply, ValueError a non-numeric one.
        print(f"Error: Unable to convert '{raw_prediction}' to int. Skipping this prediction.")
        return None

    # The prompt only defines categories 0, 1 and 2; any other integer is not
    # a usable prediction and would otherwise appear as an extra class.
    if label not in (0, 1, 2):
        print(f"Error: Prediction '{raw_prediction}' is not one of the labels 0, 1, 2. Skipping this prediction.")
        return None
    return label

# Run the classifier over every test example, then score the predictions.
true_labels = []
predicted_labels = []
# Examples left out of the metrics (unusable row or unparseable prediction).
skipped_examples = 0

for _, example in test_data_shuffled.iterrows():
    text = example['content']
    raw_label = example['label']

    print("\n--- New Evaluation ---")
    print(f"Text: {text}")

    # The CSV columns are read as objects, so an empty cell arrives as NaN
    # (a float). Skip such rows instead of aborting the whole run part-way
    # through the API calls.
    try:
        true_label = int(raw_label)
    except (TypeError, ValueError):
        print(f"Error: Invalid ground-truth label '{raw_label}'. Skipping this example.")
        skipped_examples += 1
        continue
    if not isinstance(text, str):
        print("Error: Post text is missing. Skipping this example.")
        skipped_examples += 1
        continue

    print(f"Ground Truth: {raw_label}")

    # Pause between requests to stay under the API rate limit.
    sleep(2)
    predicted_label = classify_sentiment(text)

    if predicted_label is None:
        skipped_examples += 1
        continue
    predicted_labels.append(predicted_label)
    true_labels.append(true_label)

# Calculate and display accuracy, recall, precision, and F1 score
if true_labels and predicted_labels:
    # Per-class scores are ordered by the sorted set of labels that occur in
    # either list. Keep that list so each table row shows the real class
    # label rather than a positional index (they differ when a class is absent).
    class_labels = sorted(set(true_labels) | set(predicted_labels))

    accuracy = accuracy_score(true_labels, predicted_labels)
    # Added zero_division=0 to handle cases where a class might not be predicted
    recall = recall_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)

    print("\n=== EVALUATION RESULTS ===")
    # Skipped examples are excluded from every metric below, so report them.
    print(f"Examples scored: {len(true_labels)} (skipped: {skipped_examples})")
    print("Classification Report:")
    print(classification_report(true_labels, predicted_labels, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels))

    macro_avg_precision = precision.mean()
    macro_avg_recall = recall.mean()
    macro_avg_f1 = f1.mean()

    # Micro average for precision, recall, F1 is equivalent to accuracy in multiclass
    micro_avg_precision = precision_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_avg_recall = recall_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_avg_f1 = f1_score(true_labels, predicted_labels, average='micro', zero_division=0)

    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for label, class_recall, class_precision, class_f1 in zip(class_labels, recall, precision, f1):
        print(f"| Class {label}      |    {accuracy:.2f}   |   {class_recall:.2f}   |   {class_precision:.2f}   |   {class_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {macro_avg_recall:.2f}   |   {macro_avg_precision:.2f}   |   {macro_avg_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_avg_recall:.2f}   |   {micro_avg_precision:.2f}   |   {micro_avg_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
else:
    print("\nNo predictions were made, or no valid labels were found. Skipping evaluation metrics.")

# Training + validation: 65

In [None]:
from time import sleep

import pandas as pd
from openai import OpenAI
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# Shared API client used by classify_sentiment(). No key is passed here, so
# it depends on the OPENAI_API_KEY environment variable set earlier.
client = OpenAI()

def read_csv_data(file_path):
    """Load the post text and label columns from a CSV file.

    Only the ``body`` and ``inflation`` columns are read, every value is kept
    as a Python object (``dtype='object'``), and the columns are renamed to
    ``content`` and ``label`` respectively.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns ``content`` and ``label``.
    """
    column_aliases = {'body': 'content', 'inflation': 'label'}
    return pd.read_csv(
        file_path,
        usecols=list(column_aliases),
        dtype='object',
        engine='python',
    ).rename(columns=column_aliases)

# Location of the evaluation CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Load the data, drop rows that repeat the same (content, label) pair, then
# shuffle the row order and renumber the index from 0.
test_data_raw = read_csv_data(file_path)
test_data_deduplicated = test_data_raw.drop_duplicates(subset=['content', 'label'])
test_data_shuffled = test_data_deduplicated.sample(frac=1).reset_index(drop=True)

# Report the row count after each stage.
print(f"Input data count: {len(test_data_raw)}")
print(f"Deduplicated data count: {len(test_data_deduplicated)}")
print(f"Shuffled data count: {len(test_data_shuffled)}")

# Function to classify sentiment using OpenAI API
def classify_sentiment(text):
    """Classify one Reddit post with the fine-tuned model.

    Sends ``text`` as the user message together with the fixed system prompt
    that defines the categories 0 (deflation), 1 (neither) and 2 (inflation),
    prints the text and the raw reply, and parses the reply as a label.

    Args:
        text: Body of the Reddit post to classify.

    Returns:
        The predicted label (0, 1 or 2) as an int, or None when the reply is
        empty, is not an integer, or is not one of the three categories.
    """
    response = client.chat.completions.create(
        model="ft:gpt-4.1-mini-2025-04-14:university-of-tsukuba:2025-06-07:Bfpj9sSP",
        messages=[
            {"role": "system", "content": "You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., “the prices are not bad”), affordable services (e.g., “this champagne is cheap and delicious”), sales information (e.g., “you can get it for only 10 dollars.”), or a declining and buyer’s  market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., “it’s not cheap”), the unreasonable cost of goods or services (e.g., “the food is overpriced and cold”), consumers struggling to afford necessities (e.g., “items are too expensive to buy”), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., “a gorgeous and costly dinner” or “an affordable Civic”), website promotion, authors’ wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1."},
            {"role": "user", "content": text},
        ],
    )

    # The message content is optional in the API response (it can be None),
    # so format with f-strings rather than "+" to avoid a TypeError here.
    raw_prediction = response.choices[0].message.content

    print(f"Text being evaluated: {text}")
    print(f"Prediction: {raw_prediction}")
    print("")
    try:
        label = int(raw_prediction)
    except (TypeError, ValueError):
        # TypeError covers a missing (None) reply, ValueError a non-numeric one.
        print(f"Error: Unable to convert '{raw_prediction}' to int. Skipping this prediction.")
        return None

    # The prompt only defines categories 0, 1 and 2; any other integer is not
    # a usable prediction and would otherwise appear as an extra class.
    if label not in (0, 1, 2):
        print(f"Error: Prediction '{raw_prediction}' is not one of the labels 0, 1, 2. Skipping this prediction.")
        return None
    return label

# Run the classifier over every test example, then score the predictions.
true_labels = []
predicted_labels = []
# Examples left out of the metrics (unusable row or unparseable prediction).
skipped_examples = 0

for _, example in test_data_shuffled.iterrows():
    text = example['content']
    raw_label = example['label']

    print("\n--- New Evaluation ---")
    print(f"Text: {text}")

    # The CSV columns are read as objects, so an empty cell arrives as NaN
    # (a float). Skip such rows instead of aborting the whole run part-way
    # through the API calls.
    try:
        true_label = int(raw_label)
    except (TypeError, ValueError):
        print(f"Error: Invalid ground-truth label '{raw_label}'. Skipping this example.")
        skipped_examples += 1
        continue
    if not isinstance(text, str):
        print("Error: Post text is missing. Skipping this example.")
        skipped_examples += 1
        continue

    print(f"Ground Truth: {raw_label}")

    # Pause between requests to stay under the API rate limit.
    sleep(2)
    predicted_label = classify_sentiment(text)

    if predicted_label is None:
        skipped_examples += 1
        continue
    predicted_labels.append(predicted_label)
    true_labels.append(true_label)

# Calculate and display accuracy, recall, precision, and F1 score
if true_labels and predicted_labels:
    # Per-class scores are ordered by the sorted set of labels that occur in
    # either list. Keep that list so each table row shows the real class
    # label rather than a positional index (they differ when a class is absent).
    class_labels = sorted(set(true_labels) | set(predicted_labels))

    accuracy = accuracy_score(true_labels, predicted_labels)
    # Added zero_division=0 to handle cases where a class might not be predicted
    recall = recall_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)

    print("\n=== EVALUATION RESULTS ===")
    # Skipped examples are excluded from every metric below, so report them.
    print(f"Examples scored: {len(true_labels)} (skipped: {skipped_examples})")
    print("Classification Report:")
    print(classification_report(true_labels, predicted_labels, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels))

    macro_avg_precision = precision.mean()
    macro_avg_recall = recall.mean()
    macro_avg_f1 = f1.mean()

    # Micro average for precision, recall, F1 is equivalent to accuracy in multiclass
    micro_avg_precision = precision_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_avg_recall = recall_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_avg_f1 = f1_score(true_labels, predicted_labels, average='micro', zero_division=0)

    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for label, class_recall, class_precision, class_f1 in zip(class_labels, recall, precision, f1):
        print(f"| Class {label}      |    {accuracy:.2f}   |   {class_recall:.2f}   |   {class_precision:.2f}   |   {class_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {macro_avg_recall:.2f}   |   {macro_avg_precision:.2f}   |   {macro_avg_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_avg_recall:.2f}   |   {micro_avg_precision:.2f}   |   {micro_avg_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
else:
    print("\nNo predictions were made, or no valid labels were found. Skipping evaluation metrics.")

# Zero-shot

In [None]:
from time import sleep

import pandas as pd
from openai import OpenAI
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# Shared API client used by classify_sentiment(). No key is passed here, so
# it depends on the OPENAI_API_KEY environment variable set earlier.
client = OpenAI()

def read_csv_data(file_path):
    """Load the post text and label columns from a CSV file.

    Only the ``body`` and ``inflation`` columns are read, every value is kept
    as a Python object (``dtype='object'``), and the columns are renamed to
    ``content`` and ``label`` respectively.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A DataFrame with the columns ``content`` and ``label``.
    """
    column_aliases = {'body': 'content', 'inflation': 'label'}
    return pd.read_csv(
        file_path,
        usecols=list(column_aliases),
        dtype='object',
        engine='python',
    ).rename(columns=column_aliases)

# Location of the evaluation CSV on the mounted Google Drive.
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Load the data, drop rows that repeat the same (content, label) pair, then
# shuffle the row order and renumber the index from 0.
test_data_raw = read_csv_data(file_path)
test_data_deduplicated = test_data_raw.drop_duplicates(subset=['content', 'label'])
test_data_shuffled = test_data_deduplicated.sample(frac=1).reset_index(drop=True)

# Report the row count after each stage.
print(f"Input data count: {len(test_data_raw)}")
print(f"Deduplicated data count: {len(test_data_deduplicated)}")
print(f"Shuffled data count: {len(test_data_shuffled)}")

# Function to classify sentiment using OpenAI API
def classify_sentiment(text):
    """Classify one Reddit post with the base (not fine-tuned) model.

    Sends ``text`` as the user message together with the fixed system prompt
    that defines the categories 0 (deflation), 1 (neither) and 2 (inflation),
    prints the text and the raw reply, and parses the reply as a label.

    Args:
        text: Body of the Reddit post to classify.

    Returns:
        The predicted label (0, 1 or 2) as an int, or None when the reply is
        empty, is not an integer, or is not one of the three categories.
    """
    response = client.chat.completions.create(
        model="gpt-4.1-mini-2025-04-14",
        messages=[
            {"role": "system", "content": "You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., “the prices are not bad”), affordable services (e.g., “this champagne is cheap and delicious”), sales information (e.g., “you can get it for only 10 dollars.”), or a declining and buyer’s  market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., “it’s not cheap”), the unreasonable cost of goods or services (e.g., “the food is overpriced and cold”), consumers struggling to afford necessities (e.g., “items are too expensive to buy”), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., “a gorgeous and costly dinner” or “an affordable Civic”), website promotion, authors’ wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1."},
            {"role": "user", "content": text},
        ],
    )

    # The message content is optional in the API response (it can be None),
    # so format with f-strings rather than "+" to avoid a TypeError here.
    raw_prediction = response.choices[0].message.content

    # Print the text being evaluated
    print(f"Text being evaluated: {text}")
    print(f"Prediction: {raw_prediction}")
    print("")
    try:
        label = int(raw_prediction)
    except (TypeError, ValueError):
        # TypeError covers a missing (None) reply, ValueError a non-numeric one.
        print(f"Error: Unable to convert '{raw_prediction}' to int. Skipping this prediction.")
        return None

    # The prompt only defines categories 0, 1 and 2; any other integer is not
    # a usable prediction and would otherwise appear as an extra class.
    if label not in (0, 1, 2):
        print(f"Error: Prediction '{raw_prediction}' is not one of the labels 0, 1, 2. Skipping this prediction.")
        return None
    return label

# Run the classifier over every test example, then score the predictions.
true_labels = []
predicted_labels = []
# Examples left out of the metrics (unusable row or unparseable prediction).
skipped_examples = 0

for _, example in test_data_shuffled.iterrows():
    text = example['content']
    raw_label = example['label']

    print("\n--- New Evaluation ---")
    print(f"Text: {text}")

    # The CSV columns are read as objects, so an empty cell arrives as NaN
    # (a float). Skip such rows instead of aborting the whole run part-way
    # through the API calls.
    try:
        true_label = int(raw_label)
    except (TypeError, ValueError):
        print(f"Error: Invalid ground-truth label '{raw_label}'. Skipping this example.")
        skipped_examples += 1
        continue
    if not isinstance(text, str):
        print("Error: Post text is missing. Skipping this example.")
        skipped_examples += 1
        continue

    print(f"Ground Truth: {raw_label}")

    # Pause between requests to stay under the API rate limit.
    sleep(2)
    predicted_label = classify_sentiment(text)

    if predicted_label is None:
        skipped_examples += 1
        continue
    predicted_labels.append(predicted_label)
    true_labels.append(true_label)

# Calculate and display accuracy, recall, precision, and F1 score
if true_labels and predicted_labels:
    # Per-class scores are ordered by the sorted set of labels that occur in
    # either list. Keep that list so each table row shows the real class
    # label rather than a positional index (they differ when a class is absent).
    class_labels = sorted(set(true_labels) | set(predicted_labels))

    accuracy = accuracy_score(true_labels, predicted_labels)
    # Added zero_division=0 to handle cases where a class might not be predicted
    recall = recall_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)

    print("\n=== EVALUATION RESULTS ===")
    # Skipped examples are excluded from every metric below, so report them.
    print(f"Examples scored: {len(true_labels)} (skipped: {skipped_examples})")
    print("Classification Report:")
    print(classification_report(true_labels, predicted_labels, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels))

    macro_avg_precision = precision.mean()
    macro_avg_recall = recall.mean()
    macro_avg_f1 = f1.mean()

    # Micro average for precision, recall, F1 is equivalent to accuracy in multiclass
    micro_avg_precision = precision_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_avg_recall = recall_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_avg_f1 = f1_score(true_labels, predicted_labels, average='micro', zero_division=0)

    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for label, class_recall, class_precision, class_f1 in zip(class_labels, recall, precision, f1):
        print(f"| Class {label}      |    {accuracy:.2f}   |   {class_recall:.2f}   |   {class_precision:.2f}   |   {class_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {macro_avg_recall:.2f}   |   {macro_avg_precision:.2f}   |   {macro_avg_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_avg_recall:.2f}   |   {micro_avg_precision:.2f}   |   {micro_avg_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
else:
    print("\nNo predictions were made, or no valid labels were found. Skipping evaluation metrics.")