<a href="https://colab.research.google.com/github/RyuichiSaito1/covid19-twitter-usa-restoring/blob/main/gpt_3_5_turbo_accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install (or upgrade to) the newest openai package; later cells import the `OpenAI` client class from it.
!pip install --upgrade openai

# Mount Google Drive at /content/drive; the test-data paths used in later cells live under that directory.
from google.colab import drive
drive.mount('/content/drive')

# Run Colab's interactive user authentication.
# NOTE(review): nothing visible in this notebook uses the resulting credentials - confirm it is still required.
from google.colab import auth
auth.authenticate_user()

In [None]:
import os

# https://community.openai.com/t/google-colab-fine-tuning-error/5917
# Set the OpenAI API key here (e.g. through the OPENAI_API_KEY environment variable); never commit the key itself.

# GPT-3.5 Turbo Fine-tuned model

In [None]:
from openai import OpenAI
client = OpenAI()
import pandas as pd
from time import sleep
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

# Function to read data from TSV file using pandas
def read_tsv(file_path):
    """Load a two-column, tab-separated file into a pandas DataFrame.

    The file is read without a header row: the columns are named
    'content' and 'label' explicitly, and both are read with dtype
    'object' (so labels are not converted to numbers here).

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        pandas.DataFrame with the columns 'content' and 'label'.
    """
    column_names = ['content', 'label']
    return pd.read_table(
        file_path,
        names=column_names,
        dtype='object',
        engine='python',
    )

# Test data file path (Replace with your Google Drive directory and file)
# The path is located under /content/drive, so Google Drive has to be mounted there before this runs.
file_path = '/content/drive/My Drive/covid-twitter-usa-normal/data/training_data/gpt-3.5/test_data_2021_shuffle_majority_vote_gpt3.5.tsv'

# Read data from TSV file using pandas
# The resulting DataFrame has the columns 'content' (tweet text) and 'label' (ground-truth class, still a string).
test_data = read_tsv(file_path)

# Function to classify sentiment using OpenAI API
def classify_sentiment(text):
    """Classify one tweet with the fine-tuned GPT-3.5 Turbo model.

    Sends `text` as the user message, together with the system prompt that
    defines the three classes, to the chat completions endpoint through the
    module-level `client`, and converts the reply into an integer label.

    Args:
        text: The tweet text to classify.

    Returns:
        0 (positive), 1 (neutral) or 2 (negative), or None when the reply
        is missing, is not an integer, or is an integer outside 0-2.
    """
    response = client.chat.completions.create(
        model="ft:gpt-3.5-turbo-1106:university-of-tsukuba::8RcZEQZm",
        messages=[
          {"role": "system", "content": "You are a sentiment classifier of tweets from citizens of a large US city collected during the COVID-19 pandemic. Classify into three values: 0 for positive, 1 for neutral, and 2 for negative. Positive sentiment include: admire, amazing, assure, celebration, charm, eager, enthusiastic, excellent, fancy, fantastic, frolic, graceful, happy, joy, luck, majesty, mercy, nice, patience, perfect, proud, rejoice, relief, respect, satisfactorily, sensational, super, terrific, thank, vivid, wise, wonderful, zest, expectations and etc. Negative sentiment include: abominable, anger, anxious, bad, catastrophe, cheap, complaint, condescending, deceit, defective, disappointment, embarrass, fake, fear, filthy, fool, guilt, hate, idiot, inflict, lazy, miserable, mourn, nervous, objection, pest, plot, reject, scream, silly, terrible, unfriendly, vile, wicked, and etc. Neutral sentiment: neither positive or negative, such as text without sentiment, stating a fact, question, news article, advertisement, solicitation, request, quote, unintelligible text, and etc. When the sentiment is mixed, such as both joy and sadness, use your judgment to choose the stronger emotion."},
          {"role": "user", "content": text}
        ]
    )

    content = response.choices[0].message.content

    # The API types the message content as optional; concatenating None to a
    # string would raise TypeError, so treat a missing reply as a skipped row.
    if content is None:
        print("Error: The model returned no text. Skipping this prediction.")
        return None

    print("Pridiction: " + content)
    print("")
    try:
        label = int(content)
    except ValueError:
        print(f"Error: Unable to convert '{content}' to int. Skipping this prediction.")
        return None

    # Only 0, 1 and 2 are defined classes; any other integer would add a
    # spurious class to the evaluation metrics.
    if label not in (0, 1, 2):
        print(f"Error: '{content}' is not one of the labels 0, 1, 2. Skipping this prediction.")
        return None
    return label

# Lists to store true and predicted labels
true_labels = []
predicted_labels = []

# The three classes defined by the system prompt: 0 = positive, 1 = neutral, 2 = negative.
# Passing them explicitly to scikit-learn keeps every per-class array at length 3
# and in this order, even if a class never occurs among the kept predictions.
class_labels = [0, 1, 2]

# Number of test rows dropped because no usable prediction came back
skipped_count = 0

# Classify every test row, pausing 2 seconds before each API call
for _, example in test_data.iterrows():
    text = example['content']
    true_label = int(example['label'])
    print("Ground Truth: " + example['label'])

    sleep(2)
    predicted_label = classify_sentiment(text)

    # Keep the row only when the prediction is one of the known classes
    # (classify_sentiment returns None when the reply could not be parsed)
    if predicted_label in class_labels:
        predicted_labels.append(predicted_label)
        true_labels.append(true_label)
    else:
        skipped_count += 1

# Report how many rows the metrics are based on, so runs with a different
# number of skipped rows are not compared as if they used the same test set
print(f"Evaluated {len(true_labels)} of {len(test_data)} rows ({skipped_count} skipped).")

# Calculate accuracy and the per-class recall, precision, and F1 score
accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, labels=class_labels, average=None)
precision = precision_score(true_labels, predicted_labels, labels=class_labels, average=None)
f1 = f1_score(true_labels, predicted_labels, labels=class_labels, average=None)

# Display classification report and confusion matrix
print("Classification Report:")
print(classification_report(true_labels, predicted_labels, labels=class_labels))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predicted_labels, labels=class_labels))

# Macro average: unweighted mean of the per-class scores (precision, recall, F1)
macro_avg = precision.mean(), recall.mean(), f1.mean()

# Micro average: computed from the true/false positive and false negative counts
# pooled over all classes; it is not the mean of the per-class scores
micro_avg = (
    precision_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
    recall_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
    f1_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
)

macro_precision, macro_recall, macro_f1 = macro_avg
micro_precision, micro_recall, micro_f1 = micro_avg

# Display metrics for each class and macro/micro averages
# (the Accuracy column repeats the overall accuracy on every row)
table_rule = "+--------------+-----------+----------+----------+----------+"
print(table_rule)
print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
print(table_rule)
for i, label in enumerate(class_labels):
    print(f"| Class {label}      |    {accuracy:.2f}   |   {recall[i]:.2f}   |   {precision[i]:.2f}   |   {f1[i]:.2f}   |")
print(table_rule)
print(f"| Macro Average|    {accuracy:.2f}   |   {macro_recall:.2f}   |   {macro_precision:.2f}   |   {macro_f1:.2f}   |")
print(table_rule)
print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
print(table_rule)


Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       270
           1       0.74      0.57      0.64       214
           2       0.83      0.91      0.86       264

    accuracy                           0.80       748
   macro avg       0.79      0.79      0.79       748
weighted avg       0.80      0.80      0.80       748


Confusion Matrix:
[[239  24   7]
 [ 49 122  43]
 [  6  19 239]]

# GPT-3.5 Turbo Non-fine-tuned model - Rich Background

In [None]:
from openai import OpenAI
client = OpenAI()
import pandas as pd
from time import sleep
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

# Function to read data from TSV file using pandas
def read_tsv(file_path):
    """Load a two-column, tab-separated file into a pandas DataFrame.

    The file is read without a header row: the columns are named
    'content' and 'label' explicitly, and both are read with dtype
    'object' (so labels are not converted to numbers here).

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        pandas.DataFrame with the columns 'content' and 'label'.
    """
    column_names = ['content', 'label']
    return pd.read_table(
        file_path,
        names=column_names,
        dtype='object',
        engine='python',
    )

# Test data file path (Replace with your Google Drive directory and file)
# The path is located under /content/drive, so Google Drive has to be mounted there before this runs.
file_path = '/content/drive/My Drive/covid-twitter-usa-normal/data/training_data/gpt-3.5/test_data_2021_shuffle_majority_vote_gpt3.5.tsv'

# Read data from TSV file using pandas
# The resulting DataFrame has the columns 'content' (tweet text) and 'label' (ground-truth class, still a string).
test_data = read_tsv(file_path)

# Function to classify sentiment using OpenAI API
def classify_sentiment(text):
    """Classify one tweet with the base gpt-3.5-turbo-1106 model and the detailed prompt.

    Sends `text` as the user message, together with the system prompt that
    defines the three classes and lists example sentiment words, to the chat
    completions endpoint through the module-level `client`, and converts the
    reply into an integer label.

    Args:
        text: The tweet text to classify.

    Returns:
        0 (positive), 1 (neutral) or 2 (negative), or None when the reply
        is missing, is not an integer, or is an integer outside 0-2.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[
          {"role": "system", "content": "You are a sentiment classifier of tweets from citizens of a large US city collected during the COVID-19 pandemic. Classify into three values: 0 for positive, 1 for neutral, and 2 for negative. Please do not answer anything other than 0, 1, or 2. Positive sentiment include: admire, amazing, assure, celebration, charm, eager, enthusiastic, excellent, fancy, fantastic, frolic, graceful, happy, joy, luck, majesty, mercy, nice, patience, perfect, proud, rejoice, relief, respect, satisfactorily, sensational, super, terrific, thank, vivid, wise, wonderful, zest, expectations and etc. Negative sentiment include: abominable, anger, anxious, bad, catastrophe, cheap, complaint, condescending, deceit, defective, disappointment, embarrass, fake, fear, filthy, fool, guilt, hate, idiot, inflict, lazy, miserable, mourn, nervous, objection, pest, plot, reject, scream, silly, terrible, unfriendly, vile, wicked, and etc. Neutral sentiment: neither positive or negative, such as text without sentiment, stating a fact, question, news article, advertisement, solicitation, request, quote, unintelligible text, and etc. When the sentiment is mixed, such as both joy and sadness, use your judgment to choose the stronger emotion."},
          {"role": "user", "content": text}
        ]
    )

    content = response.choices[0].message.content

    # The API types the message content as optional; concatenating None to a
    # string would raise TypeError, so treat a missing reply as a skipped row.
    if content is None:
        print("Error: The model returned no text. Skipping this prediction.")
        return None

    print("Pridiction: " + content)
    print("")
    try:
        label = int(content)
    except ValueError:
        print(f"Error: Unable to convert '{content}' to int. Skipping this prediction.")
        return None

    # Only 0, 1 and 2 are defined classes; any other integer would add a
    # spurious class to the evaluation metrics.
    if label not in (0, 1, 2):
        print(f"Error: '{content}' is not one of the labels 0, 1, 2. Skipping this prediction.")
        return None
    return label

# Lists to store true and predicted labels
true_labels = []
predicted_labels = []

# The three classes defined by the system prompt: 0 = positive, 1 = neutral, 2 = negative.
# Passing them explicitly to scikit-learn keeps every per-class array at length 3
# and in this order, even if a class never occurs among the kept predictions.
class_labels = [0, 1, 2]

# Number of test rows dropped because no usable prediction came back
skipped_count = 0

# Classify every test row, pausing 2 seconds before each API call
for _, example in test_data.iterrows():
    text = example['content']
    true_label = int(example['label'])
    print("Ground Truth: " + example['label'])

    sleep(2)
    predicted_label = classify_sentiment(text)

    # Keep the row only when the prediction is one of the known classes
    # (classify_sentiment returns None when the reply could not be parsed)
    if predicted_label in class_labels:
        predicted_labels.append(predicted_label)
        true_labels.append(true_label)
    else:
        skipped_count += 1

# Report how many rows the metrics are based on, so runs with a different
# number of skipped rows are not compared as if they used the same test set
print(f"Evaluated {len(true_labels)} of {len(test_data)} rows ({skipped_count} skipped).")

# Calculate accuracy and the per-class recall, precision, and F1 score
accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, labels=class_labels, average=None)
precision = precision_score(true_labels, predicted_labels, labels=class_labels, average=None)
f1 = f1_score(true_labels, predicted_labels, labels=class_labels, average=None)

# Display classification report and confusion matrix
print("Classification Report:")
print(classification_report(true_labels, predicted_labels, labels=class_labels))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predicted_labels, labels=class_labels))

# Macro average: unweighted mean of the per-class scores (precision, recall, F1)
macro_avg = precision.mean(), recall.mean(), f1.mean()

# Micro average: computed from the true/false positive and false negative counts
# pooled over all classes; it is not the mean of the per-class scores
micro_avg = (
    precision_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
    recall_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
    f1_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
)

macro_precision, macro_recall, macro_f1 = macro_avg
micro_precision, micro_recall, micro_f1 = micro_avg

# Display metrics for each class and macro/micro averages
# (the Accuracy column repeats the overall accuracy on every row)
table_rule = "+--------------+-----------+----------+----------+----------+"
print(table_rule)
print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
print(table_rule)
for i, label in enumerate(class_labels):
    print(f"| Class {label}      |    {accuracy:.2f}   |   {recall[i]:.2f}   |   {precision[i]:.2f}   |   {f1[i]:.2f}   |")
print(table_rule)
print(f"| Macro Average|    {accuracy:.2f}   |   {macro_recall:.2f}   |   {macro_precision:.2f}   |   {macro_f1:.2f}   |")
print(table_rule)
print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
print(table_rule)


Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       270
           1       0.73      0.48      0.58       214
           2       0.73      0.90      0.81       265

    accuracy                           0.75       749
   macro avg       0.75      0.74      0.73       749
weighted avg       0.75      0.75      0.74       749


Confusion Matrix:
[[224  19  27]
 [ 50 102  62]
 [  7  19 239]]

# GPT-3.5 Turbo Non-fine-tuned model - Minimal Background

In [None]:
from openai import OpenAI
client = OpenAI()
import pandas as pd
from time import sleep
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

# Function to read data from TSV file using pandas
def read_tsv(file_path):
    """Load a two-column, tab-separated file into a pandas DataFrame.

    The file is read without a header row: the columns are named
    'content' and 'label' explicitly, and both are read with dtype
    'object' (so labels are not converted to numbers here).

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        pandas.DataFrame with the columns 'content' and 'label'.
    """
    column_names = ['content', 'label']
    return pd.read_table(
        file_path,
        names=column_names,
        dtype='object',
        engine='python',
    )

# Test data file path (Replace with your Google Drive directory and file)
# The path is located under /content/drive, so Google Drive has to be mounted there before this runs.
file_path = '/content/drive/My Drive/covid-twitter-usa-normal/data/training_data/gpt-3.5/test_data_2021_shuffle_majority_vote_gpt3.5.tsv'

# Read data from TSV file using pandas
# The resulting DataFrame has the columns 'content' (tweet text) and 'label' (ground-truth class, still a string).
test_data = read_tsv(file_path)

# Function to classify sentiment using OpenAI API
def classify_sentiment(text):
    """Classify one tweet with the base gpt-3.5-turbo-1106 model and the short prompt.

    Sends `text` as the user message, together with a system prompt that only
    defines the three classes, to the chat completions endpoint through the
    module-level `client`, and converts the reply into an integer label.

    Args:
        text: The tweet text to classify.

    Returns:
        0 (positive), 1 (neutral) or 2 (negative), or None when the reply
        is missing, is not an integer, or is an integer outside 0-2.
    """
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[
          {"role": "system", "content": "You are a sentiment classifier of tweets from citizens of a large US city collected during the COVID-19 pandemic. Classify into three values: 0 for positive, 1 for neutral, and 2 for negative. Please do not answer anything other than 0, 1, or 2."},
          {"role": "user", "content": text}
        ]
    )

    content = response.choices[0].message.content

    # The API types the message content as optional; concatenating None to a
    # string would raise TypeError, so treat a missing reply as a skipped row.
    if content is None:
        print("Error: The model returned no text. Skipping this prediction.")
        return None

    print("Pridiction: " + content)
    print("")
    try:
        label = int(content)
    except ValueError:
        print(f"Error: Unable to convert '{content}' to int. Skipping this prediction.")
        return None

    # Only 0, 1 and 2 are defined classes; any other integer would add a
    # spurious class to the evaluation metrics.
    if label not in (0, 1, 2):
        print(f"Error: '{content}' is not one of the labels 0, 1, 2. Skipping this prediction.")
        return None
    return label

# Lists to store true and predicted labels
true_labels = []
predicted_labels = []

# The three classes defined by the system prompt: 0 = positive, 1 = neutral, 2 = negative.
# Passing them explicitly to scikit-learn keeps every per-class array at length 3
# and in this order, even if a class never occurs among the kept predictions.
class_labels = [0, 1, 2]

# Number of test rows dropped because no usable prediction came back
skipped_count = 0

# Classify every test row, pausing 2 seconds before each API call
for _, example in test_data.iterrows():
    text = example['content']
    true_label = int(example['label'])
    print("Ground Truth: " + example['label'])

    sleep(2)
    predicted_label = classify_sentiment(text)

    # Keep the row only when the prediction is one of the known classes
    # (classify_sentiment returns None when the reply could not be parsed)
    if predicted_label in class_labels:
        predicted_labels.append(predicted_label)
        true_labels.append(true_label)
    else:
        skipped_count += 1

# Report how many rows the metrics are based on, so runs with a different
# number of skipped rows are not compared as if they used the same test set
print(f"Evaluated {len(true_labels)} of {len(test_data)} rows ({skipped_count} skipped).")

# Calculate accuracy and the per-class recall, precision, and F1 score
accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, labels=class_labels, average=None)
precision = precision_score(true_labels, predicted_labels, labels=class_labels, average=None)
f1 = f1_score(true_labels, predicted_labels, labels=class_labels, average=None)

# Display classification report and confusion matrix
print("Classification Report:")
print(classification_report(true_labels, predicted_labels, labels=class_labels))
print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predicted_labels, labels=class_labels))

# Macro average: unweighted mean of the per-class scores (precision, recall, F1)
macro_avg = precision.mean(), recall.mean(), f1.mean()

# Micro average: computed from the true/false positive and false negative counts
# pooled over all classes; it is not the mean of the per-class scores
micro_avg = (
    precision_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
    recall_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
    f1_score(true_labels, predicted_labels, labels=class_labels, average='micro'),
)

macro_precision, macro_recall, macro_f1 = macro_avg
micro_precision, micro_recall, micro_f1 = micro_avg

# Display metrics for each class and macro/micro averages
# (the Accuracy column repeats the overall accuracy on every row)
table_rule = "+--------------+-----------+----------+----------+----------+"
print(table_rule)
print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
print(table_rule)
for i, label in enumerate(class_labels):
    print(f"| Class {label}      |    {accuracy:.2f}   |   {recall[i]:.2f}   |   {precision[i]:.2f}   |   {f1[i]:.2f}   |")
print(table_rule)
print(f"| Macro Average|    {accuracy:.2f}   |   {macro_recall:.2f}   |   {macro_precision:.2f}   |   {macro_f1:.2f}   |")
print(table_rule)
print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
print(table_rule)

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       270
           1       0.76      0.32      0.45       214
           2       0.71      0.94      0.81       265

    accuracy                           0.74       749
   macro avg       0.74      0.71      0.69       749
weighted avg       0.74      0.74      0.71       749


Confusion Matrix:
[[234  13  23]
 [ 65  69  80]
 [  7   9 249]]