<a href="https://colab.research.google.com/github/TyrealQ/Experience-is-all-you-need_SMR/blob/main/Code_SMR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune GPT-3.5

In [None]:
# Install dependencies
!pip install datasets openai numpy tiktoken

In [None]:
from openai import OpenAI
from google.colab import userdata
import csv
import json
import os
import numpy as np
from datasets import load_dataset
from collections import defaultdict
import tiktoken

In [None]:
# Set up OpenAI API Key securely
# The key is fetched through Colab's `userdata` helper so it is never written into the notebook.
api_key = userdata.get('YOUR API KEY')
# `OpenAI` was imported as the client class itself (`from openai import OpenAI`),
# so it is instantiated directly; the class has no `Client` attribute.
client = OpenAI(api_key=api_key)

## Format training dataset

In [None]:
# Load CSV data in
# Every cell of the CSV is expected to hold one JSON document (a training example).
csv_file_path = 'YOUR CSV FILE PATH'
cleaned_data = []

# newline='' hands line-ending handling to the csv module, which is required for
# newlines embedded inside quoted cells to be parsed as part of the cell.
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
with open(csv_file_path, 'r', encoding='utf-8-sig', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        for cell in row:
            try:
                # Replace square brackets and inner double quotes that are problematic
                cell = cell.replace('["', '').replace('"]', '').replace('\\"', '"')

                # Load each cell as a JSON object
                cell_json = json.loads(cell)

                # Now that the content is clean, append to cleaned_data list
                cleaned_data.append(cell_json)
            except json.JSONDecodeError as e:
                # Report and skip cells that are not valid JSON; the rest are still converted.
                print(f"JSON decode error for cell '{cell}': {e}")

jsonl_file_path = 'YOUR OUTPUT FILE PATH'

# Write cleaned data to a JSONL file (one JSON document per line)
with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
    for item in cleaned_data:
        jsonl_file.write(json.dumps(item) + '\n')

## Double-check the training dataset format and calculate the training cost

In [None]:
# From OpenAI website to format data;  https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

# Next, we specify the data path and open the JSONL file

data_path = 'YOUR JSONL FILE PATH'

# Load dataset
# Read explicitly as UTF-8 (the encoding the JSONL file is written with earlier in this
# notebook) instead of relying on the platform default encoding.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# We can inspect the data quickly by checking the number of examples and the first item

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

# Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the structure

# Format error checks
# Counts how many times each kind of formatting problem occurs across the dataset.
format_errors = defaultdict(int)

for example in dataset:
    # An example must be a dict ...
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # ... holding a non-empty "messages" list.
    conversation = example.get("messages", None)
    if not conversation:
        format_errors["missing_messages_list"] += 1
        continue

    for entry in conversation:
        # Both mandatory keys must be present.
        if not ("role" in entry and "content" in entry):
            format_errors["message_missing_key"] += 1

        # Only the three known keys are allowed.
        if not all(key in ("role", "content", "name") for key in entry):
            format_errors["message_unrecognized_key"] += 1

        # The role must be one of the three chat roles.
        if entry.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        # The content must be a non-empty string.
        text = entry.get("content", None)
        if not (text and isinstance(text, str)):
            format_errors["missing_content"] += 1

    # Every example needs at least one assistant message.
    if all(entry.get("role", None) != "assistant" for entry in conversation):
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in format_errors.items():
        print(f"{error_name}: {count}")

# Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

# Token counting functions
# Module-level tokenizer shared by the token-counting functions defined after it.
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the token count of a whole conversation.

    Args:
        messages: Iterable of message dicts whose values are strings.
        tokens_per_message: Fixed overhead added for every message.
        tokens_per_name: Extra overhead added for a message that has a "name" key.

    Returns:
        The encoded length of every value of every message, plus the
        per-message and per-name overheads, plus a fixed 3 tokens that are
        added once per conversation.
    """
    total = 3  # fixed per-conversation overhead
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    return total

def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of the content of all assistant messages."""
    return sum(
        len(encoding.encode(message["content"]))
        for message in messages
        if message["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics for a sequence of numbers.

    Args:
        values: Non-empty sequence of numbers.
        name: Label shown in the heading line.

    Prints the min/max, the mean/median, and the 5th/95th percentiles.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The label promises the 5th and 95th percentiles, so use the 0.05 / 0.95
    # quantiles (this used to compute the 10th / 90th percentiles).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

# Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

# Warnings and tokens counts
def _has_no_role(messages, role):
    """Return True when no message in *messages* carries the given role."""
    return all(message["role"] != role for message in messages)


n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]

    # Count examples that lack a system or a user message.
    if _has_no_role(messages, "system"):
        n_missing_system += 1
    if _has_no_role(messages, "user"):
        n_missing_user += 1

    # Per-example size statistics.
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)

for series, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(series, label)

# Number of conversations whose estimated length exceeds 4096 tokens.
n_too_long = len([length for length in convo_lens if length > 4096])
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 4
MIN_EPOCHS = 1
MAX_EPOCHS = 25

n_train_examples = len(dataset)
n_epochs = TARGET_EPOCHS

# Examples seen at the target epoch count; the epoch count is adjusted when
# this falls outside the [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES] window.
examples_at_target = n_train_examples * TARGET_EPOCHS
if examples_at_target < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_at_target > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

# Each example is billed for at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = sum(
    length if length < MAX_TOKENS_PER_EXAMPLE else MAX_TOKENS_PER_EXAMPLE
    for length in convo_lens
)
total_billed_tokens = n_epochs * n_billing_tokens_in_dataset

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{total_billed_tokens} tokens")

# Calculate the estimated cost for fine-tuning
# (added on top of the OpenAI example to show the actual cost at the pricing current when written)
cost_per_100k_tokens = 0.80  # Cost for every 100,000 tokens
estimated_cost = (total_billed_tokens / 100000) * cost_per_100k_tokens
print(f"Estimated cost for fine-tuning: approximately ${estimated_cost:.2f}")

## Save the finalized training dataset

In [None]:
# Function to save the dataset as a JSONL file
def save_to_jsonl(conversations, file_path):
    """Write *conversations* to *file_path*, one JSON document per line.

    Args:
        conversations: Iterable of JSON-serialisable objects.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, 'w') as out:
        out.writelines(json.dumps(entry) + '\n' for entry in conversations)

# Specify the path where you want to save the JSONL file in your Google Drive
# (placeholder — replace with a real path before running)
jsonl_file_path = 'YOUR FILE PATH'
# Save the dataset to the specified file path
# `dataset` is the list of examples loaded from the JSONL file earlier in the notebook.
save_to_jsonl(dataset, jsonl_file_path)

## Upload the dataset for training

In [None]:
# Upload data for training
training_file_name = 'YOUR FILE PATH'

# Open the file in a context manager so the handle is closed as soon as the
# upload call returns (or raises) instead of being left open.
with open(training_file_name, "rb") as training_file:
    training_response = client.files.create(
        file=training_file, purpose="fine-tune"
    )
training_file_id = training_response.id

# Gives training file id
print("Training file id:", training_file_id)

In [None]:
# Create Fine-Tuning Job
suffix_name = "CREATE A NAME FOR YOUR MODEL"

# You can stick to the default hyperparameters or adjust some of them based on your data. For example, I set epochs to 4.
job_request = {
    "training_file": training_file_id,
    "model": "gpt-3.5-turbo-1106",
    "suffix": suffix_name,
    "hyperparameters": {"n_epochs": 4},
}

# Submit the job and keep its id for the follow-up status calls.
response = client.fine_tuning.jobs.create(**job_request)
job_id = response.id

print(response)

In [None]:
# List events as fine-tuning progresses
# Request at most 50 events for the job identified by `job_id`.
response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=50)

events = response.data
# Reverse the list in place before printing
# (presumably the API returns newest-first, so this prints oldest-first — TODO confirm).
events.reverse()

for event in events:
    print(event.message)

In [None]:
# Retrieve fine-tune model id
response = client.fine_tuning.jobs.retrieve(job_id)
# NOTE: despite its name, this variable holds the job's `fine_tuned_model` value
# (the model id), not the job id — the job id stays in `job_id`.
fine_tuning_job_id = response.fine_tuned_model

print(response)
print("\nFine-tuned model id:", fine_tuning_job_id)

# Running the fine-tuned GPT-3.5 models for AE & CC

## Use the AE model for CX aspect extraction

ft:gpt-3.5-turbo-1106:qian:cxae:8kZXONLd


In [None]:
!pip install openai

In [None]:
import openai
import pandas as pd

# Set up OpenAI API Key securely
from google.colab import userdata
api_key = userdata.get('YOUR API KEY')
# The client class is `openai.OpenAI`; `openai.client` is not the client constructor.
client = openai.OpenAI(api_key=api_key)

# Load the CSV file
# (placeholder path; the loop further down reads the review text from its 'Text' column)
df = pd.read_csv('YOUR FILE PATH')

# Create a new column in the DataFrame to store the results
# Every row starts with an empty string; rows that are skipped or fail keep it.
df['Aspect'] = ''

# Define the system prompt
# The text is sent verbatim as the system message, including its leading indentation.
system_prompt = ("""
    You are an expert in college football, specializing in analyzing Tripadvisor reviews of college football stadiums, focusing on game day experiences.
    Each review consists of sentences that need to be treated individually. Within these sentences, identify various aspects of the game day experience, noting that a single aspect can have multiple descriptions.
    For aspect extraction in each sentence of a review, the process must be followed carefully. Ensure each step is taken one at a time for optimal clarity and results:
    1. Analyze each sentence as a separate unit.
    2. Identify the different aspects of the game day experience mentioned.
    3. For aspects with multiple descriptions within the same sentence, list each description as a separate entry, including the aspect in the detail for clarity.
    4. Extract each aspect and its corresponding detailed description.
    5. Format this information in a JSON structure using '''Aspect_i''' and '''Details_i''' as keys, incrementing '''i''' for each new aspect or unique description.
    """
)

# Send every review to the fine-tuned AE model and store the raw reply in 'Aspect'.
for index, row in df.iterrows():
    review_text = row['Text']

    # Guard clause: rows without review text are reported and left untouched.
    if pd.isna(review_text):
        print(f"Skipping index {index}: 'Text' column is NaN")
        continue

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": review_text},
    ]

    try:
        # response_format={ "type": "json_object" } is intentionally not passed.
        response = client.chat.completions.create(
            model="ft:gpt-3.5-turbo-1106:qian:cxae:8kZXONLd",
            messages=conversation,
            temperature=0,
            max_tokens=2048,
        )

        # The response is read with attribute access; an environment that
        # returns plain dicts would need dictionary access instead.
        content = response.choices[0].message.content

        # Debug: Print the response
        print(f"Index {index} - Response: {content}")

        # Assign the response to the DataFrame
        df.at[index, 'Aspect'] = content
    except Exception as e:
        # Best effort: report the failure and carry on with the next row.
        print(f"Error processing index {index}: {e}")

# Debug: Print the DataFrame before saving
print(df.head())

# Save the DataFrame
df.to_csv('YOUR FILE PATH', index=False)

## Use the CC model for CX classification

ft:gpt-3.5-turbo-1106:qian:cxcc:8mNAEN0D

In [None]:
!pip install openai

In [None]:
import openai as OpenAI
import pandas as pd
from google.colab import userdata

# Set up OpenAI API Key securely
# NOTE: in this cell `OpenAI` is the `openai` module itself (`import openai as OpenAI`),
# so `OpenAI.Client` is an attribute looked up on the module, not on the client class.
api_key = userdata.get('YOUR API KEY')
client = OpenAI.Client(api_key=api_key)

# Load the CSV file
# (placeholder path; the loop further down reads the model input from its 'Aspect1' column)
df = pd.read_csv('YOUR FILE PATH')

# Create a new column in the DataFrame to store the results
# Every row starts with an empty string; rows that are skipped or fail keep it.
df['Label1'] = ''

# Define the system prompt
# The text is sent verbatim as the system message, including its leading indentation.
system_prompt = ("""
    You are an expert in college football, specializing in categorizing aspects of the college football game day experience, identified from Tripadvisor online reviews of college football stadiums.
    Each identified aspect ('''Aspect_i''') has a corresponding description ('''Details_i''') that you will use to determine its appropriate category.
    The categories are 'Core', 'Functional', 'Emotional', 'Monetary', 'Social', 'Safety', and 'Others'.
    Your task is to analyze each aspect's description and classify it accordingly.
    Ensure each step is taken one at a time for optimal clarity and results:
    1. Examine the description ('''Details_i''') of each aspect ('''Aspect_i''').
    2. Classify '''Aspect_i''' into one of the following categories based on '''Details_i''':
    'Core': Sports-related attributes (e.g., team dynamics, game quality).
    'Functional': utilitarian services (e.g., facilities, concessions, parking).
    'Emotional': Feelings or emotional states (e.g., excitement, thrill).
    'Monetary': Pricing aspects (e.g., affordability, value for money).
    'Social': Interpersonal and community aspects (e.g., fan interactions, traditions).
    'Safety': Security and safety experiences (e.g., measures, feelings of safety).
    'Others': Aspects unrelated to the college football game day experience or that do not fit into the other categories, such as general comments about the city or weather conditions.
    3. Format this classification into a JSON structure using '''Aspect_i''' and '''Label_i''' as keys, incrementally assigning '''i''' for each aspect.
"""
)

# Send every extracted aspect to the fine-tuned CC model and store the raw reply in 'Label1'.
for index, row in df.iterrows():
    if pd.notna(row['Aspect1']):
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": row['Aspect1']}
        ]

        try:
            # response_format={ "type": "json_object" } is intentionally not passed.
            response = client.chat.completions.create(
                model="ft:gpt-3.5-turbo-1106:qian:cxcc:8mNAEN0D",
                messages=conversation,
                temperature=0,
                max_tokens=2048
            )

            # Assuming the response is an object with attributes, we use dot notation
            # If this raises an error, then your environment may be different and require dictionary access
            content = response.choices[0].message.content

            # Debug: Print the response
            print(f"Index {index} - Response: {content}")

            # Assign the response to the DataFrame
            df.at[index, 'Label1'] = content

        except Exception as e:
            # Best effort: report the failure and carry on with the next row.
            print(f"Error processing index {index}: {e}")
    else:
        # The column checked above is 'Aspect1', so name that column in the
        # message (it previously said 'Text', copied from the extraction cell).
        print(f"Skipping index {index}: 'Aspect1' column is NaN")

# Debug: Print the DataFrame before saving
print(df.head())

# Save the DataFrame
df.to_csv('YOUR FILE PATH', index=False)

# RoBERTa sentiment assessment and model evaluation

In [None]:
# Install dependencies
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Specify model
# Checkpoint name shared by the tokenizer and the classifier so the two always match.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

## Sentiment assessment

In [None]:
# Load your original DataFrame
# (placeholder path; the scoring loop reads 'review_id' and the 'text_1' … 'text_28' columns from it)
df = pd.read_csv('YOUR FILE PATH')

# Full classification
def polarity_scores_roberta(example):
    """Score one text with the sentiment model loaded in `model` / `tokenizer`.

    Args:
        example: The text to score.

    Returns:
        A dict with keys 'neg', 'neu' and 'pos' holding the softmax of the
        model's first, second and third logit respectively.
    """
    # Truncate explicitly: without it, a text longer than the model's maximum
    # input length (512 tokens for RoBERTa-base checkpoints) makes the forward
    # pass raise, and the caller then drops the whole review row. The
    # evaluation cell of this notebook truncates its inputs as well.
    encoded_text = tokenizer(
        example, return_tensors='pt', truncation=True, max_length=512
    )
    output = model(**encoded_text)
    # output[0] holds the logits; [0] selects the single example in the batch.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'neg': scores[0],
        'neu': scores[1],
        'pos': scores[2]
    }
    return scores_dict

# One result dict per review: its id plus neg/neu/pos scores for each text column.
res = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    # Bind both names before the try block so the except handler can always
    # format its message: previously a failure before they were assigned raised
    # NameError on the first row and reported the previous row's values later.
    myid = None
    column_name = None
    try:
        myid = row['review_id']
        result_dict = {'review_id': myid}  # Initialize result_dict with ID

        for j in range(1, 29):  # This will loop through numbers 1 to 28
            column_name = f'text_{j}'

            # Check for missing values before converting to string
            if pd.isna(row[column_name]):
                # Missing text is recorded as all-zero scores.
                roberta_result = {'neg': 0, 'neu': 0, 'pos': 0}
            else:
                text = str(row[column_name])  # Convert to string only if not missing
                roberta_result = polarity_scores_roberta(text)

            # Update result_dict with polarity scores for this column
            result_dict.update({
                f'{column_name}_neg': roberta_result['neg'],
                f'{column_name}_neu': roberta_result['neu'],
                f'{column_name}_pos': roberta_result['pos']
            })

        # Append the result_dict to res list after processing all columns for this row
        res.append(result_dict)
    except Exception as e:  # Catch all exceptions for debugging
        # A failing row is reported and left out of the results.
        print(f'Broke for id{myid}, column {column_name}, error: {e}')

# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(res)

# Ensure the path for saving the CSV file is correct
# ...
results_df.to_csv('YOUR FILE PATH', index=False)

## Model evaluation

In [None]:
# Install dependencies
!pip install transformers

In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [None]:
# 1. Load the data
data_path = 'YOUR EVAL DATA PATH'
df = pd.read_csv(data_path)

# The texts are read from the 'Details' column and the gold labels from 'Label'
# (labels are assumed to be coded positive-2, negative-0, and neutral-1)
texts = df['Details'].tolist()
labels = df['Label'].tolist()

# 2. Load the tokenizer and the pretrained RoBERTa model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = RobertaTokenizer.from_pretrained(MODEL)
model = RobertaForSequenceClassification.from_pretrained(MODEL)
model.eval()

# 3. Tokenize and predict in fixed-size batches
# Pushing the whole evaluation set through a single forward pass needs memory
# proportional to the dataset size; batching keeps the peak bounded.
BATCH_SIZE = 32
predictions = []
with torch.no_grad():
    for start in range(0, len(texts), BATCH_SIZE):
        batch_texts = texts[start:start + BATCH_SIZE]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
        outputs = model(**inputs)
        # The predicted class is the index of the largest logit.
        predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())

# 4. Evaluate metrics (precision / recall / F1 are weighted by class support)
conf_matrix = confusion_matrix(labels, predictions)
accuracy = accuracy_score(labels, predictions)
precision = precision_score(labels, predictions, average='weighted')
recall = recall_score(labels, predictions, average='weighted')
f1 = f1_score(labels, predictions, average='weighted')

print("Confusion Matrix:")
print(conf_matrix)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Binary logistic regression

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
file_path = 'YOUR FILE PATH'
data = pd.read_csv(file_path)

# Independent variables: positive and negative sentiment scores per experience category
predictor_columns = [
    'Core_pos_scores', 'Core_neg_scores',
    'Functional_pos_scores', 'Functional_neg_scores',
    'Emotional_pos_scores', 'Emotional_neg_scores',
    'Social_pos_scores', 'Social_neg_scores',
    'Monetary_pos_scores', 'Monetary_neg_scores',
    'Safety_pos_scores', 'Safety_neg_scores',
]
X = data[predictor_columns]
# Dependent variable
y = data['Rating_D']

# Descriptive statistics of the independent variables, rounded to two decimal places
means = X.mean().round(2)
std_devs = X.std().round(2)

# Put the two series side by side in one table
stats = pd.concat(
    [means.rename('Mean'), std_devs.rename('Standard Deviation')],
    axis=1,
)
print("Statistics of the independent variables:")
print(stats)

# Add a constant to the independent variable set
X_with_const = sm.add_constant(X)

# Fit the logistic regression model
logit_model = sm.Logit(y, X_with_const)
result = logit_model.fit()

# Print the coefficient table of the logistic regression rounded to two decimal places
coefficient_table = result.summary2().tables[1]
print(coefficient_table.round(2))

# Function to calculate the exponential of the coefficients (Exp(B))
def calculate_exp_b(coefficients):
    """Return e raised to each coefficient, as computed by ``np.exp``."""
    exp_b = np.exp(coefficients)
    return exp_b

# Calculate Exp(B) and the effects of a 0.1 unit increase
coefficients = result.params
exp_b_values = calculate_exp_b(coefficients)
effects_0_1 = np.exp(0.1 * coefficients)

# Create DataFrames to display Exp(B) values and effects of a 0.1 unit increase in a structured format
exp_b_df = pd.DataFrame({
    "Exp(B)": exp_b_values,
    "Exp(B) of 0.1 Unit Increase": effects_0_1,
})
exp_b_df = exp_b_df.round(2)

print("\nExp(B) values and Exp(B) of a 0.1 Unit Increase on Odds:")
print(exp_b_df)