In [1]:
import pandas as pd
pd.set_option('display.width', 10000)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")


In [2]:
# CONVO level preprocessing
# Read in data, get the conversation turns, and overall average rating
def process_conversation_data_debug(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    conversations = []
    conversation_id = 1
    conversation_text = []
    avg_rating = []

    for idx, line in enumerate(lines):
        line = line.strip()
        if not line:  # Skip empty lines
            continue

        # print(f"Processing Line {idx + 1}")  # Debug: Print every line being processed
        parts = line.split("\t")
        
        if line.startswith("USER") or line.startswith("SYSTEM"):
            if len(parts) >= 2:
                speaker = parts[0]
                text = parts[1]
                cleaned_text = text.replace("\n", "")
                # print(f"{speaker}{cleaned_text}")
                conversation_text.append(f"{speaker} {cleaned_text}")

                # print(f"{speaker} {text}")

        if line.startswith("USER\tOVERALL"):
            # print(f"Processing OVERALL Line {idx + 1}")  # Debug for OVERALL lines
            overall_ratings = list(map(int, parts[3].split(",")))
            convo_avg = round(sum(overall_ratings) / len(overall_ratings),2)
            
            # Append conversation
            conversations.append({
                "conv_id": conversation_id,
                "conv_text": "\n".join(conversation_text),
                "average_rating": convo_avg
            })
            conversation_id += 1
            conversation_text = []

    # Create DataFrame
    df = pd.DataFrame(conversations)
    return df






In [3]:
file_path = '../data/raw/MWOZ.txt'
conversation_df = process_conversation_data_debug(file_path)
conversation_df

Unnamed: 0,conv_id,conv_text,average_rating
0,1,USER I'm looking for a cheap restaurant in the...,2.75
1,2,"USER Hi, I will be traveling to Cambridge and ...",2.67
2,3,USER I am looking for a cheap two star hotel i...,3.00
3,4,USER Can you recommend a good restaurant in th...,3.50
4,5,USER I need a place to stay in Cambridge that ...,3.33
...,...,...,...
995,996,USER I'm looking for a cheap hotel.\nSYSTEM al...,2.75
996,997,USER I'm looking for a Lebanese restaurant tha...,3.00
997,998,"USER I wanted to visit a place called Center, ...",3.67
998,999,USER Could you help me find a restaurant that'...,3.50


In [4]:
def add_token_count_column(conversation_df):
    """
    Adds a new column to the DataFrame with the token count of the text in the 'text' column.

    Args:
        conversation_df (pd.DataFrame): A DataFrame containing a 'text' column with string data.

    Returns:
        pd.DataFrame: The input DataFrame with an additional column 'token_count',
                      containing the number of tokens in each row of the 'text' column.
    """
    # Tokenize the text and count the number of tokens
    conversation_df['token_count'] = conversation_df['conv_text'].apply(lambda x: len(str(x).split()))
    return conversation_df


# Add the token count column
conversation_df = add_token_count_column(conversation_df)

# Display the updated DataFrame
print(conversation_df)


     conv_id                                          conv_text  average_rating  token_count
0          1  USER I'm looking for a cheap restaurant in the...            2.75          173
1          2  USER Hi, I will be traveling to Cambridge and ...            2.67          468
2          3  USER I am looking for a cheap two star hotel i...            3.00          313
3          4  USER Can you recommend a good restaurant in th...            3.50          219
4          5  USER I need a place to stay in Cambridge that ...            3.33          334
..       ...                                                ...             ...          ...
995      996  USER I'm looking for a cheap hotel.\nSYSTEM al...            2.75          336
996      997  USER I'm looking for a Lebanese restaurant tha...            3.00          343
997      998  USER I wanted to visit a place called Center, ...            3.67          570
998      999  USER Could you help me find a restaurant that'...       

In [5]:
conversation_df = pd.read_csv('../data/output/LLM_ingest/conversation_data.csv')
conversation_df

Unnamed: 0,conv_id,conv_text,average_rating,satisfaction_rating,token_count
0,1,USER I'm looking for a cheap restaurant in the...,2.75,Low,173
1,2,"USER Hi, I will be traveling to Cambridge and ...",2.67,Low,468
2,3,USER I am looking for a cheap two star hotel i...,3.00,Low,313
3,4,USER Can you recommend a good restaurant in th...,3.50,Medium,219
4,5,USER I need a place to stay in Cambridge that ...,3.33,Medium,334
...,...,...,...,...,...
995,996,USER I'm looking for a cheap hotel.\nSYSTEM al...,2.75,Low,336
996,997,USER I'm looking for a Lebanese restaurant tha...,3.00,Low,343
997,998,"USER I wanted to visit a place called Center, ...",3.67,High,570
998,999,USER Could you help me find a restaurant that'...,3.50,Medium,287


In [6]:
conversation_df[(conversation_df['satisfaction_rating']=='High') & (conversation_df['token_count'] <200)]
#210, 301

Unnamed: 0,conv_id,conv_text,average_rating,satisfaction_rating,token_count
209,210,USER Hi I am looking for some info on the Wort...,3.67,High,151
300,301,USER Are there any cheap hotels (not guesthous...,3.67,High,186


In [7]:
pd.set_option('display.width', 100000)

high_example = conversation_df[conversation_df['conv_id']==301]
high_example


Unnamed: 0,conv_id,conv_text,average_rating,satisfaction_rating,token_count
300,301,USER Are there any cheap hotels (not guesthous...,3.67,High,186


In [12]:
conversation_df['average_rating'].quantile([i / 10 for i in range(1, 10)])
quantiles = conversation_df['average_rating'].quantile([i / 10 for i in range(1, 10)])
quantile_0_3 = quantiles[0.3]
quantile_0_9 = quantiles[0.9]
 

np.float64(3.6)

In [8]:
# import nltk
# from nltk.corpus import brown
# nltk.download("brown")
# # Create a frequency distribution from the Brown corpus
# brown_words = brown.words()
# freq_dist = nltk.FreqDist(brown_words)

# # Define thresholds for rare and common words
# common_threshold = 1000  # Common if frequency > 1000
# rare_threshold = 10      # Rare if frequency < 10

# def calculate_rare_common_ratio(text):
#     tokens = text.split()
#     rare_count = sum(1 for token in tokens if freq_dist[token.lower()] < rare_threshold)
#     common_count = sum(1 for token in tokens if freq_dist[token.lower()] > common_threshold)
#     total_count = len(tokens)
#     return {
#         "rare_ratio": rare_count / total_count if total_count else 0,
#         "common_ratio": common_count / total_count if total_count else 0
#     }

# # Apply the function to the text column
# rare_common_ratios = conversation_df["conv_text"].apply(calculate_rare_common_ratio)
# conversation_df["rare_ratio"] = rare_common_ratios.apply(lambda x: x["rare_ratio"])

In [7]:
def create_batch_prompt(batch):
    prompt = "Task: Classify each conversation as 'low', 'medium', or 'high' satisfaction. Here are the conversations:\n\n"
    for i, row in enumerate(batch.itertuples(), start=1):
        prompt += f"{i}. \"{row.conv_text}\"\nClassification:\n\n"
    return prompt

# Create prompt for the first 3 rows
batch_prompt = create_batch_prompt(conversation_df.head(5))
print(batch_prompt)

Task: Classify each conversation as 'low', 'medium', or 'high' satisfaction. Here are the conversations:

1. "USER I'm looking for a cheap restaurant in the east part of town.
SYSTEM the missing sock is a nice restaurant in the east part of town in the cheap price range
USER What is the address and phone number?
SYSTEM The address of The Missing Sock is Finders Corner Newmarket Road and the phone number is 01223 812660. May I help you with anything else today?
USER Does this restaurant have highchairs for babies?
SYSTEM I'm afraid there is no high chair seating available here. You can contact restaurant to see if they will allow you to bring your own.
USER Does this restaurant offer delivery services?
SYSTEM I'm sorry, they do not offer delivery services. Anything else I can do for you?
USER Does The Missing Sock have vegetarian options?
SYSTEM No vegetarian options are currently at THE MISSING SOCK. Do you need anything else?
USER No, thank you, I have everything I need. Thank you and

In [12]:
# Total tokens per conversation = 500 + 50 = 550
# Safe batch size = 4096 / 550 ≈ 7 conversations per batch


In [8]:
import openai
import tiktoken

# Process:
* Tiktoken to estimate batch size
* When we hit a cutoff, onto the next batch
* Parallel thread the batches
* Outputs as conversation number. classification
  

In [9]:
# pip show openai

In [9]:
from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Write a haiku about recursion in programming."
        }
    ]
)

print(completion.choices[0].message)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [10]:
import openai

def classify_batch_conversations(batch_prompt, model="gpt-3.5-turbo"):
    """
    Classify conversations from a batch prompt as 'low', 'medium', or 'high' satisfaction.
    
    Parameters:
        batch_prompt (str): The input batch prompt containing multiple conversations.
        model (str): The OpenAI model to use for classification (default: "gpt-3.5-turbo").
    
    Returns:
        dict: A dictionary with conversation numbers as keys and classification or error reasons as values.
    """
    results = {}
    client = OpenAI()
    try:
        # Send the entire batch_prompt to the OpenAI API without additional context
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": batch_prompt}
            ],
            max_tokens=100
        )

        from openai import OpenAI
client = OpenAI()
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": "write a haiku about ai"}
    ]
)
        # response = openai.ChatCompletion.create(
        #     model=model,
        #     messages=[
        #         {"role": "user", "content": batch_prompt}
        #     ]
        # )
        
        # Extract the response content
        classifications = response["choices"][0]["message"]["content"].strip()
        
        # Parse the classifications by conversation number
        for line in classifications.split("\n"):
            if line.strip():  # Skip empty lines
                if ":" in line:
                    convo_number, classification = line.split(":", 1)
                    convo_number = convo_number.strip()
                    classification = classification.strip().lower()
                    if classification in ["low", "medium", "high"]:
                        results[convo_number] = classification
                    else:
                        results[convo_number] = f"Error: Unexpected classification output '{classification}'"
                else:
                    # Handle improperly formatted lines
                    results["Unknown"] = f"Error: Malformed response line '{line}'"

    except Exception as e:
        # Handle any API or parsing errors
        results["Error"] = f"API error: {str(e).split(':', 1)[-1].strip()}"

    return results

batch_prompt = create_batch_prompt(conversation_df.head(5))

# Call the function
results = classify_batch_conversations(batch_prompt, model="gpt-3.5-turbo")
for convo, result in results.items():
    print(f"Conversation {convo}: {result}")


Conversation Error: API error: //github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [113]:
# def calculate_model_costs(conversation_df):
#     """
#     Calculates the cost of processing conversations with GPT-3.5-turbo and Text-Davinci-003
#     based on token counts.

#     Args:
#         conversation_df (pd.DataFrame): A DataFrame with columns:
#             - 'conversation_text': The text of the conversation.
#             - 'token_count': The number of tokens in the conversation.

#     Returns:
#         pd.DataFrame: The input DataFrame with two new columns:
#             - 'cost_gpt3.5_turbo': The cost for processing the conversation with GPT-3.5-turbo.
#             - 'cost_text_davinci_003': The cost for processing the conversation with Text-Davinci-003.
#     """
#     # Define costs per 1,000 tokens
#     gpt3_5_turbo_cost_per_1k = 0.0015  # $0.0015 per 1,000 tokens
#     text_davinci_003_cost_per_1k = 0.02  # $0.02 per 1,000 tokens

#     # Calculate costs
#     conversation_df['cost_gpt3.5_turbo'] = (conversation_df['token_count'] / 1000) * gpt3_5_turbo_cost_per_1k
#     conversation_df['cost_text_davinci_003'] = (conversation_df['token_count'] / 1000) * text_davinci_003_cost_per_1k

#     # Round costs to 4 decimal places for readability
#     conversation_df['cost_gpt3.5_turbo'] = conversation_df['cost_gpt3.5_turbo'].round(4)
#     conversation_df['cost_text_davinci_003'] = conversation_df['cost_text_davinci_003'].round(4)

#     return conversation_df




In [114]:
conversation_df = calculate_model_costs(conversation_df)
conversation_df

Unnamed: 0,conv_id,conv_text,average_rating,token_count,cost_gpt3.5_turbo,cost_text_davinci_003
0,1,USER I'm looking for a cheap restaurant in the...,2.75,173,0.0003,0.0035
1,2,"USER Hi, I will be traveling to Cambridge and ...",2.67,468,0.0007,0.0094
2,3,USER I am looking for a cheap two star hotel i...,3.00,313,0.0005,0.0063
3,4,USER Can you recommend a good restaurant in th...,3.50,219,0.0003,0.0044
4,5,USER I need a place to stay in Cambridge that ...,3.33,334,0.0005,0.0067
...,...,...,...,...,...,...
995,996,USER I'm looking for a cheap hotel.\nSYSTEM al...,2.75,336,0.0005,0.0067
996,997,USER I'm looking for a Lebanese restaurant tha...,3.00,343,0.0005,0.0069
997,998,"USER I wanted to visit a place called Center, ...",3.67,570,0.0009,0.0114
998,999,USER Could you help me find a restaurant that'...,3.50,287,0.0004,0.0057
