In [1]:
from typing import List

import instructor
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field, field_validator
from tqdm.auto import tqdm

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key that OpenAI() picks up — confirm.
load_dotenv()

True

In [3]:
# Model name passed as `model=` to the chat completion request in the classification loop.
OPENAI_MODEL = "gpt-4o-mini"

In [4]:
# Wrap the OpenAI client with instructor; the wrapped client is later called with
# `response_model=` and `max_retries=` to get validated `Category` objects back.
client = instructor.from_openai(OpenAI())

In [5]:
# Read the labelled emails; the columns used further down are "question" and "category".
email_data = pd.read_csv("emails.csv")
email_data.head()

Unnamed: 0,question,category
0,When will my order be delivered?,"Shipping, Order Management"
1,How do I return an item that doesn't fit?,"Returns, Product Information"
2,Is this product available in blue?,Product Information
3,My discount code isn't working. What should I do?,"Promotions, Technical Support"
4,Can I change my shipping address after placing...,"Order Management, Shipping"


In [6]:
# "category" holds one or more labels joined by ", "; split each cell, flatten,
# and keep the distinct labels (a NumPy object array, as the output shows).
categories = email_data["category"].str.split(", ").explode().unique()
categories

array(['Shipping', 'Order Management', 'Returns', 'Product Information',
       'Promotions', 'Technical Support'], dtype=object)

Using a field validator instead of `List[Literal['Shipping', 'Order Management', 'Returns', 'Product Information',
       'Promotions', 'Technical Support']]`, because the list of unique categories is derived from the data and can change.

In [7]:
# Response schema for one classified email: a non-empty list of category names,
# each of which must be present in the module-level `categories`.
# These notes are `#` comments on purpose: pydantic copies a model's docstring
# into its JSON schema description, which would change the schema handed to the
# model through `response_model=`.
class Category(BaseModel):
    # `min_length` is the pydantic v2 name for the deprecated `min_items`;
    # it still requires at least one category per email.
    category: List[str] = Field(..., min_length=1)

    @field_validator("category")
    @classmethod
    def validate_categories(cls, v):
        """Check that every returned label is one of the known categories.

        Args:
            v: The list of category names produced for the `category` field.

        Returns:
            The list `v`, unchanged, when every item is in `categories`.

        Raises:
            ValueError: On the first item that is not in `categories`.
        """
        for item in v:
            if item not in categories:
                raise ValueError(
                    f"Invalid category: {item}. Must be one of {categories}"
                )
        return v

In [8]:
# Classify every email with the LLM and store the result next to the labelled category.
email_categories = []  # (email text, joined predicted categories) pairs
predicted_labels = []  # one joined prediction per row, in row order

for index, row in tqdm(email_data.iterrows(), total=email_data.shape[0]):
    email = row["question"]
    # `response_model=Category` makes the client return a validated Category;
    # `max_retries=3` is passed through to the instructor-wrapped client.
    category = client.chat.completions.create(
        model=OPENAI_MODEL,
        max_retries=3,
        response_model=Category,
        messages=[
            {
                "role": "user",
                "content": f"Classify this email: {email}. Here are the categories to use: {', '.join(categories)}. Each email can have one or more categories.",
            }
        ],
    )

    # Join once and reuse for both the list of pairs and the new column.
    joined_prediction = ", ".join(category.category)
    email_categories.append((email, joined_prediction))
    predicted_labels.append(joined_prediction)

# Assign the column once, after the loop, instead of writing into `email_data`
# with `.at` while `iterrows()` is still iterating over it.
email_data["predicted_category"] = predicted_labels

  0%|          | 0/30 [00:00<?, ?it/s]

In [9]:
# Spot-check the first rows: labelled "category" next to "predicted_category".
email_data.head(3)

Unnamed: 0,question,category,predicted_category
0,When will my order be delivered?,"Shipping, Order Management","Shipping, Order Management"
1,How do I return an item that doesn't fit?,"Returns, Product Information","Returns, Order Management"
2,Is this product available in blue?,Product Information,Product Information


In [3]:
# Number rows from 1 for display. Assigning a fresh RangeIndex, rather than
# `index += 1`, keeps this cell idempotent: re-running it no longer shifts the
# row labels by one more each time.
email_data.index = pd.RangeIndex(start=1, stop=len(email_data) + 1)

In [4]:
# Persist the frame (including predicted_category); index=False leaves the row index out of the CSV.
email_data.to_csv("emails_with_predicted_categories.csv", index=False)

In [5]:
def compare_categories(row):
    """Grade one row by comparing its labelled and predicted category sets.

    Both `row["category"]` and `row["predicted_category"]` are split on ", "
    and compared as sets, so order and duplicates are ignored.

    Returns:
        "green" when the two sets are equal, "teal" when they share at least
        one category, and "red" when they share none.
    """
    expected_labels = set(row["category"].split(", "))
    predicted_labels = set(row["predicted_category"].split(", "))

    if expected_labels == predicted_labels:
        return "green"

    # Not identical: any shared label counts as a partial match.
    return "teal" if expected_labels & predicted_labels else "red"


# Grade every row with compare_categories ("green" / "teal" / "red").
email_data["comparison_result"] = email_data.apply(compare_categories, axis=1)

# Accuracy = share of rows graded "green" (exact set match), as a percentage.
correct_predictions = int(email_data["comparison_result"].eq("green").sum())
total_rows = len(email_data)
accuracy_percentage = (correct_predictions / total_rows) * 100


def color_predicted_category(s):
    """Build one CSS background-colour declaration per row of `email_data`.

    Args:
        s: The column handed in by `Styler.apply`. It is not read; the colours
            come from `email_data["comparison_result"]` in row order.

    Returns:
        A list of "background-color: <colour>" strings.
    """
    styles = []
    for result_color in email_data["comparison_result"]:
        styles.append("background-color: " + result_color)
    return styles


# Create a Styler object (excluding the comparison_result column).
# Dropping a column keeps the row order, which matters here because
# color_predicted_category returns one style per row of email_data, in order.
display_email_data = email_data.drop(columns=["comparison_result"])
styled_email_data = display_email_data.style.apply(
    color_predicted_category, axis=0, subset=["predicted_category"]
)

# Display the accuracy percentage (share of exact matches, two decimals)
print(f"Percentage of correctly predicted categories: {accuracy_percentage:.2f}%")


# Save the styled DataFrame to an HTML file
styled_email_data.to_html("category_comparison_result.html")

Percentage of correctly predicted categories: 73.33%


In [6]:
# Colour legend for the predicted_category column:
#   green = predicted set equals the labelled set (good)
#   teal  = the sets share at least one category (okay)
#   red   = no category in common (bad)
display(styled_email_data)

Unnamed: 0,question,category,predicted_category
1,When will my order be delivered?,"Shipping, Order Management","Shipping, Order Management"
2,How do I return an item that doesn't fit?,"Returns, Product Information","Returns, Order Management"
3,Is this product available in blue?,Product Information,Product Information
4,My discount code isn't working. What should I do?,"Promotions, Technical Support","Promotions, Technical Support"
5,Can I change my shipping address after placing an order?,"Order Management, Shipping","Shipping, Order Management"
6,How do I track my package?,"Shipping, Order Management","Shipping, Order Management"
7,Are your products cruelty-free?,Product Information,Product Information
8,I received a damaged item. What are the next steps?,"Returns, Order Management","Returns, Order Management"
9,Do you offer international shipping?,"Shipping, Product Information",Shipping
10,How can I cancel my subscription?,Order Management,Order Management
