In [149]:
from mongodb_lib import *
import yaml
import pandas as pd
from collections import defaultdict
from openai_handlers import *
import re
from tqdm import tqdm
import ast

In [150]:
config_infra = yaml.load(open("../infra-config-pipeline.yaml"), Loader=yaml.FullLoader)
db, fs, client = connect_to_mongodb(config_infra)

INFO:root:Successfully connected to MongoDB.


In [197]:
# Import taxonomy

taxonomy = pd.read_excel("../Categories.xlsx")
taxonomy = taxonomy[["Sub-category", "Description & Keywords.1"]]

taxonomy_json = defaultdict(lambda: defaultdict(list))

labels_with_descriptions = list()
description2id = {}

for key, value in zip(taxonomy["Sub-category"], taxonomy["Description & Keywords.1"]):

    entries = value.split("\n")
    assert len(entries) == 2

    labels_with_descriptions.append(key)

In [171]:
data = read_object(fs, "product_textual_lang_summarized")
data = pd.DataFrame(data)
data.fillna("", inplace=True)
data = data[["pdt_product_detail_PRODUCTDESCRIPTION_SUMMARIZED"]]
data.columns = ["description"]

In [178]:
conversation_history = [
    {"role": "system", "content": "Hello! How can I assist you today?"}
]

prompt_template = (
    "You are a multi-label classifier tasked with finding all labels applicable to a product description. "
    "For this, you will receive a list of labels. "
    "Only include a label in your output if it could really be considered a category of the product description. "
    "In the next prompt, I will provide you with a list of texts, and you should return a Python list of lists "
    "with all applicable labels for each product description. If no labels apply to a product description, return an empty list ([]). "
    "Provide your response as a Python list of lists with the results, where each inner list corresponds to the labels for each text provided, in the same order. "
    "In your answer, ONLY return a Python list with the results, nothing else. "
    f"Here are the possible labels:\n{list_sub_categories}. "
    "Are you ready to begin?"
)

initial_prompt = prompt_template

result = query_gpt_with_history(apikey, initial_prompt, conversation_history)
result = result.choices[0].message.content
conversation_history.append({"role": "user", "content": initial_prompt})
conversation_history.append({"role": "system", "content": result})
print(result)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Yes, I'm ready to begin. Please go ahead and provide me with the list of texts for which you want me to determine the applicable labels.


In [187]:
texts = data['description'].tolist()[:50]

batch_size = 5
batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]

list_results = []

for batch in tqdm(batches):

    batch_prompt = (
        f"Here is the list of product descriptions: {batch}. "
        "Please provide the applicable labels for each description."
    )
    result = query_gpt_with_history(apikey, batch_prompt, conversation_history)

    try:
        result_text = ast.literal_eval(result.choices[0].message.content)
    except Exception as e:
        result_text = [[] for _ in range(batch_size)]

    if len(result_text) != len(batch):
        result_text = [[] for _ in range(batch_size)]

    list_results.extend(result_text)

  0%|          | 0/10 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 10%|█         | 1/10 [00:01<00:15,  1.68s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 20%|██        | 2/10 [00:04<00:19,  2.39s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 30%|███       | 3/10 [00:06<00:13,  1.99s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 40%|████      | 4/10 [00:07<00:10,  1.74s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 50%|█████     | 5/10 [00:09<00:08,  1.68s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 60%|██████    | 6/10 [00:11<00:08,  2.07s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 70%|███████   | 7/10 [00:14<00:06,  2.14s/it]INF

In [190]:
data_example = data[:50]

In [191]:
data_example["labels_batch"] = list_results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_example["labels_batch"] = list_results


In [192]:
data_example.to_excel("openai_annotation_example_4.xlsx", index=None)

# Zero shot

In [273]:
from transformers import pipeline
text = "Activity description: This museum has a vast collection of paintings and sculptures."
hypothesis_template = "This activity description can belong to the category {}"
classes_verbalized = ["History Museums: Historical artifacts, ancient civilizations, historical events, cultural heritage."]
zeroshot_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0")
output = zeroshot_classifier(text, classes_verbalized, hypothesis_template=hypothesis_template, multi_label=False)
print(output)

{'sequence': 'Activity description: This museum has a vast collection of paintings and sculptures.', 'labels': ['History Museums: Historical artifacts, ancient civilizations, historical events, cultural heritage.'], 'scores': [0.7074357867240906]}
