In [104]:
from mongodb_lib import *
import yaml
import pandas as pd
from collections import defaultdict
from openai_handlers import *
import re
from tqdm import tqdm
import os
import pickle
import ast
from transformers import pipeline

In [105]:
config_infra = yaml.load(open("../infra-config-pipeline.yaml"), Loader=yaml.FullLoader)
db, fs, client = connect_to_mongodb(config_infra)

INFO:root:Successfully connected to MongoDB.


In [106]:
# Import taxonomy

taxonomy = pd.read_excel("../Categories.xlsx")

df_categories = taxonomy[["Category", "Description & Keywords"]]
df_categories.drop_duplicates(inplace=True)
categories = list(df_categories["Category"])
categories = [el.split(": ")[1] for el in categories]
categories_descriptions = list(df_categories["Description & Keywords"])
categories_descriptions = [el.split("\n")[0].replace("- Description: ", "") for el in categories_descriptions]
full_labels = [x.lower() + ' and it mentions ' + y.lower() for x, y in zip(categories, categories_descriptions)]

fulllabels2cat = dict(zip(full_labels, categories))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categories.drop_duplicates(inplace=True)


In [107]:
data = read_object(fs, "product_textual_lang_summarized")
data = pd.DataFrame(data)
data.fillna("", inplace=True)

texts = data["pdt_product_detail_PRODUCTDESCRIPTION_SUMMARIZED"].tolist()
product_ids = data["PRODUCTCODE"].tolist()

In [111]:
zeroshot_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/bge-m3-zeroshot-v2.0")

# Iterate through texts and product_ids
for t, p in tqdm(zip(texts, product_ids), total=len(texts)):

    filename = f"../tmp/filtered_labels_{p}.pkl"

    # Check if the file already exists
    if not os.path.exists(filename):

        context = "This is a tour description: "
        text = context + t
        hypothesis_template = "A possible category for this tour description is {}"

        # Perform zero-shot classification
        output = zeroshot_classifier(text, full_labels, hypothesis_template=hypothesis_template, multi_label=True)
        filtered_labels = [label for label, score in zip(output['labels'], output['scores']) if score > 0.85]
        filtered_labels = [fulllabels2cat[el] for el in filtered_labels]

        print(text, filtered_labels)

        with open(filename, 'wb') as f:

            pickle.dump(filtered_labels, f)

  0%|          | 1/21262 [00:23<138:11:50, 23.40s/it]

This is a tour description: This route through Catalonia will be accompanied by the excellent Mediterranean climate. An excellent opportunity to get to know this region, its people, customs and this environment of great natural and cultural value. []
