# This code is run on Google Colab! It is reproduced here for reference.

In [None]:
# Notebook shell magics: install the packages imported by the later cells
# (`transformers` and `sentence_transformers`).
!pip install transformers
!pip install sentence-transformers

In [38]:
# Relative locations of the input files.
# Raw strings keep every backslash literal. The previous non-raw literals only
# worked because Python leaves unrecognised escapes such as "\." and "\p"
# untouched (with a warning on newer versions), and "\r" had to be escaped by hand.
# NOTE(review): these are Windows-style separators; on a Linux host (e.g. Colab)
# they would not resolve as directories - confirm where this cell is run.
collin_college_path = r"..\..\public\client_files\collin-college\collin_college.csv"
taxonomy_path = r"..\..\public\client_files\resource_taxonomy.xml"

In [None]:
import xml.etree.ElementTree as ET

# Load and parse the taxonomy XML file.
tree = ET.parse(taxonomy_path)
root = tree.getroot()

services_xml = root.find("services")
descriptions_xml = services_xml.find("descriptions")

# Map every <category> title to its (whitespace-stripped) description text.
categories = {}
for category in descriptions_xml.findall("category"):
    category_title = category.attrib["title"]
    # ElementTree reports the text of an empty element as None; fall back to
    # "" so an empty <category/> does not fail on None.strip().
    category_description = (category.text or "").strip()
    categories[category_title] = category_description
print(categories)

# Tag name of the child elements expected under each taxonomy level.
_CHILD_TAG = {
    "top_level": "second_level",
    "second_level": "third_level",
    "third_level": "fourth_level",
}


def _build_node(element):
    """Convert a taxonomy element into its nested tag representation.

    Returns ``{title: [children...]}`` when the element has children of the
    next level, otherwise just the element's ``title`` attribute.
    """
    title = element.get("title")
    child_tag = _CHILD_TAG.get(element.tag)
    # fourth_level elements have no entry in _CHILD_TAG, so they are always leaves.
    children = [_build_node(child) for child in element.findall(child_tag)] if child_tag else []
    return {title: children} if children else title


# resource_tags maps each top-level title to the list of its second-level
# nodes, each of which is either a bare title or a {title: children} dict.
resource_tags = {}
for top_level in services_xml.findall("top_level"):
    top_title = top_level.get("title")
    # NOTE: appending the category description to the title is disabled:
    # top_title += ": " + categories[top_title.lower()]
    resource_tags[top_title] = [
        _build_node(second_level) for second_level in top_level.findall("second_level")
    ]

In [None]:
# Maps each leaf tag title to the sentence add_semantic() builds for it;
# populated by extract_leaf_tags().
leaf_tags = {}

def add_semantic(path):
    """Turn a "|"-separated taxonomy path into a descriptive sentence.

    The first segment is treated as the top category, the last segment as the
    resource itself, and any segments in between are listed as associations.

    Example:
        "A|B|C|D" -> "This resource is about D, associated with B, C and is
        under A category."

    Args:
        path: Taxonomy path whose segments are joined with "|".

    Returns:
        The generated sentence.
    """
    segments = path.split("|")
    head, leaf = segments[0], segments[-1]
    middle = segments[1:-1]
    # The association clause is only emitted when intermediate levels exist.
    association = f", associated with {', '.join(middle)}" if middle else ""
    return f"This resource is about {leaf}{association} and is under {head} category."

def extract_leaf_tags(data, path):
    """Recursively collect the leaves of a nested tag structure into leaf_tags.

    Args:
        data: A list (each item is visited with the same path), a dict (each
            key extends the path with "|" before its value is visited), or any
            other value, which is treated as a leaf.
        path: "|"-separated path of the titles leading to ``data``.

    Side effects:
        For every leaf, stores ``leaf_tags[leaf] = add_semantic(path + "|" + leaf)``.
        A leaf seen more than once overwrites the earlier entry.
    """
    if isinstance(data, dict):
        for title, children in data.items():
            # An empty path means the key itself starts the path.
            child_path = f"{path}|{title}" if path else title
            extract_leaf_tags(children, child_path)
        return

    if isinstance(data, list):
        for element in data:
            extract_leaf_tags(element, path)
        return

    # Anything that is neither a list nor a dict is a leaf tag.
    leaf_tags[data] = add_semantic(f"{path}|{data}")


# Walk every top-level branch of resource_tags, using the branch title as the
# root of the path, so that leaf_tags ends up holding every leaf tag.
for branch_title, branch_children in resource_tags.items():
    extract_leaf_tags(branch_children, branch_title)

print(leaf_tags)

In [46]:
import pandas as pd

# Read the client CSV (ISO-8859-1 encoded).
df = pd.read_csv(collin_college_path, encoding="ISO-8859-1")

# Map each "Name" to its "Description". Pairing the two columns directly avoids
# a row-by-row iterrows() loop; as before, a repeated name keeps the
# description of its last occurrence.
entities = dict(zip(df["Name"], df["Description"]))
print(entities)

In [None]:
from transformers import pipeline

# Build the zero-shot classification pipeline (BART-large MNLI on the first CUDA device).
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device="cuda:0")

# Candidate labels are the leaf tag titles.
classes = list(leaf_tags)
for name, description in entities.items():
    print(f"Name: {name}")
    print(f"Description: {description}")

    # multi_label=True scores every candidate label independently.
    result = classifier(description, classes, multi_label=True)
    ranked = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep only the labels scoring above 0.5, best first.
    good_tags = [label.lower() for label, score in ranked if score > 0.5]
    services = ", ".join(good_tags).capitalize()
    print("Services: " + services)
    print("\n")

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Embed every candidate class once, up front, with a sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
classes_embeddings = model.encode(classes, convert_to_tensor=True)

# Reverse index: generated sentence -> tag title (first tag wins on duplicates).
# The previous per-label scan compared each label against the *sentences* in
# leaf_tags; when `classes` holds tag titles nothing matched, so every tag was
# printed as "None". Falling back to the label itself works for both cases.
# NOTE(review): the old lookup suggests `classes` may have been meant to hold
# the sentences from leaf_tags.values() - confirm which text should be embedded.
tag_by_sentence = {}
for tag, sentence in leaf_tags.items():
    tag_by_sentence.setdefault(sentence, tag)

for name, description in entities.items():
    print(f"Name: {name}")
    print(f"Description: {description}")
    print("Tags:")

    # Cosine similarity between this description and every class embedding.
    description_embedding = model.encode(description, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(description_embedding, classes_embeddings)

    # Rank the classes by similarity and keep the ten best.
    combined = list(zip(classes, cosine_scores[0].cpu().numpy()))
    sorted_combined = sorted(combined, key=lambda x: x[1], reverse=True)
    top_10 = sorted_combined[:10]

    for label, score in top_10:
        label = tag_by_sentence.get(label, label)
        print(f" - {label}: {score:.4f}")
    print("\n")