<a href="https://colab.research.google.com/github/VictorIM1/veridion-classifier/blob/main/label_description_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Insurance Label Descriptions & Embedding Workflow
Cod complet pentru:
- Generarea de descrieri pentru etichete din taxonomie
- Salvare incrementală în JSON
- Generare embeddings cu all-MiniLM-L6-v2
- Export CSV pentru utilizare ulterioară


In [1]:
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from pathlib import Path


In [2]:
# Taxonomy: the "label" column of the CSV, as a plain Python list.
taxonomy_path = Path("insurance_taxonomy.csv")
labels = pd.read_csv(taxonomy_path).loc[:, "label"].tolist()

# GPT-2 tokenizer and causal LM used to draft the label descriptions.
gpt_model_name = "gpt2"
gpt_tokenizer = AutoTokenizer.from_pretrained(gpt_model_name)
# .eval() returns the module itself, so loading and switching to inference
# mode can be chained.
gpt_model = AutoModelForCausalLM.from_pretrained(gpt_model_name).eval()

# Run on the GPU whenever CUDA is available; otherwise the model stays put.
if torch.cuda.is_available():
    gpt_model.to("cuda")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
def generate_description(label):
    """Draft a short free-text description for a taxonomy label with GPT-2.

    Args:
        label: Label name; it is interpolated into the prompt.

    Returns:
        The sampled continuation (at most 60 new tokens), stripped of
        surrounding whitespace. The prompt itself is not included.
        Sampling is enabled (``do_sample=True``), so repeated calls with the
        same label generally return different text.
    """
    prompt = f"Describe the service called '{label}' in a clear and neutral way."
    inputs = gpt_tokenizer(prompt, return_tensors="pt").to(gpt_model.device)
    outputs = gpt_model.generate(
        **inputs,
        max_new_tokens=60,
        do_sample=True,
        top_p=0.95,
        temperature=0.9,
        # GPT-2 defines no pad token; reuse EOS so generate() can pad.
        pad_token_id=gpt_tokenizer.eos_token_id,
    )
    # Drop the prompt by token count rather than by len(prompt) characters:
    # decoding is not guaranteed to reproduce the prompt text exactly (the
    # tokenizer may clean up spacing), which would shift a character-based cut.
    prompt_length = inputs["input_ids"].shape[1]
    continuation_ids = outputs[0][prompt_length:]
    return gpt_tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()


In [4]:
output_path = Path("generated_label_descriptions.json")


def _save_descriptions(mapping, path):
    """Write ``mapping`` to ``path`` as indented UTF-8 JSON (non-ASCII kept as-is)."""
    path.write_text(json.dumps(mapping, indent=2, ensure_ascii=False), encoding="utf-8")


# Resume from a previous run when a checkpoint file already exists.
descriptions = {}
if output_path.exists():
    descriptions = json.loads(output_path.read_text(encoding="utf-8"))

for idx, label in enumerate(tqdm(labels)):
    # Skip only labels that already have a non-empty description. A failed
    # generation is stored as "" below, and testing mere key presence would
    # make that failure permanent: it would never be retried on a later run.
    if descriptions.get(label):
        continue
    try:
        descriptions[label] = generate_description(label)
    except Exception as e:
        # Best effort: report the failure and keep going. The empty
        # placeholder keeps the label present in the output file.
        print(f"Eroare la '{label}': {e}")
        descriptions[label] = ""
    # Checkpoint every 50 positions of the label list.
    if (idx + 1) % 50 == 0:
        _save_descriptions(descriptions, output_path)
        print(f"[SALVAT] {idx + 1} descrieri generate...")

# Final save so labels processed after the last checkpoint are persisted too.
_save_descriptions(descriptions, output_path)

 23%|██▎       | 50/220 [00:32<02:04,  1.37it/s]

[SALVAT] 50 descrieri generate...


 45%|████▌     | 100/220 [01:01<01:11,  1.67it/s]

[SALVAT] 100 descrieri generate...


 68%|██████▊   | 150/220 [01:34<00:44,  1.56it/s]

[SALVAT] 150 descrieri generate...


 91%|█████████ | 200/220 [02:04<00:12,  1.66it/s]

[SALVAT] 200 descrieri generate...


100%|██████████| 220/220 [02:18<00:00,  1.59it/s]


67933

In [8]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import json

# Sentence encoder used to embed every label description.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Reload the descriptions from disk so this cell does not depend on the
# in-memory result of the generation cell.
with open("generated_label_descriptions.json", encoding="utf-8") as f:
    descriptions = json.load(f)

# Dicts preserve insertion order, so keys and values stay aligned.
labels = list(descriptions)
desc_texts = list(descriptions.values())

embeddings = model.encode(desc_texts, show_progress_bar=True)

# One row per label; the index column is written to the CSV under "label".
df_embed = pd.DataFrame(embeddings, index=pd.Index(labels, name="label"))
df_embed.to_csv("label_embeddings.csv")
print("✅ label_embeddings.csv a fost salvat cu succes.")


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

✅ label_embeddings.csv a fost salvat cu succes.


In [10]:
import json

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from transformers import pipeline

# Load the generated descriptions and the precomputed label embeddings.
with open("generated_label_descriptions.json", encoding="utf-8") as f:
    label_descriptions = json.load(f)

label_embeddings_df = pd.read_csv("label_embeddings.csv", index_col="label")
label_names = label_embeddings_df.index.tolist()
# float32 tensor with one row per label, in the same order as label_names.
label_embeddings = torch.tensor(label_embeddings_df.to_numpy(dtype="float32"))

# Models: the zero-shot classifier picks the final label, the sentence
# encoder embeds the company texts.
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the companies; missing values become empty strings.
df_comp = pd.read_csv("company_list.csv").fillna("")

def build_input_text(row):
    """Concatenate a company's descriptive fields into one string.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row) holding
            string values under "description", "business_tags", "sector",
            "category" and "niche".

    Returns:
        The five field values, in that order, joined by single spaces.
        Empty fields are kept, so they show up as consecutive spaces.
    """
    text_fields = ("description", "business_tags", "sector", "category", "niche")
    return " ".join(row[field] for field in text_fields)

# Build one text per company and embed all of them up front.
company_inputs = df_comp.apply(build_input_text, axis=1).tolist()
company_embeddings = embedding_model.encode(company_inputs, show_progress_bar=True)

# Classification settings.
TOP_K_CANDIDATES = 20     # labels shortlisted per company by cosine similarity
PARTIAL_SAVE_EVERY = 200  # write a checkpoint CSV every this many companies
partial_save_path = "partial_predictions.csv"
final_save_path = "final_predictions.csv"

# The label embeddings never change inside the loop, so compute the full
# (companies x labels) cosine-similarity matrix once and take the top-k label
# indices per row (topk returns them sorted by descending similarity).
similarity = util.cos_sim(company_embeddings, label_embeddings)
shortlist_size = min(TOP_K_CANDIDATES, len(label_names))  # topk rejects k > n
top_indices_per_company = torch.topk(similarity, k=shortlist_size, dim=1).indices.tolist()

results = []
for i, (raw_text, top_indices) in enumerate(
    tqdm(zip(company_inputs, top_indices_per_company), total=len(company_inputs))
):
    candidate_labels = [label_names[idx] for idx in top_indices]

    try:
        # Zero-shot classification restricted to the shortlisted labels;
        # the pipeline returns labels ordered by score, best first.
        zshot_result = zero_shot(raw_text, candidate_labels, multi_label=False)
        top_label = zshot_result["labels"][0]
        score = zshot_result["scores"][0]
    except Exception as e:
        # Best effort: record an empty prediction and continue with the rest.
        print(f"Eroare zero-shot la index {i}: {e}")
        top_label = ""
        score = 0.0

    results.append({
        "company_index": i,
        "predicted_label": top_label,
        "score": score,
        "candidates": candidate_labels,
        "raw_text": raw_text
    })

    # Periodic checkpoint so a crash does not lose hours of predictions.
    if (i + 1) % PARTIAL_SAVE_EVERY == 0:
        pd.DataFrame(results).to_csv(partial_save_path, index=False)
        print(f"✅ Salvate predictii partiale pentru {i + 1} companii...")

pd.DataFrame(results).to_csv(final_save_path, index=False)
print("🎯 Clasificare finalizata. Rezultatele sunt salvate in final_predictions.csv")


Device set to use cuda:0


Batches:   0%|          | 0/297 [00:00<?, ?it/s]

  0%|          | 10/9494 [00:09<2:05:58,  1.25it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  2%|▏         | 200/9494 [03:14<2:41:52,  1.05s/it]

✅ Salvate predictii partiale pentru 200 companii...


  4%|▍         | 400/9494 [06:43<3:22:20,  1.34s/it]

✅ Salvate predictii partiale pentru 400 companii...


  6%|▋         | 600/9494 [10:02<2:24:42,  1.02it/s]

✅ Salvate predictii partiale pentru 600 companii...


  8%|▊         | 800/9494 [13:22<2:28:58,  1.03s/it]

✅ Salvate predictii partiale pentru 800 companii...


 11%|█         | 1000/9494 [16:41<2:15:18,  1.05it/s]

✅ Salvate predictii partiale pentru 1000 companii...


 13%|█▎        | 1200/9494 [19:53<2:25:43,  1.05s/it]

✅ Salvate predictii partiale pentru 1200 companii...


 15%|█▍        | 1400/9494 [23:08<2:36:17,  1.16s/it]

✅ Salvate predictii partiale pentru 1400 companii...


 17%|█▋        | 1600/9494 [26:26<1:52:39,  1.17it/s]

✅ Salvate predictii partiale pentru 1600 companii...


 19%|█▉        | 1800/9494 [29:40<2:14:13,  1.05s/it]

✅ Salvate predictii partiale pentru 1800 companii...


 21%|██        | 2000/9494 [32:53<1:53:42,  1.10it/s]

✅ Salvate predictii partiale pentru 2000 companii...


 23%|██▎       | 2200/9494 [36:18<2:11:50,  1.08s/it]

✅ Salvate predictii partiale pentru 2200 companii...


 25%|██▌       | 2400/9494 [39:38<1:44:14,  1.13it/s]

✅ Salvate predictii partiale pentru 2400 companii...


 27%|██▋       | 2600/9494 [43:03<1:56:01,  1.01s/it]

✅ Salvate predictii partiale pentru 2600 companii...


 29%|██▉       | 2800/9494 [46:32<1:59:15,  1.07s/it]

✅ Salvate predictii partiale pentru 2800 companii...


 32%|███▏      | 3000/9494 [49:48<2:13:17,  1.23s/it]

✅ Salvate predictii partiale pentru 3000 companii...


 34%|███▎      | 3200/9494 [53:02<1:35:31,  1.10it/s]

✅ Salvate predictii partiale pentru 3200 companii...


 36%|███▌      | 3400/9494 [56:31<1:53:11,  1.11s/it]

✅ Salvate predictii partiale pentru 3400 companii...


 38%|███▊      | 3600/9494 [59:46<1:43:44,  1.06s/it]

✅ Salvate predictii partiale pentru 3600 companii...


 40%|████      | 3800/9494 [1:03:11<1:58:20,  1.25s/it]

✅ Salvate predictii partiale pentru 3800 companii...


 42%|████▏     | 4000/9494 [1:06:25<1:30:54,  1.01it/s]

✅ Salvate predictii partiale pentru 4000 companii...


 44%|████▍     | 4200/9494 [1:09:40<1:20:33,  1.10it/s]

✅ Salvate predictii partiale pentru 4200 companii...


 46%|████▋     | 4400/9494 [1:13:03<1:23:50,  1.01it/s]

✅ Salvate predictii partiale pentru 4400 companii...


 48%|████▊     | 4600/9494 [1:16:20<1:38:27,  1.21s/it]

✅ Salvate predictii partiale pentru 4600 companii...


 51%|█████     | 4800/9494 [1:19:39<1:33:29,  1.20s/it]

✅ Salvate predictii partiale pentru 4800 companii...


 53%|█████▎    | 5000/9494 [1:22:59<1:16:32,  1.02s/it]

✅ Salvate predictii partiale pentru 5000 companii...


 55%|█████▍    | 5200/9494 [1:26:19<1:06:09,  1.08it/s]

✅ Salvate predictii partiale pentru 5200 companii...


 57%|█████▋    | 5400/9494 [1:29:33<1:09:18,  1.02s/it]

✅ Salvate predictii partiale pentru 5400 companii...


 59%|█████▉    | 5600/9494 [1:32:51<1:14:32,  1.15s/it]

✅ Salvate predictii partiale pentru 5600 companii...


 61%|██████    | 5800/9494 [1:36:19<1:18:36,  1.28s/it]

✅ Salvate predictii partiale pentru 5800 companii...


 63%|██████▎   | 6000/9494 [1:39:36<53:29,  1.09it/s]

✅ Salvate predictii partiale pentru 6000 companii...


 65%|██████▌   | 6200/9494 [1:42:50<41:56,  1.31it/s]

✅ Salvate predictii partiale pentru 6200 companii...


 67%|██████▋   | 6400/9494 [1:46:09<52:05,  1.01s/it]

✅ Salvate predictii partiale pentru 6400 companii...


 70%|██████▉   | 6600/9494 [1:49:23<50:25,  1.05s/it]

✅ Salvate predictii partiale pentru 6600 companii...


 72%|███████▏  | 6800/9494 [1:52:46<45:12,  1.01s/it]

✅ Salvate predictii partiale pentru 6800 companii...


 74%|███████▎  | 7000/9494 [1:56:04<42:05,  1.01s/it]

✅ Salvate predictii partiale pentru 7000 companii...


 76%|███████▌  | 7200/9494 [1:59:27<46:43,  1.22s/it]

✅ Salvate predictii partiale pentru 7200 companii...


 78%|███████▊  | 7400/9494 [2:02:46<45:38,  1.31s/it]

✅ Salvate predictii partiale pentru 7400 companii...


 80%|████████  | 7600/9494 [2:06:00<39:16,  1.24s/it]

✅ Salvate predictii partiale pentru 7600 companii...


 82%|████████▏ | 7800/9494 [2:09:21<37:19,  1.32s/it]

✅ Salvate predictii partiale pentru 7800 companii...


 84%|████████▍ | 8000/9494 [2:12:40<30:18,  1.22s/it]

✅ Salvate predictii partiale pentru 8000 companii...


 86%|████████▋ | 8200/9494 [2:15:57<30:44,  1.43s/it]

✅ Salvate predictii partiale pentru 8200 companii...


 88%|████████▊ | 8400/9494 [2:19:18<20:34,  1.13s/it]

✅ Salvate predictii partiale pentru 8400 companii...


 91%|█████████ | 8600/9494 [2:22:32<17:18,  1.16s/it]

✅ Salvate predictii partiale pentru 8600 companii...


 93%|█████████▎| 8800/9494 [2:25:54<12:44,  1.10s/it]

✅ Salvate predictii partiale pentru 8800 companii...


 95%|█████████▍| 9000/9494 [2:29:11<10:45,  1.31s/it]

✅ Salvate predictii partiale pentru 9000 companii...


 97%|█████████▋| 9200/9494 [2:32:22<05:46,  1.18s/it]

✅ Salvate predictii partiale pentru 9200 companii...


 99%|█████████▉| 9400/9494 [2:35:42<01:48,  1.16s/it]

✅ Salvate predictii partiale pentru 9400 companii...


100%|██████████| 9494/9494 [2:37:21<00:00,  1.01it/s]


🎯 Clasificare finalizata. Rezultatele sunt salvate in final_predictions.csv
