In [34]:
# Parameters
# Input files for this run.
# NOTE(review): these two paths are relative, and a later cell runs `cd ..`;
# confirm which working directory they are meant to be resolved against.
data_path     = "./custom_data/test_data.parquet"
labels_path   = "./custom_data/test_labels.parquet"
# NOTE(review): absolute, user-specific paths. Consider deriving them from an
# environment variable or one configurable base directory so the notebook runs
# on another machine.
archetype_directory = "/home/omadbek/projects/ArcheType"
run_all_directory = "/home/omadbek/projects/run_all"

In [35]:
#https://colab.research.google.com/drive/1BEZ_qgtVqSmOmCTuhHs7lHiYB5M5_myg?usp=sharing

import pandas as pd
import shutil
from pathlib import Path
import json
#import gzip
from tqdm.auto import tqdm
import subprocess
import time
import re
import requests
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from itertools import chain
import argparse
import os
import sys
from copy import deepcopy
import torch
from retry import retry
import random


In [36]:
!cat /home/omadbek/projects/ArcheType/src/const.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
EST_CHARS_PER_TOKEN=4
MAX_LEN=2000*EST_CHARS_PER_TOKEN
INTEGER_SET = set(r"0123456789,/\+-.^_()[] :")
BOOLEAN_SET = set(["True", "true", "False", "false", "yes", "Yes", "No", "no"])

ARCHETYPE_PATH = "/home/omadbek/projects/ArcheType"
DOTENV_PATH = "/home/omadbek/projects/ArcheType/.env"


In [37]:
cd ..

/home/omadbek/projects


In [38]:
import torch
print(torch.cuda.device_count())      # number of CUDA devices visible to this process
print(torch.cuda.get_device_name(0))  # name of the device at index 0

2
NVIDIA A100-SXM4-40GB


In [39]:
def fix_labels(label, label_set):
  """Map a raw label onto its canonical, lower-cased form.

  Steps, in order:
    1. lower-case and strip the incoming label;
    2. if ``label_set`` has an ``"abbrev_map"``, replace the label with its
       non-empty expansion when one exists;
    3. drop a trailing ``"/name"`` suffix;
    4. translate through ``label_set['dict_map']`` when the label is a key.

  Keys and values of both maps are compared lower-cased and stripped.

  Args:
    label: raw label string.
    label_set: dict with a required ``'dict_map'`` and optional ``'abbrev_map'``.

  Returns:
    The resolved label, lower-cased.
  """
  def _normalised(mapping):
    # Case- and whitespace-insensitive view of a str -> str mapping.
    return {key.lower().strip(): value.lower().strip() for key, value in mapping.items()}

  cleaned = label.lower().strip()
  canonical_map = _normalised(label_set['dict_map'])

  abbreviations = label_set.get("abbrev_map", -1)
  if abbreviations != -1:
    expanded = _normalised(abbreviations).get(cleaned, "")
    if expanded != "":
      cleaned = expanded

  if cleaned.endswith("/name"):
    cleaned = cleaned[:-len("/name")]

  # Fall back to the cleaned label itself when no remapping is defined.
  return canonical_map.get(cleaned, cleaned).lower()

In [40]:
# Earlier fixed label vocabulary, kept for reference:
#LABELS = ['age', 'case_status', 'contact_setting', 'date', 'gender', 'id',
#       'location', 'medical_boolean', 'occupation', 'outcome', 'symptoms']

# Current label list: contains only the placeholder 'none'.
LABELS = ['none']

In [41]:
# Top level of the label hierarchy: labels grouped by base data type.
sotab_integer_labels = ["age", "date"]
sotab_float_labels   = []

# everything else must go here
sotab_other_labels = [
  "case_status",
  "gender",
  "id",
  "location",
  "medical_boolean",
  "occupation",
  "outcome",
  "symptoms"
]

# Lookup used by get_llama_resp: base dtype name -> candidate labels.
sotab_top_hier = {
  "integer": sotab_integer_labels,
  "float":   sotab_float_labels,
  "other":   sotab_other_labels
}

# Second level: sub-groups of the "other" labels.
# NOTE(review): "case_status" is in sotab_other_labels but in none of the three
# sub-groups below — confirm whether that is intentional.
sotab_identifier = ["id"]
sotab_category   = ["gender", "medical_boolean", "outcome"]
sotab_text       = ["location", "symptoms", "occupation"]


# Lookup used by get_llama_resp: first-pass answer -> second-pass candidate labels
# (get_llama_resp falls back to the "text" entry when the answer is not a key).
# NOTE(review): "Identifier" is capitalised while the other keys are lower-case;
# the lookup is an exact-match .get(), so confirm the casing matches what the
# model is expected to answer.
sotab_other_hier = {
  "Identifier": sotab_identifier,
  "category":   sotab_category,
  "text":       sotab_text
}

In [42]:
# Seed value; this cell only stores it and does not seed any RNG.
rand_seed=13
# Rough characters-per-token estimate used to convert token budgets to characters.
EST_CHARS_PER_TOKEN=4
# Default character budget (2000 tokens' worth). Same values as the
# ArcheType src/const.py file printed earlier in this notebook.
MAX_LEN=2000*EST_CHARS_PER_TOKEN

In [43]:
# Checkpoint directory loaded by init_model() for the "alpaca-fine-tuned" model.
# NOTE(review): absolute, user-specific path — consider making it configurable.
model_path = "/home/omadbek/projects/alpaca/outputs"

In [44]:
# PROMPTS

def llm_prompts(agent, input_list, options_str = None):
    """Build the prompt text for one of the two labelling agents.

    Args:
        agent: ``"generator"`` selects the label-invention prompt; any other
            value selects the picker prompt.
        input_list: column sample interpolated into the prompt as-is.
        options_str: candidate labels interpolated into the picker prompt.
            It is not used by the generator prompt. When left at ``None`` the
            picker prompt contains the literal text ``None`` after ``OPTIONS:``.

    Returns:
        The prompt string, including the leading indentation of the
        triple-quoted template.
    """

    if agent == "generator":
        # Generator agent: invent one broad snake_case label for the input.
        prompt = f"""
            System: You are an epidemiology data steward.
            
            INSTRUCTIONS:
            • You’ll get a comma-separated list of anonymized tokens (IDs, dates, names, etc.).
            • Invent exactly one **broad**, high-level snake_case label that best summarizes what kind of field this is.
              – Think “date”, “identifier”, “status”, “count”, “category”, etc.
              – Avoid overly specific terms like “case_date_range_2020_july” or “lab_test_ids”.
            • Do NOT copy or overlap any part of the input tokens.
            • Return only that one label, no extra text.
            
            Now it’s your turn:
            Input: {input_list}
            Output:
        """
    else:

        # Known problem: the picker generalizes too much and does not follow the
        # input closely. For example, "age" and "id" end up with the same semantic
        # type, and the same happens with other labels.

        # Picker agent
        # ORIGINAL PROMPT
        # prompt = f"""
        #             SYSTEM: You are an epidemiology data steward labeling anonymized columns.
        #
        #             INSTRUCTIONS:
        #             • Choose exactly ONE label from the OPTIONS list that best describes the INPUT. Note that INPUT should be relative to the OPTIONS that you have. Don't select label if it's just a little bit matches with INPUT.
        #             • If none OPTIONS apply, you can generate new label that will match the INPUT. The generated label should be in snake_case format.
        #             • **Respond with EXACTLY the label token, no additional words, punctuation, or explanation.**
        #
        #             INPUT: {input_list}
        #             OPTIONS: {options_str}
        #             ANSWER:
        #         """

        # Version 1: every answer came back as patient_{label} — interesting and fairly good.
        # prompt = f"""
        #             SYSTEM: You are an epidemiology data steward labeling anonymized columns. Note that you will be working with patient data
        #
        #             INSTRUCTIONS:
        #             • Choose exactly ONE label from the OPTIONS list that best describes the INPUT. Note that INPUT should be relative to the OPTIONS that you have. Don't select label if it's just a little bit matches with INPUT.
        #             • If none OPTIONS apply, you can generate new label that will match the INPUT. The generated label should be in snake_case format.
        #             • **Respond with EXACTLY the label token, no additional words, punctuation, or explanation.**
        #
        #             INPUT: {input_list}
        #             OPTIONS: {options_str}
        #             ANSWER:
        #         """

        # Version 2 (active): tuned so that it now generates other labels as well,
        # instead of reusing a single label for everything.
        prompt = f"""
                    SYSTEM: You are an epidemiology data steward labeling anonymized columns. You will work with personal information and need to label columns accordingly.

                    INSTRUCTIONS:
                    • Choose exactly ONE label from the OPTIONS list that best describes the INPUT. Note that INPUT should be relative to the OPTIONS that you have. Do not try to match everything with only one label.
                    • If none OPTIONS apply, you can generate new label that will match the INPUT. The generated label should be in snake_case format.
                    • **Respond with EXACTLY the label token, no additional words, punctuation, or explanation.**

                    INPUT: {input_list}
                    OPTIONS: {options_str}
                    ANSWER:
                """

        # Alternative system message (not active):
        # prompt = f"""
        #             SYSTEM: You are a Column Type Annotator for epidemiological datasets. You’ll see anonymized column of patient data and assign each a concise semantic label or generate new one.
        #
        #             INSTRUCTIONS:
        #             • Choose exactly ONE label from the OPTIONS list that best describes the INPUT. Note that INPUT should be relative to the OPTIONS that you have. Do not try to match everything with only one label.
        #             • If none OPTIONS apply, you can generate new label that will match the INPUT. The generated label should be in snake_case format.
        #             • **Respond with EXACTLY the label token, no additional words, punctuation, or explanation.**
        #
        #             INPUT: {input_list}
        #             OPTIONS: {options_str}
        #             ANSWER:
        # """



    return prompt

    

In [45]:
import re
from typing import List, Tuple

from sentence_transformers import SentenceTransformer, util
from rapidfuzz import fuzz

# Load the sentence-embedding model once at module scope, pinned to the CPU;
# _embed_labels and _FuzzyLabelMatcher both encode through this shared instance.
_sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')

def _normalize_label(s: str) -> str:
    """
    Lowercase, drop non-alphanumerics,
    and strip a trailing 's' for crude singularization.
    """
    s = s.lower()
    s = re.sub(r'[^a-z0-9]', '', s)
    if s.endswith('s'):
        s = s[:-1]
    return s

def _embed_labels(labels: List[str]):
    """
    Encode ``labels`` with the shared module-level sentence model, asking it
    for normalized embeddings, and return whatever ``encode`` produces.
    """
    vectors = _sent_model.encode(labels, normalize_embeddings=True)
    return vectors

class _FuzzyLabelMatcher:
    def __init__(
        self,
        fixed_labels: List[str],
        embed_threshold: float,
        fuzz_threshold: int
    ):
        self.fixed_labels = fixed_labels
        self.embed_threshold = embed_threshold
        self.fuzz_threshold = fuzz_threshold
        self._update_embeddings()

    def _update_embeddings(self):
        self._embeddings = (
            _embed_labels(self.fixed_labels)
            if self.fixed_labels else
            None
        )

    def _find_match(self, label: str):

        norm_label = label.lower()

        if not self.fixed_labels:
            return None

        # 1) token- and exact-normalize as before…
        for existing in self.fixed_labels:
            tokens = re.split(r'[^a-zA-Z0-9]+', existing.lower())
            if norm_label in tokens:
                return existing

        # 2) compute all fuzzy scores & pick the best
        fuzz_scores = [
            (existing, fuzz.token_sort_ratio(label, existing))
            for existing in self.fixed_labels
        ]
        best_fuzz_label, best_fuzz_score = max(fuzz_scores, key=lambda x: x[1])

        # 3) compute all embed sims & pick the best
        emb     = _sent_model.encode([label], normalize_embeddings=True)[0]
        sims    = util.cos_sim(emb, self._embeddings)[0].tolist()
        best_idx = max(range(len(sims)), key=lambda i: sims[i])
        best_emb_label, best_emb_score = (
            self.fixed_labels[best_idx],
            sims[best_idx],
        )

        # 4) if either passes its threshold, choose the stronger signal
        if best_fuzz_score >= self.fuzz_threshold or best_emb_score >= self.embed_threshold:
            # pick whichever of the two is relatively stronger
            # (you could also prefer embed over fuzz, etc.)
            if best_emb_score >= best_fuzz_score / 100:
                return best_emb_label
            else:
                return best_fuzz_label

        return None

    def resolve(self, orig_label: str) -> str:
        """
        If `orig_label` matches an existing one, return that existing label.
        Otherwise, add `orig_label` to fixed_labels and return it.
        """
        match = self._find_match(orig_label)
        if match:
            return match

        # new label → add and update embeddings
        self.fixed_labels.append(orig_label)
        self._update_embeddings()
        return orig_label

def resolve_label(
    orig_label: str,
    fixed_labels: List[str],
    embed_threshold: float = 0.82,
    fuzz_threshold: int = 90
) -> str:
    """
    Resolve a newly generated label against the known labels.

    - If `orig_label` is similar to an existing label, that existing label
      is returned.
    - Otherwise `orig_label` is appended to `fixed_labels` IN PLACE and
      returned unchanged.

    Only the resolved label (a string) is returned; the updated label list is
    communicated through mutation of `fixed_labels`, not through the return
    value. Passing ``None`` is tolerated, but then any newly added label is
    not visible to the caller.

    A fresh matcher is built on every call, which re-encodes all of
    `fixed_labels`.
    """
    if fixed_labels is None:
        # Avoid AttributeError on append inside the matcher.
        fixed_labels = []
    print("_FuzzyLabelMatcher list: ", fixed_labels)
    matcher = _FuzzyLabelMatcher(fixed_labels, embed_threshold, fuzz_threshold)
    return matcher.resolve(orig_label)

In [51]:
#fixed = ["location", "age", "patient_ids", "date_of_birth"]

#resolved_label = resolve_label("date_values", fixed, embed_threshold = 0.40, fuzz_threshold = 40)
#print("Resolved label is: ", resolved_label)
#matcher = _FuzzyLabelMatcher(fixed, 0.82, 90)
#print(matcher.remap_label("patient_id", fixed))

_FuzzyLabelMatcher list:  ['location', 'age', 'patient_ids', 'date_of_birth']
Resolved label is:  date_of_birth


In [25]:
from accelerate import infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, \
    T5ForConditionalGeneration, LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline
import langchain
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from sentence_transformers import SentenceTransformer, util
import torch

#sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')


def set_pipeline(k=1):
    """Build a text-generation pipeline and wrap it for LangChain.

    Reads the module-level ``base_model``, ``tokenizer``, ``MAX_LEN`` and
    ``pt`` (prompt template); they must be assigned before this is called.

    Args:
        k: scaling step; temperature is ``0.5 * k`` and top_p is
           ``0.80 - 0.1 * k``.

    Returns:
        ``(pipe, local_llm, llm_chain)``: the Hugging Face pipeline, its
        ``HuggingFacePipeline`` wrapper, and an ``LLMChain`` using ``pt``.
    """
    # NOTE(review): no do_sample flag is passed here; confirm that sampling is
    # enabled elsewhere, otherwise temperature/top_p may not take effect.
    text_generator = pipeline(
        "text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=MAX_LEN,
        temperature=0.5 * k,
        top_p=0.80 - (0.1 * k),
        repetition_penalty=1.3,
    )
    wrapped_llm = HuggingFacePipeline(pipeline=text_generator)
    chain = LLMChain(prompt=pt, llm=wrapped_llm)
    return text_generator, wrapped_llm, chain


# Name of the model most recently requested through init_model().
curr_model = ""


def init_model(model):
    """Load the tokenizer and weights for ``model`` and build its prompt template.

    Args:
        model: one of "llama-65b", "alpaca-13b", "alpaca-fine-tuned",
            "vicuna-13b", "gpt4-x-alpaca", "t0pp", "flan-t5-xxl", "flan-ul2",
            "galpaca-30b", "opt-iml-max-30b".

    Returns:
        ``(base_model, tokenizer, template, pt, MAX_LEN)`` where ``template``
        is the raw prompt-template string, ``pt`` the ``PromptTemplate`` built
        from it, and ``MAX_LEN`` the context budget converted from tokens to
        characters (``tokens * EST_CHARS_PER_TOKEN - 200``).

    Raises:
        ValueError: if ``model`` is not a supported name.

    Side effects:
        Updates the module-level ``curr_model`` and empties the CUDA cache.
    """
    # Without this declaration the assignment below only bound a local name
    # and the module-level curr_model never changed.
    global curr_model

    supported_models = (
        "llama-65b", "alpaca-13b", "alpaca-fine-tuned", "vicuna-13b", "gpt4-x-alpaca",
        "t0pp", "flan-t5-xxl", "flan-ul2", "galpaca-30b", "opt-iml-max-30b",
    )
    # Fail early with a clear message instead of an UnboundLocalError later on.
    if model not in supported_models:
        raise ValueError(f"Unknown model {model!r}; expected one of: {', '.join(supported_models)}")

    curr_model = model
    torch.cuda.empty_cache()

    if model == "llama-65b":
        # NOTE(review): absolute, cluster-specific checkpoint path.
        LLAMA_PATH = "/scratch/bf996/text-generation-webui/models/llama-65b-hf"
        MAX_LEN = 2048
        tokenizer = LlamaTokenizer.from_pretrained(LLAMA_PATH)
        config = AutoConfig.from_pretrained(LLAMA_PATH,
                                            torch_dtype=torch.float16,
                                            load_in_8bit=True)
        # Build the architecture without allocating weights, then load the
        # checkpoint shards according to the inferred device map.
        with init_empty_weights():
            base_model = AutoModelForCausalLM.from_config(config)
        base_model.tie_weights()
        device_map = infer_auto_device_map(base_model, max_memory={0: "60GiB", "cpu": "96GiB"})
        base_model = load_checkpoint_and_dispatch(
            base_model,
            LLAMA_PATH,
            device_map=device_map
        )
    elif model == "alpaca-13b":
        MAX_LEN = 2048
        tokenizer = LlamaTokenizer.from_pretrained("chavinlo/alpaca-native")
        base_model = LlamaForCausalLM.from_pretrained(
            "chavinlo/alpaca-native",
            torch_dtype=torch.float16,
            load_in_8bit=True,
            device_map='auto',
        )
    elif model == "alpaca-fine-tuned":
        MAX_LEN = 2048
        # model_path is the module-level checkpoint directory.
        base_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    elif model == "vicuna-13b":
        MAX_LEN = 2048
        tokenizer = AutoTokenizer.from_pretrained("eachadea/vicuna-13b")
        base_model = AutoModelForCausalLM.from_pretrained(
            "eachadea/vicuna-13b",
            torch_dtype=torch.float16,
            load_in_8bit=True,
            device_map='auto',
        )
    elif model == "gpt4-x-alpaca":
        MAX_LEN = 2048
        tokenizer = AutoTokenizer.from_pretrained("chavinlo/gpt4-x-alpaca")
        base_model = AutoModelForCausalLM.from_pretrained("chavinlo/gpt4-x-alpaca", device_map="auto",
                                                          load_in_8bit=True)
    elif model == "t0pp":
        MAX_LEN = 512
        tokenizer = AutoTokenizer.from_pretrained("bigscience/T0pp")
        base_model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", device_map="auto",
                                                           torch_dtype=torch.float16, load_in_8bit=True)
    elif model == "flan-t5-xxl":
        MAX_LEN = 512
        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
        base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xxl", device_map="auto",
                                                           torch_dtype=torch.float16, load_in_8bit=True)
    elif model == "flan-ul2":
        MAX_LEN = 512
        base_model = T5ForConditionalGeneration.from_pretrained("google/flan-ul2", torch_dtype=torch.bfloat16,
                                                                device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")
    elif model == "galpaca-30b":
        MAX_LEN = 2048
        # The placement/precision options are model-loading arguments; they
        # were previously passed to the tokenizer, leaving the model to load
        # with defaults (no device map, no 8-bit quantisation).
        tokenizer = AutoTokenizer.from_pretrained("GeorgiaTechResearchInstitute/galpaca-30b")
        base_model = AutoModelForCausalLM.from_pretrained("GeorgiaTechResearchInstitute/galpaca-30b",
                                                          device_map="auto",
                                                          torch_dtype=torch.float16, load_in_8bit=True)
    elif model == "opt-iml-max-30b":
        MAX_LEN = 2048
        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-iml-max-30b", use_fast=False, padding_side='left')
        base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-iml-max-30b", device_map="auto",
                                                          torch_dtype=torch.float16)

    # The T5-family models take the bare instruction; the others get an
    # Alpaca-style wrapper.
    if model in ["flan-t5-xxl", "t0pp", "flan-ul2"]:
        template = """{instruction}"""
    else:
        template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

        ### Instruction: 
        {instruction}

        Answer:"""
    pt = PromptTemplate(template=template, input_variables=["instruction"])
    #Convert length from tokens to characters, leave room for model response
    MAX_LEN = MAX_LEN * EST_CHARS_PER_TOKEN - 200
    return base_model, tokenizer, template, pt, MAX_LEN

In [26]:
def get_sherlock_resp(df, gt_df, prompt_dict, model, label_indices, base_prompt, lsd):
    """Annotate columns with the Sherlock or Doduo baseline and record the results.

    Depends on names defined outside this cell: ``sherlock_model``,
    ``doduo_model``, ``extract_features`` and ``sherlock_to_cta``.

    Args:
        df: table whose columns are annotated; indexed with ``label_indices``.
        gt_df: ground truth. In the non-d4 case it is filtered on its
            ``'column_index'`` column and ``'label'`` is read; in the d4 case
            it is used directly as a key into ``lsd['d4_map']``.
        prompt_dict: results dict, updated in place; one entry per column,
            keyed by ``base_prompt + "_" + str(label_idx)``.
        model: model name; must contain "sherlock" or "doduo".
        label_indices: column selectors passed to ``df[...]``.
        base_prompt: prefix for the result keys.
        lsd: label-set dict; ``lsd['name']`` containing "d4" switches to d4 mode.

    Returns:
        The key of the last column visited by the loop.

    NOTE(review): if ``model`` contains neither "sherlock" nor "doduo",
    ``predicted_labels`` and ``iter_len`` are never assigned; if the loop runs
    zero times, ``prompt`` is never assigned. Both paths raise
    UnboundLocalError.
    """
    isd4 = "d4" in lsd['name']
    if "sherlock" in model:
        model = sherlock_model
        # One list of stringified cell values per selected column.
        data_m = pd.Series(df[label_indices].astype(str).T.values.tolist())
        # Features are written to a scratch CSV and read straight back.
        extract_features(
            "../temporary.csv",
            data_m
        )
        feature_vectors = pd.read_csv("../temporary.csv", dtype=np.float32)
        predicted_labels = model.predict(feature_vectors, "sherlock")
        iter_len = len(data_m)
    elif "doduo" in model:
        model = doduo_model
        data_m = df[label_indices]
        try:
            annot_m = doduo_model.annotate_columns(data_m)
            predicted_labels = annot_m.coltypes
        except Exception as e:
            print(f"Exception {e} in Doduo, returning default \n")
            # NOTE(review): if data_m is a DataFrame, len(data_m) is its row
            # count, not the number of columns — confirm the intended length.
            predicted_labels = ["text" for i in range(len(data_m))]
        iter_len = len(predicted_labels)
    # Translate each raw prediction through sherlock_to_cta; predictions
    # without an entry become a one-element list holding the raw prediction.
    predicted_labels_dict = {i: sherlock_to_cta.get(predicted_labels[i], [predicted_labels[i]]) for i in
                             range(iter_len)}

    for idx, label_idx in zip(range(iter_len), label_indices):
        prompt = base_prompt + "_" + str(label_idx)
        if isd4:
            # NOTE(review): d4 mode always reads prediction 0, not idx — confirm.
            ans = predicted_labels[0]
            label = [s.lower() for s in lsd['d4_map'][gt_df]]
        else:
            gt_row = gt_df[gt_df['column_index'] == label_idx]
            # Skip columns that do not have exactly one ground-truth row.
            if len(gt_row) != 1:
                continue
            label = fix_labels(gt_row['label'].item(), lsd)
            ans = [fix_labels(item, lsd) for item in predicted_labels_dict[idx]]
        if isd4:
            # d4: the single prediction must be one of the accepted labels.
            res = ans in label
        else:
            # otherwise: the ground truth must be among the candidate predictions.
            assert isinstance(ans, list), "ans should be a list"
            res = label in ans
        ans_dict = {"response": ans, "context": None, "ground_truth": label, "correct": res,
                    "orig_model_label": predicted_labels[idx]}
        prompt_dict[prompt] = ans_dict
    return prompt


@retry(Exception, tries=3, delay=3)
def get_chatgpt_resp(lsd: dict, context: str, ground_truth: str, prompt_dict: dict, response=True, session=None,
                     method=["similarity"], max_len=15000):
    """Ask gpt-3.5-turbo to pick a label for ``context`` and record the result.

    Args:
        lsd: label-set dict; ``lsd['label_set']`` supplies the candidate labels.
        context: column sample inserted into the prompt.
        ground_truth: expected label, compared with the matched answer.
        prompt_dict: results dict, updated in place and keyed by prompt text.
        response: when False the model is not queried and an empty answer is
            recorded.
        session: unused.
        method: options; "skip-existing" returns early for a prompt that is
            already recorded. The list is also forwarded to
            ``fuzzy_label_match``.
        max_len: length budget passed to ``prompt_context_insert``.

    Returns:
        The prompt string used as the key in ``prompt_dict``.
    """
    fixed_labels = [fix_labels(s, lsd) for s in lsd['label_set']]
    model = "gpt-3.5"
    context_labels = ", ".join(fixed_labels)
    # Longest labels first for the downstream matcher.
    fixed_labels = sorted(fixed_labels, key=len, reverse=True)
    prompt = prompt_context_insert(context_labels, context, max_len, "gpt-3.5")
    d_p = prompt_dict.get(prompt, -1)
    if d_p != -1 and "skip-existing" in method:
        return prompt
    elif d_p != -1:
        # Prompt already recorded: append '*' until the key is unused so the
        # earlier entry is not overwritten.
        while prompt_dict.get(prompt, -1) != -1:
            prompt = prompt + "*"
    if response:
        ans = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt},
            ],
            temperature=0,
        ).choices[0]['message']['content']
        ans_n = fuzzy_label_match(ans, fixed_labels, None, None, prompt, lsd, model, method=method)
    else:
        # Previously `ans` was left unbound on this path and the function
        # raised NameError; record an empty answer as get_llama_resp does.
        ans = ans_n = ""
    res = ans_n == ground_truth
    ans_dict = {"response": ans_n, "context": context, "ground_truth": ground_truth, "correct": res,
                "original_model_answer": ans}
    prompt_dict[prompt] = ans_dict
    return prompt


@retry(Exception, tries=5, delay=3)
def get_ada_resp(lsd: dict, context: str, ground_truth: str, prompt_dict: dict, response=True, session=None):
    """Query the fine-tuned ada model through the ``openai`` CLI and record the result.

    Args:
        lsd: label-set dict; ``lsd['label_set']`` (if present) supplies the
            candidate labels passed to the prompt builder.
        context: column sample inserted into the prompt.
        ground_truth: expected label; the answer is correct when it starts
            with this string after lower-casing and stripping.
        prompt_dict: results dict, updated in place and keyed by prompt text.
        response: when False the CLI is not invoked and an empty answer is
            recorded.
        session: unused.

    Returns:
        The prompt string used as the key in ``prompt_dict``.

    Raises:
        subprocess.CalledProcessError: if the CLI exits non-zero (after the
            retries configured by the decorator).
    """
    # Derive the candidate labels from lsd, as get_chatgpt_resp does, instead
    # of reading a `context_labels` name that this function never defined.
    context_labels = ", ".join(fix_labels(s, lsd) for s in lsd.get('label_set', []))
    prompt = prompt_context_insert(context_labels, context, MAX_LEN, "ada-personal")
    if prompt_dict.get(prompt, -1) != -1:
        # Already answered: keep the recorded result.
        return prompt
    if response:
        proc = subprocess.run(
            ["openai", "api", "completions.create", "-m", "ada:ft-personal:-2023-03-14-11-52-45", "-M", "3", "-p",
             prompt], capture_output=True, check=True)
        # The slice drops the first len(prompt) characters of stdout, i.e. it
        # assumes the CLI echoes the prompt before the completion.
        ans = proc.stdout.decode("utf-8")[len(prompt):].strip()
    else:
        ans = ""
    res = ans.lower().strip().startswith(ground_truth)
    ans_dict = {"response": ans, "context": context, "ground_truth": ground_truth, "correct": res}
    prompt_dict[prompt] = ans_dict
    return prompt

def call_llama_model(session, link, prompt, lsd, var_params, generated_labels_list=None):
    """POST ``prompt`` to the LLaMA endpoint and resolve the reply to a known label.

    Args:
        session: optional object with a ``post`` method (e.g. a
            ``requests.Session``); the ``requests`` module is used when falsy.
        link: URL of the generation endpoint.
        prompt: full prompt text.
        lsd: unused.
        var_params: unused.
        generated_labels_list: labels generated so far. It is mutated in place
            when the reply is a new label. ``None`` is treated as an empty
            list, in which case a newly added label is not visible to the
            caller.

    Returns:
        The resolved label string from ``resolve_label``.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    if generated_labels_list is None:
        # The default used to reach resolve_label as None and crash on append.
        generated_labels_list = []

    # NOTE(review): the "response" key read below looks like the Ollama
    # /api/generate reply format; if so, confirm that "max_tokens" is honoured
    # there (Ollama documents the limit as options.num_predict).
    payload = {
        "model":      "llama3.1:8b-instruct-q8_0",
        "prompt":     prompt,
        "max_tokens": 30,
        "stream":     False
    }

    # Choose session-based or direct requests call
    # NOTE(review): no timeout is set, so a stalled server blocks indefinitely.
    client = session or requests
    resp = client.post(link, json=payload)
    resp.raise_for_status()

    # Extract the generated text; a missing key yields an empty string.
    data = resp.json()
    text = data.get("response", "")

    print("LLama model was called")

    # Thresholds are deliberately looser than resolve_label's defaults.
    return resolve_label(text.strip(), generated_labels_list, embed_threshold = 0.40, fuzz_threshold = 40)

# Module-level sampling defaults.
# NOTE(review): get_topp_resp assigns its own local temperature/top_p, so these
# globals are not read by it; confirm whether anything else uses them.
temperature = 0
top_p = 0

def extract_answer(orig_ans: str) -> str:
    """
    Pull the answer out of raw model output.

    Looks for the first case-insensitive ``ANSWER:`` marker and returns the
    remainder of the line that follows it (whitespace after the colon,
    including newlines, is skipped first). Without a marker the whole input
    is returned. The result is stripped in both cases.
    """
    marker = re.search(r"ANSWER\s*:\s*(.*)", orig_ans, re.IGNORECASE)
    extracted = marker.group(1) if marker else orig_ans
    return extracted.strip()

#generated_labels_list = []

def generate_label(session, link, old_prompt, lsd, var_params, generated_labels_list, orig_ans):
    """Record ``orig_ans`` in ``generated_labels_list`` (in place).

    ``session``, ``link``, ``old_prompt``, ``lsd`` and ``var_params`` are not
    used by the current body.

    NOTE(review): the return statement references ``picker_prompt`` and
    ``picked``, neither of which is assigned in this function. Unless both
    exist as module-level names, every call raises NameError. The code that
    produced them appears to have been removed — restore it or return the
    intended values.
    """

    # Remember the answer unless it is "none" (case-insensitive) or already known.
    if orig_ans.lower() != "none" and orig_ans not in generated_labels_list:
        generated_labels_list.append(orig_ans)

    return picker_prompt, picked

    
    
    

@retry(Exception, tries=3, delay=3)
def get_topp_resp(prompt, k):
    """Generate a completion for ``prompt`` with the loaded model and extract the answer.

    Reads the module-level ``tokenizer``, ``base_model`` and ``MAX_LEN`` and
    moves the encoded prompt to CUDA.

    Args:
        prompt: full prompt text.
        k: scaling step; temperature is ``0.1 * k`` and top_p is
           ``0.90 - 0.1 * k``.

    Returns:
        The decoded output passed through ``extract_answer``.
    """
    # NOTE(review): no do_sample flag is passed; confirm sampling is enabled
    # elsewhere, otherwise temperature/top_p may not take effect.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").cuda()
    generated = base_model.generate(
        input_ids,
        max_length=MAX_LEN,
        temperature=0.1 * k,
        top_p=0.90 - (0.1 * k),
        repetition_penalty=1.3,
    )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return extract_answer(decoded)



@retry(Exception, tries=3, delay=3)
def get_llama_resp(lsd: dict, context: list, ground_truth: str, prompt_dict: dict, link: str, response=True,
                   session=None, cbc=None, model="llama", limited_context=None,
                   method=["ans_contains_gt", "gt_contains_ans", "resample"], generated_labels_list = None):
    """Label one column with the LLaMA-style pipeline and record the result.

    Args:
        lsd: label-set dict; ``lsd['name']`` containing "d4" enables d4 mode,
            in which ``ground_truth`` is expanded through ``lsd['d4_map']``.
        context: column sample inserted into the prompt.
        ground_truth: expected label (a key of ``lsd['d4_map']`` in d4 mode).
        prompt_dict: results dict, updated in place and keyed by prompt text.
        link: endpoint URL forwarded to ``query_correct_model``.
        response: when False no model is queried and an empty answer is recorded.
        session: forwarded to ``query_correct_model``.
        cbc: unused.
        model: model name; selects the prompt format.
        limited_context: sample passed to ``apply_basic_rules`` and
            ``get_base_dtype``.
        method: options; "hierarchical" enables two-pass labelling and
            "skip-existing" returns the recorded answer for a known prompt.
        generated_labels_list: labels generated so far; new answers are
            appended in place. With ``None`` nothing is tracked.

    Returns:
        ``(prompt, answer)``: the key stored in ``prompt_dict`` and the
        lower-cased answer.
    """
    isd4 = "d4" in lsd['name']
    if isd4:
        gtv = lsd['d4_map'][ground_truth]
        if isinstance(gtv, str):
            gtv = [gtv]
        ground_truth = [s.lower() for s in gtv]

    # Bound up front: with "hierarchical" requested in d4 mode the branch
    # below is skipped, and the later `dtype == "other"` test used to raise
    # NameError.
    dtype = None
    if "hierarchical" in method and not isd4:
        dtype = get_base_dtype(limited_context)
        fixed_labels = sotab_top_hier[dtype]
    else:
        fixed_labels = list(set([fix_labels(s, lsd) for s in lsd['label_set']]))
    context_labels = ", ".join(fixed_labels)
    fixed_labels = sorted(fixed_labels, key=len, reverse=True)
    # NOTE(review): prompt_context_insert checks for "opt-iml-max-30b-zs",
    # while this list says "opt-iml-30b-zs" — confirm which name is intended.
    if model in ["llama-zs", "opt-iml-30b-zs"]:
        pipe, local_llm, llm_chain = set_pipeline(k=1)
    prompt = prompt_context_insert(context_labels, context, MAX_LEN, model, options = generated_labels_list)
    d_p = prompt_dict.get(prompt, -1)
    #skip existing logic
    if d_p != -1 and "skip-existing" in method:
        return prompt, prompt_dict[prompt]["response"]
    elif d_p != -1:
        # Prompt already recorded: append '*' until the key is unused.
        while prompt_dict.get(prompt, -1) != -1:
            prompt = prompt + "*"

    #response logic
    if not response:
        orig_ans = ans_n = ""
    else:
        # Rule-based shortcut first; only query the model when no rule fires.
        orig_ans = apply_basic_rules(limited_context, None)
        if orig_ans is None:
            orig_ans = query_correct_model(model, prompt, context_labels, context, session, link, lsd, generated_labels_list)

            #hierarchical matching logic
            if "hierarchical" in method and dtype == "other" and orig_ans not in ['email', 'URL', 'WebHTMLAction',
                                                                                  'Photograph']:
                next_label_set = sotab_other_hier.get(orig_ans, -1)
                if next_label_set == -1:
                    print(f"Original answer {orig_ans} not found in hierarchy")
                    next_label_set = sotab_other_hier['text']
                fixed_labels = list(set([fix_labels(s, lsd) for s in next_label_set]))
                context_labels = ", ".join(fixed_labels)
                fixed_labels = sorted(fixed_labels, key=len, reverse=True)
                # NOTE(review): `prompt` is not rebuilt for the second pass;
                # only context_labels changes — confirm query_correct_model
                # uses it.
                orig_ans = query_correct_model(model, prompt, context_labels, context, session, link, lsd, generated_labels_list)
        ans_n = orig_ans.lower()

    print(f"LLM Picker 1 answer: {ans_n}... Should be none in the beginning")

    # Track newly seen answers. The None check keeps the default argument from
    # raising TypeError on the membership test.
    if generated_labels_list is not None and ans_n.lower() != "none" and ans_n not in generated_labels_list:
        generated_labels_list.append(ans_n)

    print(f"List of label so far: {generated_labels_list}")

    if isd4:
        res = ans_n in ground_truth
    else:
        res = ans_n == ground_truth

    ans_dict = {"response": ans_n, "context": context, "ground_truth": ground_truth, "correct": res,
                "original_model_answer": orig_ans}

    prompt_dict[prompt] = ans_dict

    return prompt, ans_n

@retry(Exception, tries=5, delay=3)
def get_bloomz_resp(lsd: dict, context: str, ground_truth: str, prompt_dict: dict, response=True, session=None):
    """Ask the BLOOMZ model to pick a label for ``context`` and record the result.

    Reads the module-level ``tokenizer`` and ``model``.

    Args:
        lsd: label-set dict; ``lsd['label_set']`` supplies the candidate labels.
        context: column sample inserted into the prompt.
        ground_truth: expected label, compared for equality with the answer.
        prompt_dict: results dict, updated in place and keyed by prompt text.
        response: when False the model is not queried and an empty answer is
            recorded.
        session: unused.

    Returns:
        The prompt string used as the key in ``prompt_dict``.
    """
    # Derive the candidate labels from lsd, as get_chatgpt_resp does, instead
    # of reading a `context_labels` name that this function never defined.
    context_labels = ", ".join(fix_labels(s, lsd) for s in lsd['label_set'])
    prompt = prompt_context_insert(context_labels, context, 2000, "bloomz")
    if prompt_dict.get(prompt, -1) != -1:
        return prompt
    if response:
        inputs = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda:0")
        # NOTE(review): `model` is not defined in this function; sibling code
        # uses `base_model` — confirm which global is intended.
        outputs = model.generate(inputs, max_new_tokens=5)
        # The answer is the last whitespace-separated token of the decoded
        # output, reduced to its alphanumeric characters.
        words = tokenizer.decode(outputs[0]).split()
        last_word = words[-1] if words else ""
        ans = ''.join(e for e in last_word if e.isalnum()).lower()
    else:
        # This branch used to assign `response` and then decode the unbound
        # `outputs`, raising NameError.
        ans = ""
    res = ans == ground_truth
    ans_dict = {"response": ans, "context": context, "ground_truth": ground_truth, "correct": res}
    prompt_dict[prompt] = ans_dict
    return prompt

In [27]:
def to_integer(val):
    """Convert ``val`` to the smallest fitting integer type when possible.

    Args:
        val: scalar, list-like or Series accepted by ``pd.to_numeric``.

    Returns:
        The numeric conversion of ``val`` (downcast to an integer dtype where
        the values allow it), or ``val`` unchanged when it cannot be parsed.
    """
    # errors='ignore' is deprecated in pandas; catching the exception is the
    # documented replacement and keeps the return-the-input fallback.
    try:
        return pd.to_numeric(val, downcast='integer')
    except (ValueError, TypeError):
        return val


def derive_meta_features(col):
    """Compute summary statistics for one column.

    When every value, rendered as a string, passes ``str.isnumeric`` the
    statistics describe the numeric values. Otherwise they describe the
    string lengths of the values, and the rolling mean is the placeholder
    ``[0.0]``.

    Args:
        col: a pandas Series.

    Returns:
        dict with keys ``std``, ``mean``, ``mode``, ``median``, ``max``,
        ``min`` and ``rolling-mean-window-4`` (forward-looking window of 4,
        at least 1 observation).
    """
    as_text = col.astype(str)

    if not as_text.apply(str.isnumeric).all():
        # Non-numeric column: describe the distribution of string lengths.
        lengths = as_text.str.len()
        return {
            "std": round(lengths.std(), 2),
            "mean": round(lengths.mean(), 2),
            "mode": lengths.mode().iloc[0].item(),
            "median": lengths.median(),
            "max": lengths.max(),
            "min": lengths.min(),
            "rolling-mean-window-4": [0.0],
        }

    numeric = col.dropna().astype(float)
    # Keep integer-valued data as ints so mode/max/min come back as integers.
    if numeric.apply(float.is_integer).all():
        numeric = numeric.astype(int)

    forward_window = pd.api.indexers.FixedForwardWindowIndexer(window_size=4)
    return {
        'std': round(numeric.std(), 2),
        'mean': round(numeric.mean(), 2),
        'mode': numeric.mode().iloc[0].item(),
        'median': numeric.median(),
        'max': numeric.max(),
        'min': numeric.min(),
        'rolling-mean-window-4': list(numeric.rolling(window=forward_window, min_periods=1).mean()),
    }


def fix_mode(d):
    """Collapse a Series-valued ``d['mode']`` to a plain scalar, in place.

    When ``d['mode']`` is a pandas Series it is replaced by the element at
    index label 0, converted with ``.item()``. Any other value is left alone.
    The same dict object is returned.
    """
    mode_value = d['mode']
    if isinstance(mode_value, pd.Series):
        d['mode'] = mode_value.loc[0].item()
    return d


def split_meta_features(d):
    """Flatten a meta-feature dict into a positional Series.

    The values appear in the fixed order std, mean, median, mode, max, min;
    a missing key contributes the string ``"N/A"``.
    """
    ordered_keys = ('std', 'mean', 'median', 'mode', 'max', 'min')
    return pd.Series([d.get(key, "N/A") for key in ordered_keys])


def _format_context_block(context, context_labels):
    """Join ``context`` into a bracketed string, stripped of ``[``, ``]`` and ``'``.

    The joined text is capped at ``MAX_LEN - 100 - len(context_labels)``
    characters (module-level ``MAX_LEN``) before the closing bracket is added.
    """
    flattened = ", ".join(context).replace("[", "").replace("]", "").replace("'", "")
    return "[" + flattened[:MAX_LEN - 100 - len(context_labels)] + "]"


def prompt_context_insert(context_labels: str, context: str, max_len: int = 2000, model: str = "gpt-3.5", options: list[str] = None):
    """Build the prompt text for ``model`` from the label list and column context.

    Args:
        context_labels: Candidate labels as a single ``", "``-separated string.
        context: Sampled column values. The ``-zs`` and ``llama`` branches join
            it with ``", "``, so they expect an iterable of strings.
        max_len: Prompts longer than this are cut to ``max_len - 3`` characters.
        model: Prompt template selector. Any name containing ``"-zs"`` uses the
            zero-shot template; the other supported names are ``bloomz``,
            ``gpt-3.5``, ``ada-personal``, ``llama-old``, ``llama`` and
            ``llama-retry``.
        options: Labels offered to the ``llama`` picker prompt. ``None`` is
            treated like an empty list.

    Returns:
        The prompt string.

    Raises:
        ValueError: If ``model`` matches none of the supported templates.
    """
    if model == "bloomz":
        s = f'SYSTEM: You are an AI research assistant. You use a tone that is technical and scientific. USER: Please select the field from {context_labels} which best describes the context below. Respond with the name of the field and nothing else. \n CONTEXT: {context}'
    elif model == "gpt-3.5":
        s = f'SYSTEM: Please select the field from {context_labels} which best describes the context. Respond only with the name of the field. \n CONTEXT: {context}'
    elif model == "ada-personal":
        s = f'{context}$'
    elif model == "llama-old":
        s = f'INSTRUCTION: Select the field from the category which matches the input. \n CATEGORIES: {context_labels} \n INPUT:{context} \n OUTPUT: '
    elif "-zs" in model:
        ct = _format_context_block(context, context_labels)
        lb = "\n".join(["- " + c for c in context_labels.split(", ")])
        if model == "opt-iml-max-30b-zs":
            s = f'Select the option which best describes the input. \n INPUT: {ct} .\n OPTIONS:\n {lb} \n'
        else:
            # Original prompt
            s = f'INSTRUCTION: Select the option which best describes the input. \n INPUT: {ct} .\n OPTIONS:\n {lb} \n ANSWER:'
    elif model == "llama":
        ct = _format_context_block(context, context_labels)
        # `options` defaults to None; joining None would raise a TypeError.
        options_str = " - ".join(options or [])
        print(f"Prompt context insert {options_str}")
        # Picker 1 prompt
        s = llm_prompts("picker", ct, options_str)
    elif model == "llama-retry":
        s = f'INSTRUCTION: Select the category which best matches the input. \n INPUT:{context} \n CATEGORY: '
    else:
        # Without this branch an unknown model failed later with an
        # unrelated "local variable referenced before assignment" error.
        raise ValueError(f"Unsupported model for prompt construction: {model!r}")

    # Truncate if prompt exceeds maximum length
    if len(s) > max_len:
        s = s[:max_len - 3]

    return s


def recompute_results(prompt_dict, prompt, model_str, cbc_pred, label_set):
    """Attach ``cbc_pred`` to a cached result and, for llama, re-score it.

    The record stored under ``prompt`` is updated in place. For
    ``model_str == "llama"`` a truthy ``cbc_pred`` found in the module-level
    ``catboost_cats`` replaces the stored response (after ``fix_labels``), and
    ``correct`` is recomputed as "response equals ground truth, or a non-empty
    response is contained in the ground truth".

    Raises:
        KeyError: If ``prompt`` has no entry in ``prompt_dict``.
    """
    if prompt not in prompt_dict:
        # The previous `.get(prompt, -1)` fallback crashed on the next line
        # with "'int' object does not support item assignment".
        raise KeyError(f"No cached result for prompt: {prompt!r}")
    dict_val = prompt_dict[prompt]
    dict_val['cbc_pred'] = cbc_pred
    if model_str == "llama":
        if cbc_pred and (cbc_pred in catboost_cats):
            print(f"using cbcpred label: {cbc_pred} \n")
            dict_val['response'] = fix_labels(cbc_pred, label_set)
        response = dict_val['response']
        ground_truth = dict_val['ground_truth']
        # bool() keeps the stored flag a real boolean: a bare `or`/`and`
        # chain can evaluate to "" or None for an empty response.
        dict_val['correct'] = bool((ground_truth == response) or (response and response in ground_truth))
    prompt_dict[prompt] = dict_val


def make_json(prompt, var_params):
    """Build the request payload ``{"data": [prompt, <generation settings>...]}``.

    Settings come from a deep copy of the module-level ``params`` with
    ``var_params`` (when truthy) applied on top, so ``params`` itself is never
    modified. The settings are emitted in a fixed positional order.
    """
    merged = deepcopy(params)
    if var_params:
        merged.update(var_params)
    positional_fields = (
        'max_new_tokens',
        'do_sample',
        'temperature',
        'top_p',
        'typical_p',
        'repetition_penalty',
        'encoder_repetition_penalty',
        'top_k',
        'min_length',
        'no_repeat_ngram_size',
        'num_beams',
        'penalty_alpha',
        'length_penalty',
        'early_stopping',
        'seed',
    )
    return {"data": [prompt, *(merged[field] for field in positional_fields)]}


def ans_contains_gt(ans_n, fixed_labels):
    """Return the first label in ``fixed_labels`` that occurs inside ``ans_n``.

    Labels are tried in list order; a hit is logged to stdout. Returns
    ``None`` when no label is contained in the answer.
    """
    hit = next((candidate for candidate in fixed_labels if candidate in ans_n), None)
    if hit is None:
        return None
    print(f"Fuzzy label {ans_n} contains gt label {hit}: MATCH \n")
    return hit


def gt_contains_ans(ans_n, fixed_labels):
    """Return the first label in ``fixed_labels`` that contains ``ans_n``.

    An empty answer never matches. Labels are tried in list order; a hit is
    logged to stdout. Returns ``None`` when nothing matches.
    """
    if ans_n == "":
        return None
    hit = next((candidate for candidate in fixed_labels if ans_n in candidate), None)
    if hit is None:
        return None
    print(f"GT label {hit} contains fuzzy label {ans_n}: MATCH \n")
    return hit


def basic_contains(ans_n, fixed_labels, method):
    """Resolve ``ans_n`` to a known label using exact, then containment matching.

    An exact member of ``fixed_labels`` is returned as is. Otherwise the
    strategies named in ``method`` are tried in a fixed order:
    ``"ans_contains_gt"`` first, then ``"gt_contains_ans"``. Returns ``None``
    when no strategy produces a truthy match.
    """
    #TODO: not sure the order should be fixed like this, could be made flexible
    if ans_n in fixed_labels:
        return ans_n
    matched = None
    if "ans_contains_gt" in method:
        matched = ans_contains_gt(ans_n, fixed_labels)
    if not matched and "gt_contains_ans" in method:
        matched = gt_contains_ans(ans_n, fixed_labels)
    return matched if matched else None


def fuzzy_label_match(orig_ans, fixed_labels, session, link, prompt, lsd, model,
                      method=["ans_contains_gt", "gt_contains_ans", "resample"]):
    """Map a raw model answer onto one of ``fixed_labels``.

    Steps, in order:
      1. Normalise the answer with ``fix_labels`` and try ``basic_contains``.
      2. If ``"similarity"`` is in ``method``: return the label whose sentence
         embedding is closest (cosine similarity) to the answer.
      3. If ``"resample"`` is in ``method``: query the model again for
         ``k = 2..5`` with settings that vary with ``k``, returning as soon as
         a resampled answer matches.
      4. Fall back to the label ``'text'``.

    Args:
        orig_ans: Raw answer from the model.
        fixed_labels: Allowed labels.
        session, link: Passed to ``call_llama_model`` when resampling.
        prompt: Prompt to resend when resampling.
        lsd: Label-set dict passed to ``fix_labels`` / ``call_llama_model``.
        model: Model name; selects the resampling backend.
        method: Names of the matching strategies to apply (never modified).

    Returns:
        The matched label, or ``'text'`` when nothing matched.
    """
    ans_n = fix_labels(orig_ans, lsd)
    # If the answer is already in the label set no fuzzy matching is needed.
    res = basic_contains(ans_n, fixed_labels, method)
    if res:
        return res
    if "similarity" in method:
        ans_embedding = sent_model.encode(ans_n)
        lbl_embeddings = sent_model.encode(fixed_labels)
        sims = {lbl: util.pytorch_cos_sim(ans_embedding, le) for lbl, le in zip(fixed_labels, lbl_embeddings)}
        return max(sims, key=sims.get)
    if "resample" in method:
        #fuzzy label matching strategy
        for k in range(2, 6):
            if "gpt" in model:
                ans_n = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0 + k / 10,
                ).choices[0]['message']['content'].lower()
            elif model in ["llama-zs", "opt-iml-max-30b-zs", "opt-iml-30b-zs"]:
                # "opt-iml-max-30b-zs" is the spelling used by the other
                # functions in this file; the older "opt-iml-30b-zs" is kept
                # so existing callers still reach this branch.
                _, _, resample_chain = set_pipeline(k=k)
                ans_n = resample_chain.run(prompt)
            elif model in ["topp-zs", "flan-ul2-zs"]:
                ans_n = get_topp_resp(prompt, k)
            else:
                top_p = params['top_p']
                temp = params['temperature']
                ans_n = call_llama_model(session, link, prompt, lsd,
                                         {'no_repeat_ngram_size': 1, 'top_p': top_p - (0.1 * k), 'temperature': 0.9})
                # Put the shared sampling settings back to their saved values.
                params['top_p'] = top_p
                params['temperature'] = temp
            res = basic_contains(ans_n, fixed_labels, method)
            if res:
                return res
    return 'text'


INTEGER_SET = set(r"0123456789,/\+-.^_()[] :")


def get_base_dtype(context):
    """Classify a list of string samples as ``"integer"``, ``"float"`` or ``"other"``.

    * ``"other"`` as soon as an item contains a character outside
      ``INTEGER_SET``.
    * ``"float"`` immediately when a trailing ``.0`` / ``,0`` / ``.00`` /
      ``,00`` is stripped and the remainder does not parse as an int.
    * ``"float"`` (after scanning the remaining items) when an item, reduced
      to letters, digits and dots, is not made of digits only.
    * ``"integer"`` otherwise, including for an empty ``context``.
    """
    dtype = "integer"
    for item in context:
        if not all(char in INTEGER_SET for char in item):
            return "other"
        try:
            if item.endswith(".0") or item.endswith(",0"):
                item = item[:-2]
                item = str(int(item))
            if item.endswith(".00") or item.endswith(",00"):
                item = item[:-3]
                item = str(int(item))
        except Exception:
            # Narrowed from a bare `except:` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return "float"
        temp_item = re.sub(r"[^a-zA-Z0-9.]", "", item)
        if not temp_item.isdigit():
            dtype = "float"
    return dtype


def query_correct_model(model, prompt, context_labels, context, session, link, lsd, generated_labels_list = None):
    """Send ``prompt`` to the backend that serves ``model`` and return its answer.

    ``llama-zs`` / ``opt-iml-max-30b-zs`` go through the module-level
    ``llm_chain``, ``topp-zs`` / ``flan-ul2-zs`` through ``get_topp_resp``, and
    every other model through ``call_llama_model``. For the first and last
    group a ``None`` answer triggers exactly one retry with the
    ``"llama-retry"`` prompt.
    """
    if model in ["llama-zs", "opt-iml-max-30b-zs"]:
        first_try = llm_chain.run(prompt)
        if first_try is not None:
            return first_try
        retry_prompt = prompt_context_insert(context_labels, context, MAX_LEN, "llama-retry")
        return llm_chain.run(retry_prompt)

    if model in ["topp-zs", "flan-ul2-zs"]:
        return get_topp_resp(prompt, 1)

    first_try = call_llama_model(session, link, prompt, lsd, None, generated_labels_list)
    if first_try is not None:
        return first_try
    retry_prompt = prompt_context_insert(context_labels, context, MAX_LEN, "llama-retry")
    return call_llama_model(session, link, retry_prompt, lsd, None, generated_labels_list)


def get_df_sample_col(col, rand_seed, len_context, min_variance=2, replace=False):
    """Return exactly ``len_context`` sample strings drawn from one column.

    Values are stringified, placeholder values ("None", "nan", "", ...) are
    dropped, each value is cut to 75 characters and duplicates are removed in
    order of first appearance. Fewer samples than ``len_context`` are repeated
    cyclically, more are truncated.

    Args:
        col: Column values (anything ``pd.Series`` accepts).
        rand_seed, min_variance, replace: Accepted for call compatibility;
            not used.
        len_context: Number of samples to return.

    Returns:
        list[str] of length ``len_context``; ``["None"] * len_context`` when
        the column has no usable value.
    """
    ignore_list = ["None", 'none', 'NaN', 'nan', 'N/A', 'na', '']
    # The previous code indexed the Series by its own values
    # (`df.astype(str)[col]`), which is a label lookup and raises KeyError.
    as_text = pd.Series(col).astype(str)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # sample no longer depends on set iteration order.
    sample_list = list(dict.fromkeys(p[:75] for p in as_text.unique() if p not in ignore_list))
    if len(sample_list) < 1:
        return ["None"] * len_context
    if len(sample_list) < len_context:
        sample_list = sample_list * len_context
    if len(sample_list) > len_context:
        sample_list = sample_list[:len_context]
    assert len(sample_list) == len_context, f"An index in val_indices is length {len(sample_list)}"
    return sample_list


def check_substr_contains_only_set(str, acceptable_chars):
    """Return True when every character of ``str`` is in ``acceptable_chars``.

    Also prints the accepted character collection before checking.
    """
    distinct_chars = set(str)
    print("Checking if it contains only ", acceptable_chars)
    return distinct_chars.issubset(acceptable_chars)


def insert_source(context, fname):
    """Prepend a ``"SRC: <source>"`` entry to ``context`` and return it.

    The source is the first segment of ``fname`` enclosed in underscores
    (``"tbl_source_1.csv"`` -> ``"source"``). When ``fname`` has no such
    segment the name itself is used. In both cases everything from the first
    ``.`` onwards is dropped.

    Note:
        ``context`` is modified in place; the same list object is returned.
    """
    pattern = r"_([^_]*)_"  # Matches substrings that start and end with "_"
    matcher = re.search(pattern, fname)
    # re.search returns None for names without an underscore-delimited part
    # (e.g. a bare column index); calling .group() on it raised AttributeError.
    source = matcher.group(1) if matcher is not None else fname
    addstr = source.split(".")[0]
    context.insert(0, "SRC: " + addstr)
    return context


def get_df_sample(df, rand_seed, val_indices, len_context, min_variance=1, replace=False, full=False, other_col=False,
                  max_len=8000):
    """Build a DataFrame holding ``len_context`` sample strings per column of ``df``.

    For every column the distinct stringified values are collected in order of
    first appearance, placeholder values ("None", "nan", "", ...) are dropped
    and each value is cut to ``max_len // (len_context * 3)`` characters. The
    list is then repeated or truncated to exactly ``len_context`` entries.

    Args:
        df: Source table.
        rand_seed, val_indices, min_variance, replace: Accepted for call
            compatibility; not used.
        len_context: Number of samples per column.
        full: When True, append one ``"<name>: <value>"`` string per
            meta-feature from ``derive_meta_features`` (rolling mean limited to
            its first five entries).
        other_col: When True, add ``"OC: <value>"`` entries taken from the
            leading rows of every column of ``df`` before the list is resized.
        max_len: Character budget used to derive the per-value cut-off.

    Returns:
        pd.DataFrame with one column per input column.
    """
    column_samples = {}
    ignore_list = ["None", 'none', 'NaN', 'nan', 'N/A', 'na', '']
    # Stringify the table once instead of once per column.
    df_text = df.astype(str)
    for col in df.columns:
        # dict.fromkeys de-duplicates in first-seen order, so the chosen
        # samples no longer depend on set iteration order.
        sample_list = list(dict.fromkeys(
            p[:max_len // (len_context * 3)] for p in df_text[col].unique() if p not in ignore_list))
        # Meta-features
        if full:
            meta_features = derive_meta_features(df[col])
            meta_features['rolling-mean-window-4'] = meta_features['rolling-mean-window-4'][:5]
        # Sampling from other columns
        if other_col:
            sample_list_fill_size = len_context - len(sample_list)
            nc = len(df.columns)
            per_column_context = max(1, sample_list_fill_size // nc)
            for oc in df.columns:
                items = df_text[oc].iloc[0:per_column_context].tolist()
                sample_list = sample_list + ["OC: " + str(item) for item in items]
        if not sample_list:
            sample_list = ["None"]
        if len(sample_list) < len_context:
            sample_list = sample_list * len_context
        if len(sample_list) > len_context:
            sample_list = sample_list[:len_context]
        assert len(sample_list) == len_context, "An index in val_indices is length " + str(len(sample_list))
        if full:
            if meta_features['std'] == "N/A":
                sample_list = sample_list + ["" for k, v in meta_features.items()]
            else:
                sample_list = sample_list + [str(k) + ": " + str(v) for k, v in meta_features.items()]
        column_samples[col] = sample_list
    return pd.DataFrame.from_dict(column_samples)


NUMERIC_AND_COMMA = set('0123456789,')

BOOLEAN_SET = ["True", "true", "False", "false", "yes", "Yes", "No", "no"]


def apply_basic_rules(context, lbl):
    """Override ``lbl`` with a rule-based label when the samples clearly match one.

    Rules run in a fixed order and a later match overrides an earlier one:
    unit suffixes (weight, calories), the keywords "review" / "recipe"
    (case-insensitive, required in every sample), the "openopen" repair of the
    incoming label, and finally the boolean-vocabulary check. An empty or
    non-list ``context`` leaves ``lbl`` untouched. Any exception is printed and
    the label as it stood at that point is returned.
    """
    if not context or not isinstance(context, list):
        return lbl

    suffix_rules = (
        (" g", "weight"),
        (" kg", "weight"),
        (" lb", "weight"),
        (" lbs", "weight"),
        (" pounds", "weight"),
        (" cal", "calories"),
        (" kcal", "calories"),
        (" calories", "calories"),
    )
    keyword_rules = ("review", "recipe")
    try:
        for suffix, suffix_label in suffix_rules:
            if all(s.endswith(suffix) for s in context):
                lbl = suffix_label
        for keyword in keyword_rules:
            if all(keyword in s.lower() for s in context):
                lbl = keyword
        if lbl and "openopen" in lbl:
            lbl = "openinghours"
        if all(s in BOOLEAN_SET for s in context):
            lbl = "medical_boolean"
        return lbl
    except Exception as e:
        print(f"Exception {e} in apply_basic_rules with context {context}")
        return lbl


def get_cbc_pred(orig_label, numeric_labels):
    """Look up the stored classifier prediction for ``orig_label`` in the current file.

    Reads the module-level frame ``dft`` (test-set predictions) and the
    module-level ``f`` (current file path). The prediction is only used when
    exactly one row matches both the file and the label.

    Args:
        orig_label: Ground-truth label of the column.
        numeric_labels: Sequence mapping a numeric prediction to its label.

    Returns:
        The predicted label, or ``None`` when there is no unique match or the
        lookup fails (the error is printed).
    """
    try:
        #FOR VALIDATION
        #cbc_filematch = dfv[dfv['df_path'] == str(f)]
        #FOR TEST SET
        cbc_filematch = dft[dft['df_path'] == str(f)]
        cbc_labelmatch = cbc_filematch[cbc_filematch['label'] == orig_label]
        if len(cbc_labelmatch) == 1:
            cbc_pred = numeric_labels[cbc_labelmatch['preds'].item()]
        else:
            cbc_pred = None
    except Exception as e:
        print("cbc excpetion: ")
        print(e)
        cbc_pred = None
    # The computed value was previously dropped: the function had no return.
    return cbc_pred


def run_val(model: str, save_path: str, inputs: list, label_set: list, input_df: pd.DataFrame, resume: bool = True,
            results: bool = True, stop_early: int = -1, rand_seed: int = 13, sample_size: int = 5, link: str = None,
            response: bool = True, summ_stats: bool = False, table_src: bool = False, other_col: bool = False,
            skip_short: bool = False, min_var: int = 0, method: list = ["similarity"]):
    """Run column-type prediction over a collection of tables and cache the results.

    For every input table the labelled columns are sampled with
    ``get_df_sample``, sent to the backend selected by ``model``, and the
    answers are collected in a prompt-keyed dict that is written to
    ``save_path`` every 100 inputs and once more at the end.

    Args:
        model: Backend selector (substring match: "sherlock"/"doduo",
            "gpt-3.5", "ada-personal", "bloomz", "llama" or "-zs").
        save_path: JSON cache file; reloaded first when ``resume`` is True.
        inputs: Table paths. For a label set whose name contains "d4" this
            must be a dict: each key with its last ``_`` segment removed is
            the ground-truth label and each value is used as the table.
        label_set: Label-set dict (``label_set['name']`` is read here).
        input_df: Ground truth with ``table_name``, ``column_index`` and
            ``label`` columns (non-d4 runs only).
        resume: Reuse an existing cache file.
        results: Call ``results_checker`` on the cache when done.
        stop_early: Stop once this many inputs were processed (-1 = no limit).
        rand_seed, sample_size, summ_stats, other_col: Forwarded to
            ``get_df_sample``.
        link, response, method: Forwarded to the response helpers.
        table_src: Prepend the table name to the context via ``insert_source``.
        skip_short, min_var: Accepted for call compatibility; not used.

    Raises:
        ValueError: For a d4 label set without dict ``inputs``, or for a
            ``model`` that matches no backend.
    """
    isd4 = "d4" in label_set['name']
    labels = None
    if isd4 and isinstance(inputs, dict):
        # Unpack the dict BEFORE any list conversion. Previously `inputs` had
        # already become a list of Paths when this check ran, so `labels` was
        # never defined and the dict values were never used as tables.
        labels = ["_".join(k.split("_")[:-1]) for k in inputs.keys()]
        inputs = list(inputs.values())
    else:
        inputs = [Path(f) for f in inputs]
    if isd4 and labels is None:
        raise ValueError("d4 label sets require `inputs` to be a dict of {label_key: table}")

    infmods = "sherlock" in model or "doduo" in model
    if resume and os.path.isfile(save_path):
        with open(save_path, 'r', encoding='utf-8') as cache_f:
            prompt_dict = json.load(cache_f)
    else:
        prompt_dict = {}
    s = requests.Session()
    if "-zs" in model:
        base_model.eval()
    for idx, f in tqdm(enumerate(inputs), total=len(inputs)):
        # Periodic checkpoint of the cache.
        if idx % 100 == 0:
            with open(save_path, 'w', encoding='utf-8') as alt_f:
                json.dump(prompt_dict, alt_f, ensure_ascii=False, indent=4)
        if stop_early > -1 and idx == stop_early:
            break
        if isd4:
            f_df = f
            label_indices = [2]
            gt_labels = labels[idx]
        else:
            gt_labels = input_df[input_df['table_name'] == f.name]
            label_indices = gt_labels['column_index'].unique().tolist()

            if f.suffix.lower() == '.csv':
                f_df = pd.read_csv(f)
            else:
                f_df = pd.read_json(f, compression='infer', lines=True)

        if infmods:
            label_indices = ["values"]
            key = get_sherlock_resp(f_df, gt_labels, prompt_dict, model, label_indices, str(f), label_set)
            continue
        sample_df = get_df_sample(f_df, rand_seed, label_indices, sample_size, full=summ_stats, other_col=other_col,
                                  max_len=MAX_LEN)
        f_df_cols = f_df.columns
        # `col_pos` used to be called `idx`, shadowing the outer loop counter.
        for col_pos, col in enumerate(f_df_cols):
            if col_pos not in label_indices:
                continue
            #NOTE: skipping evaluation for columns with insufficient variance in the column
            #       if len(pd.unique(sample_df.astype(str)[col])) < min_var:
            #         continue
            if isd4:
                orig_label = gt_labels
            else:
                gt_row = gt_labels[gt_labels['column_index'] == col_pos]
                orig_label = gt_row['label'].item()
            label = fix_labels(orig_label, label_set)
            limited_context = sample_df[col].tolist()[:sample_size]
            #NOTE: could consider using min_var here
            if table_src:
                context = insert_source(sample_df[col].tolist(), f.name)
            else:
                context = sample_df[col].tolist()
            if "gpt-3.5" in model:
                key = get_chatgpt_resp(label_set, context, label, prompt_dict, response=response, session=s,
                                       method=method)
            elif "ada-personal" in model:
                key = get_ada_resp(label_set, context, label, prompt_dict, response=response, session=s)
            elif "bloomz" in model:
                key = get_bloomz_resp(label_set, context, label, prompt_dict, response=response, session=s)
            elif "llama" in model or "-zs" in model:
                #cbc_pred = get_cbc_pred(orig_label, numeric_labels)
                cbc_pred = None
                key = get_llama_resp(label_set, context, label, prompt_dict, link=link, response=response, session=s,
                                     cbc=cbc_pred, model=model, limited_context=limited_context, method=method)
            else:
                # Without this branch `key` was unbound on the first column,
                # or stale from an earlier one.
                raise ValueError(f"Unsupported model: {model!r}")
            prompt_dict[key]['original_label'] = orig_label
            prompt_dict[key]['file+idx'] = str(f) + "_" + str(col_pos)
    with open(save_path, 'w', encoding='utf-8') as my_f:
        json.dump(prompt_dict, my_f, ensure_ascii=False, indent=4)
    if results:
        results_checker(save_path, skip_duplicates=False)

In [28]:
from sklearn.metrics import classification_report

In [29]:
import json
from statistics import mean

ENDINGS = ["ANSWER:", "CATEGORY:"]


def results_checker_doduo(file_name, skip_duplicates=True):
    """Print accuracy and per-class F1 summaries for a multi-response results file.

    Each record in the JSON file is expected to carry ``response`` (a
    collection of predicted labels, or a single label string),
    ``ground_truth`` and ``correct``. A correct record counts as a true
    positive for its ground truth; an incorrect one counts as a false negative
    for the ground truth and a false positive for every predicted label.

    Args:
        file_name: Path of the JSON results file (read as UTF-8).
        skip_duplicates: Accepted for signature parity with
            ``results_checker``; not used.
    """
    # The result files are written as UTF-8 with ensure_ascii=False, so they
    # must be read back with the same encoding.
    with open(file_name, "r", encoding="utf-8") as f:
        d = json.load(f)
    n = len(d)
    if n == 0:
        # Nothing to score; every ratio below would divide by zero.
        print("Total entries: 0")
        return
    correct = 0
    per_class_results = dict()
    for k, v in d.items():
        response = v["response"]
        # A bare string is one label; set("abc") would turn it into the
        # pseudo-classes "a", "b", "c".
        response_set = {response} if isinstance(response, str) else set(response)
        for r in response_set:
            per_class_results.setdefault(r, {"TP": 0, "FP": 0, "FN": 0, "Total": 0})
        per_class_results.setdefault(v["ground_truth"], {"TP": 0, "FP": 0, "FN": 0, "Total": 0})
        if v['correct'] == True:
            correct += 1
            per_class_results[v["ground_truth"]]["TP"] += 1
        else:
            per_class_results[v["ground_truth"]]["FN"] += 1
            for r in response_set:
                per_class_results[r]["FP"] += 1
        per_class_results[v["ground_truth"]]["Total"] += 1

    for k, v in per_class_results.items():
        denominator = 2 * v["TP"] + v["FP"] + v["FN"]
        # A class that only appears as an extra response of a correct record
        # has no TP/FP/FN; score it 0 instead of raising ZeroDivisionError.
        v['F1'] = (2 * v["TP"]) / denominator if denominator else 0.0

    weighted_f1 = sum([v["F1"] * v["Total"] for k, v in per_class_results.items()]) / n
    unweighted_f1 = mean([v["F1"] for k, v in per_class_results.items()])

    print(
        f"Total entries: {n} \n Accuracy: {round(correct / n, 4)} \n Weighted F1: {round(weighted_f1, 4)} \n Unweighted F1: {round(unweighted_f1, 4)}")


def results_checker(file_name, skip_duplicates=True):
    """Score a results JSON file and append a one-row summary to the metrics CSV.

    Prints the entry count, the accuracy and a scikit-learn classification
    report, then flattens the report into one row and appends it to
    ``<archetype_directory>/all_metrics.csv`` (created on first use).

    Args:
        file_name: Path of the JSON results file (read as UTF-8). Each record
            must contain ``ground_truth`` and ``response``.
        skip_duplicates: When True, records whose key contains
            ``"CATEGORY: *"`` are ignored.

    Returns:
        pd.DataFrame: The one-row summary; an empty frame when the file holds
        no records to score (nothing is written in that case).
    """
    # The result files are written as UTF-8 with ensure_ascii=False, so they
    # must be read back with the same encoding.
    with open(file_name, "r", encoding="utf-8") as f:
        d = json.load(f)

    if skip_duplicates:
        d = {k: v for k, v in d.items() if "CATEGORY: *" not in str(k)}

    # build the lists
    y_true = [v["ground_truth"] for v in d.values()]
    y_pred = [v["response"] for v in d.values()]

    # overall stats
    n = len(y_true)
    print(f"Total entries: {n}")
    if n == 0:
        # The accuracy below would divide by zero.
        return pd.DataFrame()
    correct = sum(1 for gt, pred in zip(y_true, y_pred) if gt == pred)
    print(f"Accuracy:     {correct / n:.4f}\n")

    # per-class report
    print(classification_report(
        y_true,
        y_pred,
        digits=4,  # 4 decimal places
        zero_division=0  # to avoid warnings if a class is never predicted
    ))

    # same report as a dict, flattened below into a single row
    raw_report = classification_report(
        y_true, y_pred,
        output_dict=True,
        zero_division=0
    )

    flat = {}
    # raw_report has keys for each label, plus 'macro avg', 'weighted avg', and 'accuracy'
    for label, m in raw_report.items():
        if label == "accuracy":
            flat["accuracy"] = m
        else:
            for metric_name, val in m.items():
                flat[f"{label}_{metric_name}"] = val

    # add summary fields
    flat["total_entries"] = n
    # run identifier: the results file name without its ".json" suffix
    flat["run_name"]      = os.path.basename(file_name).replace(".json","")

    # convert to one-row DataFrame
    df = pd.DataFrame([flat])

    metrics_csv = f"{archetype_directory}/all_metrics.csv"

    # append (or create) the master CSV
    # NOTE(review): rows are appended without a header, so runs whose label
    # sets differ produce rows that do not line up with the existing header.
    if not os.path.isfile(metrics_csv):
        df.to_csv(metrics_csv, index=False, float_format="%.4f")
    else:
        df.to_csv(metrics_csv, mode="a", header=False, index=False, float_format="%.4f")

    return df


In [30]:
def missing_entries(f1, f2):
    """Return the ``file+idx`` identifiers present in ``f1`` but absent from ``f2``.

    Both arguments are paths to JSON result files whose records each carry a
    ``"file+idx"`` field.

    Returns:
        set: Identifiers found only in the first file.
    """
    # The result files are written as UTF-8 with ensure_ascii=False, so they
    # must be read back with the same encoding.
    with open(f1, "r", encoding="utf-8") as file1:
        d1 = json.load(file1)
    with open(f2, "r", encoding="utf-8") as file2:
        d2 = json.load(file2)
    paths1 = {v["file+idx"] for v in d1.values()}
    paths2 = {v["file+idx"] for v in d2.values()}
    return paths1 - paths2

In [31]:
def run_val_parquet(
        model: str,
        save_path: str,
        labels_path: str,
        data_path: str,
        label_set: dict,
        resume: bool = True,
        results: bool = True,
        stop_early: int = -1,
        rand_seed: int = 13,
        sample_size: int = 5,
        link: str = None,
        response: bool = True,
        summ_stats: bool = False,
        table_src: bool = False,
        other_col: bool = False,
        skip_short: bool = False,
        min_var: int = 0,
        method: list = ["similarity"],
        results_checker=None,
        MAX_LEN: int = 1000
):
    """
    Validation loop adapted for parquet-based inputs:

    - labels_path: path to a parquet file with columns ['__index_level_0__', 'type']
    - data_path:   path to a parquet file with columns ['__index_level_0__', 'values']
    - label_set:   dict containing 'name', 'label_set', 'dict_map', 'abbrev_map'

    Each row in the merged DataFrame represents one column to predict:
      - __index_level_0__ (column index)
      - type (ground truth label)
      - values (comma-separated or list of column values)

    Rows labelled "__none__" are skipped. For every remaining row the first
    ``sample_size`` distinct values form the context sent to the backend
    selected by ``model``. Results are cached in a prompt-keyed dict that is
    written to ``save_path`` every 100 rows and at the end; every prompt and
    answer is also appended to
    ``<run_all_directory>/temp-results/llm-outputs.txt``, which is recreated on
    each call.

    ``results``, ``rand_seed``, ``summ_stats``, ``other_col``, ``skip_short``,
    ``min_var``, ``results_checker`` and ``MAX_LEN`` are accepted for call
    compatibility but are not used by this function.

    Returns:
        dict: The prompt-keyed result cache.
    """

    # Start every run with a fresh LLM output log.
    llm_response_output_path = os.path.join(run_all_directory, "temp-results", "llm-outputs.txt")
    os.makedirs(os.path.dirname(llm_response_output_path), exist_ok=True)
    if os.path.isfile(llm_response_output_path):
        os.remove(llm_response_output_path)

    # Shared with get_llama_resp across all rows of this run.
    generated_labels_list = []

    # Load or initialize cache
    if resume and os.path.isfile(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            prompt_dict = json.load(f)
    else:
        prompt_dict = {}

    # Read parquet inputs and bring index into a column
    labels_df = pd.read_parquet(labels_path).reset_index()
    data_df = pd.read_parquet(data_path).reset_index()

    # Identify the index column name (either __index_level_0__ or generic index)
    labels_idx_col = '__index_level_0__' if '__index_level_0__' in labels_df.columns else 'index'
    data_idx_col = '__index_level_0__' if '__index_level_0__' in data_df.columns else 'index'

    # Rename for clarity: index → col_idx, type → label, values stays values
    labels_df = labels_df.rename(columns={labels_idx_col: 'col_idx', 'type': 'label'})
    data_df = data_df.rename(columns={data_idx_col: 'col_idx'})

    # Filter out __none__ labels
    labels_df = labels_df[labels_df['label'] != "__none__"]

    # Merge on column index
    merged = pd.merge(labels_df, data_df, on='col_idx', how='inner')

    # Prepare session and model
    session = requests.Session()
    if "-zs" in model:
        base_model.eval()

    # Iterate over each column instance
    for idx, row in tqdm(enumerate(merged.itertuples(index=False)), total=len(merged)):

        # Periodic cache save
        if idx % 100 == 0:
            with open(save_path, 'w', encoding='utf-8') as f:
                json.dump(prompt_dict, f, ensure_ascii=False, indent=4)

        if stop_early > -1 and idx == stop_early:
            break

        col_idx = row.col_idx
        orig_label = row.label
        raw_vals = row.values

        # Parse raw values into a list
        if isinstance(raw_vals, str):
            vals = raw_vals.split(',')
        else:
            vals = list(raw_vals)

        # Deduplicate (keeping first-seen order) and sample. dict.fromkeys
        # replaces pd.unique(list), which is deprecated for plain lists.
        vals = [str(x) for x in vals]
        context_list = list(dict.fromkeys(vals))[:sample_size]

        # Build context
        if table_src:
            context = insert_source(context_list, str(col_idx))
        else:
            context = context_list

        # Model call. The gpt/ada/bloomz helpers return only the prompt key,
        # so the stored response is read back from the cache; previously
        # `raw_prompt` and `answer` were unbound on these branches.
        if "gpt-3.5" in model:
            raw_prompt = get_chatgpt_resp(label_set, context, orig_label,
                                          prompt_dict, response=response,
                                          session=session, method=method)
            answer = prompt_dict[raw_prompt].get('response')
        elif "ada-personal" in model:
            raw_prompt = get_ada_resp(label_set, context, orig_label,
                                      prompt_dict, response=response,
                                      session=session)
            answer = prompt_dict[raw_prompt].get('response')
        elif "bloomz" in model:
            raw_prompt = get_bloomz_resp(label_set, context, orig_label,
                                         prompt_dict, response=response,
                                         session=session)
            answer = prompt_dict[raw_prompt].get('response')
        else:
            raw_prompt, answer = get_llama_resp(label_set, context, orig_label,
                                 prompt_dict, link=link,
                                 response=response,
                                 session=session,
                                 cbc=None,
                                 model=model,
                                 limited_context=context_list,
                                 method=method,
                                 generated_labels_list = generated_labels_list)

        # Record metadata
        prompt_dict[raw_prompt]['original_label'] = orig_label
        prompt_dict[raw_prompt]['file+idx'] = str(col_idx)

        # Echo the prompt and the answer to the notebook and to the log file.
        print("PROMPT SENT:\n", raw_prompt)
        print("MODEL ANSWER:", answer)

        with open(llm_response_output_path, "a", encoding='utf-8') as f:
            f.write(f"---\n")
            f.write(f"PROMPT SENT: {raw_prompt}\n")
            f.write(f"MODEL ANSWER: {answer}\n")

    # Final cache save
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(prompt_dict, f, ensure_ascii=False, indent=4)

    # Optional result summary
    #if results and results_checker is not None:
    #    results_checker(save_path, skip_duplicates=False)

    return prompt_dict

In [32]:
# Label-set description passed to the validation loop.
label_set = {
    "name": "custom_csv",                    # any string that does NOT contain "d4"
    "label_set": LABELS,                     # the list of your labels, used by similarity
    "dict_map": dict(zip(LABELS, LABELS)),   # identity mapping: every label maps to itself
    "abbrev_map": {},                        # or your real abbrev map
}

In [33]:
# Pick the backend; the log file name below is derived from it.
#model_name = "alpaca-fine-tuned"
#model_name = "flan-ul2"
#model_name = "alpaca-13b"

model_name="llama"

filename = f"custom-data-{model_name}-label-generation.json"

sp = f"{archetype_directory}/custom_data_logs/{filename}"


# Make sure the log directory exists before the run writes to it.
dirpath = os.path.dirname(sp)
os.makedirs(dirpath, exist_ok=True)

#model_name = "flan-t5-xxl"

#base_model, tokenizer, template, pt, MAX_LEN = init_model(model_name)

# Test set
#labels_path = "/home/omadbek/projects/Sherlock/custom_data/label_generation/test_labels_generation.parquet"
#data_path = "/home/omadbek/projects/Sherlock/custom_data/label_generation/test_data_generation.parquet"


# LLAMA
# Pass `model_name` rather than a second hard-coded "llama" so the model that
# runs always matches the one named in the log file.
run_val_parquet(
    model=model_name,
    save_path=sp,
    labels_path=labels_path,
    data_path=data_path,
    label_set=label_set,
    method=["similarity"],
    resume=False,
    sample_size=5,
    #stop_early = 30,
    link = "http://localhost:11434/api/generate"
)

  0%|          | 0/55 [00:00<?, ?it/s]

  unique_vals = pd.unique(vals)


Prompt context insert 
LLama model was called
_FuzzyLabelMatcher list:  []
LLM Picker 1 answer: patient_id... Should be none in the beginning
List of label so far: ['patient_id']
PROMPT SENT:
 
                    SYSTEM: You are an epidemiology data steward labeling anonymized columns. You will work with personal information and need to label columns accordingly.

                    INSTRUCTIONS:
                    • Choose exactly ONE label from the OPTIONS list that best describes the INPUT. Note that INPUT should be relative to the OPTIONS that you have. Do not try to match everything with only one label.
                    • If none OPTIONS apply, you can generate new label that will match the INPUT. The generated label should be in snake_case format.
                    • **Respond with EXACTLY the label token, no additional words, punctuation, or explanation.**

                    INPUT: [0C101DE5RTQ2, E15R0T0CQ52D, RC18E0T03Q2D, 03QRTED050C0, 02RE0QT1C4D1]
                 

{'\n                    SYSTEM: You are an epidemiology data steward labeling anonymized columns. You will work with personal information and need to label columns accordingly.\n\n                    INSTRUCTIONS:\n                    • Choose exactly ONE label from the OPTIONS list that best describes the INPUT. Note that INPUT should be relative to the OPTIONS that you have. Do not try to match everything with only one label.\n                    • If none OPTIONS apply, you can generate new label that will match the INPUT. The generated label should be in snake_case format.\n                    • **Respond with EXACTLY the label token, no additional words, punctuation, or explanation.**\n\n                    INPUT: [0C101DE5RTQ2, E15R0T0CQ52D, RC18E0T03Q2D, 03QRTED050C0, 02RE0QT1C4D1]\n                    OPTIONS: \n                    ANSWER:\n                ': {'response': 'patient_id',
  'context': ['0C101DE5RTQ2',
   'E15R0T0CQ52D',
   'RC18E0T03Q2D',
   '03QRTED050C0',
   '02

In [22]:
# Report whether the results file of the run is on disk.
print(f"✅ File exists: {sp}" if os.path.exists(sp) else f"❌ File not found: {sp}")

✅ File exists: /home/omadbek/projects/ArcheType/custom_data_logs/custom-data-llama-label-generation.json


In [23]:
# Fuzzy matcher

# --- Evaluation function ---
def evaluate_and_remap(
    file_name: str,
    embed_threshold: float = 0.25, # cos_threshold (semantic)
    fuzz_threshold: int = 25 # fuzz_threshold (lexical)
):
    """
    Score the predictions stored in a results JSON after snapping each one
    to the closest known label.

    The file must contain a dict whose values are records with at least the
    keys 'response' and 'ground_truth'. A ``_FuzzyLabelMatcher`` is built over
    the distinct ground-truth labels found in that same file; each response is
    passed to ``matcher.resolve``. When the matcher returns a label it replaces
    the response for scoring, otherwise the raw response is compared as-is.

    While scoring, each in-memory record gains two fields:
      - 'resolved_response': the matched label or the original response
      - 'correct_fuzzy': bool, whether resolved_response == ground_truth
    These annotations live only on the local copy of the data; they are not
    written back to ``file_name`` and are not returned.

    Args:
        file_name: Path to the results JSON.
        embed_threshold: Passed to ``_FuzzyLabelMatcher`` (semantic / cosine threshold).
        fuzz_threshold: Passed to ``_FuzzyLabelMatcher`` (lexical threshold).

    Returns:
        A ``(total_correct, acc)`` tuple where ``total_correct`` is the string
        "<correct>/<total>" and ``acc`` is the fraction of correct records as
        a float (0.0 when the file contains no records).
    """
    # 1) Load JSON (explicit UTF-8 so the result does not depend on the locale)
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 2) Build matcher over the ground-truth labels present in this file
    label_set = sorted({v["ground_truth"] for v in data.values()})
    matcher = _FuzzyLabelMatcher(label_set, embed_threshold, fuzz_threshold)

    # 3) Remap each response & compute fuzzy correctness
    total = len(data)
    fuzzy_correct = 0
    for record in data.values():
        orig = record["response"]
        match = matcher.resolve(orig)
        # Fall back to the raw response when the matcher finds nothing close enough.
        resolved = match if match is not None else orig
        is_correct = (resolved == record["ground_truth"])
        record["resolved_response"] = resolved
        record["correct_fuzzy"] = is_correct
        if is_correct:
            fuzzy_correct += 1

    # 4) Print fuzzy accuracy (guard against an empty file)
    acc = fuzzy_correct / total if total else 0.0
    total_correct = f"{fuzzy_correct}/{total}"
    print(f"Fuzzy‐matched correct: {total_correct}  →  Accuracy: {acc:.4f}")

    return total_correct, acc

In [24]:
# Evaluate the saved results with the default thresholds; keeps the
# "correct/total" string and the accuracy float for the report written later.
total_correct, fuzzy_acc = evaluate_and_remap(file_name=sp)

Fuzzy‐matched correct: 13/55  →  Accuracy: 0.2364


In [25]:
import json, re, pandas as pd
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util

# Load the saved predictions. The context manager closes the file handle
# (a bare json.load(open(sp)) leaves it open until garbage collection), and
# the explicit encoding keeps the read independent of the platform locale.
with open(sp, "r", encoding="utf-8") as f:
    data = json.load(f)

# Sentence-embedding model used for the semantic (cosine) score; runs on CPU.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')

# Score every record's raw response against its ground-truth label.
rows = []
for rec in data.values():
    gt   = rec["ground_truth"]
    resp = rec["response"]
    # Lexical similarity via rapidfuzz's token_sort_ratio
    fuzz_score = fuzz.token_sort_ratio(resp, gt)
    # Semantic similarity: cosine between the normalized embeddings
    emb_r = model.encode([resp], normalize_embeddings=True)
    emb_g = model.encode([gt],   normalize_embeddings=True)
    cos_score = util.cos_sim(emb_r, emb_g)[0][0].item()
    rows.append({
        "ground_truth": gt,
        "response":     resp,
        "fuzz_score":   fuzz_score,
        "cos_score":    cos_score
    })

# One row per record; `df` is also what the report-writing cell dumps to disk.
df = pd.DataFrame(rows)
print(df.describe())      # summary of scores
print(df.head(20))        # inspect first 20

       fuzz_score  cos_score
count   55.000000  55.000000
mean    38.357247   0.412520
std     20.771964   0.212807
min     22.222222   0.118107
25%     29.340029   0.343116
50%     30.769231   0.348177
75%     38.095238   0.413760
max    100.000000   1.000000
       ground_truth                    response  fuzz_score  cos_score
0                id                 patient_ids   30.769231   0.413760
1   medical_boolean         patient_response_yn   29.411765   0.343116
2           outcome  patient_recovery_status_yn   24.242424   0.260715
3   medical_boolean             medical_boolean  100.000000   1.000000
4              date       date_of_symptom_onset   32.000000   0.412648
5              date       date_of_symptom_onset   32.000000   0.412648
6              date       date_of_symptom_onset   32.000000   0.412648
7              date       date_of_symptom_onset   32.000000   0.412648
8   medical_boolean  patient_recovery_status_yn   29.268293   0.309479
9              date       dat

In [27]:
# Write the fuzzy-matching results to a text file

In [29]:
# Destination of the text report, under the run_all working directory.
out_path = f"{run_all_directory}/temp-results/fuzzy_result.txt"

# Create the parent directory ("<run_all_directory>/temp-results") if it is missing.
out_dir = os.path.dirname(out_path)
os.makedirs(out_dir, exist_ok=True)

# Append mode: each run adds a new section instead of overwriting earlier ones.
# The report contains non-ASCII characters ("‐", "→"), so the encoding is set
# explicitly rather than left to the platform's locale default.
with open(out_path, "a", encoding="utf-8") as f:
    f.write("==== ArcheType Output. LLM label generation results ====\n")
    f.write(f"Fuzzy‐matched correct semantic types: {total_correct}"
            f"  →  Accuracy: {fuzzy_acc:.4f}\n")
    f.write("Similarity Scores: \n")
    f.write(df.to_string(index=True))
    f.write("\n")

print(f"✅ Wrote fuzzy results to {out_path}")

✅ Wrote fuzzy results to ./temprorary/fuzzy_result.txt


# Remapping and F1-score

In [None]:
"""
import json
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

data_dir = "../custom_data_logs"

# 1) Load results
with open(sp, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2) Flatten to DataFrame
records = []
for entry in data.values():
    records.append({
        'file_idx': entry['file+idx'],
        'ground_truth': entry['ground_truth'],
        'predicted': entry['response']
    })
df = pd.DataFrame(records)

print("Unique values: ", sorted(df['predicted'].unique()))
"""

In [None]:
"""
summary = (
    df.groupby('predicted')['ground_truth']
      .agg(['nunique', lambda x: list(x.unique())])
      .rename(columns={'nunique':'# distinct', '<lambda_0>':'values'})
)
print(summary)
"""

In [None]:
"""
# build a dict of {predicted_label: list_of_unique_ground_truths}
unique_vals = {
    cat: df.loc[df['predicted'] == cat, 'ground_truth'].unique().tolist()
    for cat in df['predicted'].unique()
}

# print them out
for cat, vals in unique_vals.items():
    print(f"{cat} ({len(vals)} distinct values):")
    print(vals, "\n")
"""

In [None]:
"""
import json
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# 3) Define mapping dict
label_mapping = {
    # identifiers
    'patient_id':                'id',
    'patient_ids':               'id',

    # yes/no, true/false
    'response_status':          'medical_boolean',
    'medical_boolean':          'medical_boolean',

    # recovery/outcome
    'patient_recovery_status':   'outcome',

    # dates
    'date_range':               'date',

    # gender
    'gender_type':              'gender',

    # locations
    'country_list':             'location',

    # free‐text details
    'diagnosis_details':        'symptoms',
    'role_types':               'occupation',
}

# 4) Apply mapping
df['mapped_predicted'] = df['predicted'].map(label_mapping).fillna(df['predicted'])

# 5) Display DataFrame
#print(df[['file_idx', 'ground_truth', 'predicted', 'mapped_predicted']])

# 6) Classification report
print("Classification Report:")
print(classification_report(df['ground_truth'], df['mapped_predicted'], zero_division=0))
"""