In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face checkpoint identifier shared by the tokenizer and the model.
MODEL_NAME = "google/flan-t5-base"
# Directory passed as cache_dir to both from_pretrained calls below.
# NOTE(review): absolute, machine-specific path — the notebook only runs as-is
# on a machine with this exact directory layout.
MODELS_DIR = "/home/nub/Bachelor/bachelor-thesis/models"

# use_fast=True requests the fast tokenizer; tokenize() and
# encode_with_weights() later ask for return_offsets_mapping=True, which the
# Transformers docs describe as a fast-tokenizer-only feature.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=MODELS_DIR,
    use_fast=True
)

# NOTE(review): this call passes local_files_only=True while the tokenizer
# call above does not, so the two loads are not configured consistently —
# presumably the weights are expected to already sit in MODELS_DIR; confirm.
# torch_dtype="auto" and device_map="auto" delegate dtype and device placement
# to the library.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    cache_dir=MODELS_DIR,
    torch_dtype="auto",
    local_files_only=True,
    low_cpu_mem_usage=True,
    device_map="auto"
)

def tokenize(prompt):
    """Tokenize ``prompt`` with the notebook-level ``tokenizer``.

    The call enables truncation at ``tokenizer.model_max_length``, so longer
    prompts are cut off without an error. PyTorch tensors are requested and
    per-token character offsets are included in the result.

    Args:
        prompt: Text handed to the tokenizer as its first positional argument.

    Returns:
        Whatever the tokenizer call returns for the options above.
    """
    tokenizer_options = {
        "truncation": True,
        # Cap at the tokenizer's own limit rather than a hard-coded number.
        "max_length": tokenizer.model_max_length,
        "return_tensors": "pt",
        "return_offsets_mapping": True,
    }
    return tokenizer(prompt, **tokenizer_options)

In [10]:
import re

def find_docid_component_spans(text, company, year, keyword_pairs):
    """Locate a structured docid inside ``text`` and report its component spans.

    The docid is searched for verbatim as ``company-year-k1-k2[-k1-k2...]``
    (``company-year`` alone when ``keyword_pairs`` is empty). Only the first
    occurrence in ``text`` is reported.

    Component spans are derived from the known layout of the docid instead of
    re-searching each component, so a keyword pair whose text also appears
    inside an earlier component (e.g. ``change-rates`` inside
    ``exchange-rates``) still maps to its own position.

    Args:
        text: String to search in.
        company: First docid component (string).
        year: Second docid component (string).
        keyword_pairs: Iterable of ``(k1, k2)`` string pairs; each pair is
            reported as one ``"k1-k2"`` keyword span.

    Returns:
        dict with the keys ``"company"``, ``"year"``, ``"keyword"`` and
        ``"structure"``, each a list of half-open ``(start, end)`` character
        offsets into ``text``; ``"structure"`` covers the whole docid. When
        the docid does not occur in ``text`` all four lists are empty and the
        extra key ``"missing"`` is set to ``True``.
    """
    spans = {
        "company": [],
        "year": [],
        "keyword": [],
        "structure": [],
    }

    # Joining every component in one go avoids a dangling "-" separator when
    # there are no keyword pairs.
    components = [company, year]
    components.extend(k1 + "-" + k2 for k1, k2 in keyword_pairs)
    full_docid = "-".join(components)

    # The docid is a literal string, so a plain substring search is enough.
    start = text.find(full_docid)
    if start == -1:
        # DocID not found — return empty spans and flag it for the caller.
        spans["missing"] = True
        return spans

    spans["structure"].append((start, start + len(full_docid)))

    cursor = start
    for index, component in enumerate(components):
        component_end = cursor + len(component)
        if index == 0:
            category = "company"
        elif index == 1:
            category = "year"
        else:
            category = "keyword"
        spans[category].append((cursor, component_end))
        cursor = component_end + 1  # step over the "-" separator

    return spans


# Per-token weight for each docid component category. A token can fall inside
# several spans at once (the "structure" span contains every other span), in
# which case the largest applicable weight is used.
_DOCID_CATEGORY_WEIGHTS = {
    "company": 5.0,
    "year": 5.0,
    "keyword": 1.0,
    "structure": 0.5,
}


def encode_with_weights(example, tokenizer):
    """Tokenize ``example["text"]`` and attach a per-token weight list.

    ``example["docid"]`` is split on ``"-"`` into company, year and keyword
    parts; consecutive keyword parts are paired up (an unpaired trailing part
    is dropped by ``zip``). The docid is then located in the text and every
    token whose character range lies fully inside a component span receives
    that component's weight from ``_DOCID_CATEGORY_WEIGHTS``. All other
    tokens, including tokens with an empty ``(start, end)`` offset, keep
    weight ``0.0``. If the docid is not found in the text, every weight stays
    ``0.0``.

    Args:
        example: Mapping with string entries ``"text"`` and ``"docid"``.
        tokenizer: Callable accepting the text plus ``truncation``,
            ``padding`` and ``return_offsets_mapping`` keyword arguments and
            returning a mapping with ``"input_ids"`` and ``"offset_mapping"``.

    Returns:
        The tokenizer output with an added ``"weights"`` entry: a list of
        floats, one per entry of ``"input_ids"``.

    Raises:
        ValueError: If the docid has fewer than two ``"-"``-separated parts.
    """
    encoding = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
    )

    # Recover the structured docid components to search for in the text.
    company, year, *keywords = example["docid"].split("-")
    keyword_pairs = list(zip(keywords[::2], keywords[1::2]))
    spans = find_docid_component_spans(example["text"], company, year, keyword_pairs)

    weights = [0.0] * len(encoding["input_ids"])
    for i, (start, end) in enumerate(encoding["offset_mapping"]):
        if start == end:
            # Empty offsets carry no text, so they are never weighted.
            continue
        # Look categories up by name so non-span entries in `spans`
        # (the "missing" flag) are never iterated.
        for category, weight in _DOCID_CATEGORY_WEIGHTS.items():
            for span_start, span_end in spans.get(category, ()):
                if span_start <= start and end <= span_end:
                    # max() keeps the specific company/year/keyword weight
                    # from being overwritten by the enclosing "structure" span.
                    weights[i] = max(weights[i], weight)

    encoding["weights"] = weights
    return encoding

# Smoke test for find_docid_component_spans: embed a docid in a short sentence
# and display the character spans reported for each component.
docid = "ADI-2009-currency-exposures-libor-changes-forward-foreign"
# Separate copy of the docid that actually goes into the text — presumably so
# the two can be made to differ when trying out the "missing" case.
docid2 = "ADI-2009-currency-exposures-libor-changes-forward-foreign"
text = f"here is {docid2}"

parts = docid.split("-")
company, year = parts[0], parts[1]
# The remaining parts are consumed two at a time as (keyword, keyword) pairs.
keyword_pairs = list(zip(parts[2::2], parts[3::2]))

print(len(text))

find_docid_component_spans(text, company, year, keyword_pairs)

65


{'company': [(8, 11)],
 'year': [(12, 16)],
 'keyword': [(17, 35), (36, 49), (50, 65)],
 'structure': [(8, 65)]}

In [2]:
# text = (
#     "Generate a question for the text: Text:"
#     "I went to work on the latest software update. "
#     "Many people are working on it."
#     "It will be released in the next summer."
#     "Speaking of summer, I will go on summer vacation"
#     "I will be going to Germany for 4 days."
#     "Berlin seems to be a nice location at that time of the year."
#     "After the 4 days, I will go to Italy for 2 weeks."
#     "They have the nicest beaches in the summer."
#     "I hope will get a nice tan."
# )

# Instruction that tells the model which output format is expected.
instruction = (
    "Answer the question using a document ID in the format: "
    "company-year-keyword-keyword-keyword-keyword."
)

# Split the target docid: company and year go into the prompt, the remaining
# keyword parts are re-joined into `docid`.
full_docid = "adi-2009-currency-exposures-libor-changes"
company, year, *keywords = full_docid.split("-")
docid = "-".join(keywords)

# Prefix the natural-language question with the known company and year.
query = "What is the estimated currency cost for exposures"
structured_query = f"{company}-{year}, {query}"

# Final prompt
text = f"{instruction}\nQuestion: {structured_query}\nAnswer:"

model_inputs = tokenize(text)
# Only input_ids and attention_mask are forwarded to generate(); both are
# moved to the model's device first.
generated = model.generate(
    model_inputs["input_ids"].to(model.device),
    attention_mask=model_inputs["attention_mask"].to(model.device),
    max_length=64,
)

# Special tokens are kept in the decoded text on purpose (skip_special_tokens=False).
print(tokenizer.decode(generated[0], skip_special_tokens=False))

<pad> adi-2009</s>


In [6]:
# Show which fields tokenize() returned for the most recent prompt.
model_inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping'])

In [2]:
import pandas as pd

# NOTE(review): absolute, machine-specific path — this cell only runs as-is on
# a machine with this exact directory layout.
df = pd.read_csv("/home/nub/Bachelor/bachelor-thesis/data/processed/documents.csv")
# new_id is document_id with its first two "-"-separated fields removed
# (presumably company and year, as in the docids used earlier — confirm
# against the CSV). The lambda calls x.split, so it assumes every document_id
# is a string; a missing value would raise here.
df["new_id"] = df["document_id"].apply(lambda x: "-".join(x.split("-")[2:]))