In [None]:
from retrieval import Retriever
from igt import IGT
from datasets import Dataset
import json
from transformers import T5ForConditionalGeneration, ByT5Tokenizer

In [None]:
# Helper function that turns the IGT records of a file into IGT objects (the IGT class is what the retriever's retrieve method works with).

def parse_igt(file_path, language="Gitksan", metalang="English"):
    r"""Parse a tagged IGT text file into a list of ``IGT`` objects.

    Each record starts with a ``\t`` (transcription) line and may be followed
    by ``\m`` (segmented morphemes), ``\g`` (glosses) and ``\l`` (translation)
    lines. Blank lines and lines before the first ``\t`` are ignored. Fields
    are matched by their tag rather than by position, so a truncated or
    partially filled record no longer raises ``IndexError`` or shifts the
    fields of the records that follow it.

    Args:
        file_path: Path of the UTF-8 encoded IGT file.
        language: Value stored in every record's ``language`` field. Defaults
            to "Gitksan", the value that used to be hard-coded.
        metalang: Value stored in every record's ``metalang`` field.

    Returns:
        One ``IGT`` per ``\t`` line, in file order. ``morpheme`` is ``None``
        when the record has no ``\m`` line (closed track); ``glosses`` and
        ``translation`` are empty strings when their line is missing.
    """
    field_tags = ("\\m", "\\g", "\\l")
    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            tag, value = line[:2], line[2:].strip()
            if tag == "\\t":
                # A transcription line always opens a new record.
                records.append({"\\t": value})
            elif records and tag in field_tags:
                # Keep the first occurrence of a tag inside a record.
                records[-1].setdefault(tag, value)

    return [
        IGT(
            transcription=record["\\t"],
            glosses=record.get("\\g", ""),
            morpheme=record.get("\\m"),
            translation=record.get("\\l", ""),
            language=language,
            metalang=metalang,
        )
        for record in records
    ]

In [None]:
# Example usage: parse the training file and wrap it in a Hugging Face dataset.
train_file = "polygloss/data/raw/sigmorphon_st/Lezgi/lez-train-track2-uncovered"   # change input file
train_igts = parse_igt(train_file)

# Dataset.from_list takes a list of plain dicts, so expose each IGT's attributes via vars().
train_files = list(map(vars, train_igts))
train_dataset = Dataset.from_list(train_files)

Code for the zero-shot experiment (the prompt contains no glossed example)

In [None]:
# ---- Settings to change per experiment ---------------------------------------
LANGUAGE = "Lezgian"    # language name used everywhere in the prompt
TEST_FILE = "polygloss/data/raw/sigmorphon_st/Lezgi/lez-test-track2-covered"
# NOTE(review): the output name says "git" although the input above is a Lezgi
# file; the name is kept unchanged here - rename it if that is not intended.
OUTPUT_FILE = "git_open_noex.txt"

model = T5ForConditionalGeneration.from_pretrained("lecslab/glosslm")
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-base", use_fast=False)

with open(TEST_FILE, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Split the file into blocks: every "\t" line opens a new block. Lines that
# precede the first "\t" end up in a leading block of their own.
igt_blocks = []
cur_block = []
for line in lines:
    if line.startswith("\\t") and cur_block:
        igt_blocks.append(cur_block)
        cur_block = []
    cur_block.append(line)
if cur_block:
    igt_blocks.append(cur_block)

# Strip the tags. Only blocks with a transcription and a translation are
# glossed; cleaned_block_ids remembers which block each entry came from so the
# predictions can be written back to the right block later.
cleaned_blocks = []
cleaned_block_ids = []
for block_id, block in enumerate(igt_blocks):
    transcription = ""
    morpheme = None
    translation = ""
    for line in block:
        if line.startswith("\\t"):
            transcription = line[2:].strip()
        elif line.startswith("\\m"):
            morpheme = line[2:].strip()
        elif line.startswith("\\l"):
            translation = line[2:].strip()
    if transcription and translation:
        cleaned_blocks.append({"transcription": transcription, "morphemes": morpheme, "translation": translation})
        cleaned_block_ids.append(block_id)


def _zero_shot_prompt(text, segmented, translation, language=LANGUAGE):
    """Build the GlossLM prompt for one sentence.

    ``text`` is the segmented morpheme line when ``segmented`` is true and the
    plain transcription otherwise. The "change language" notes that used to
    sit inside the prompt string (and were therefore sent to the model) are
    gone; the language now comes from ``language`` in both places.
    """
    task = "segmented transcription" if segmented else "transcription"
    # NOTE(review): the public GlossLM model-card example appears to fill the
    # "segmented" flag with Python's True/False; lowercase is kept as it was
    # in this notebook - confirm which form the model expects.
    flag = "true" if segmented else "false"
    return (
        f"Provide the glosses for the following {task} in {language}.\n"
        "\n"
        f"Transcription in {language}: {text}\n"
        f"Transcription segmented: {flag}\n"
        f"Translation in English: {translation}\n"
        "\n"
        "Glosses:\n"
    )


prompts = []        # prompts for GlossLM, parallel to cleaned_blocks
for ex in cleaned_blocks:
    if ex["morphemes"]:     # open track: the file provides a "\m" line
        prompts.append(_zero_shot_prompt(ex["morphemes"], True, ex["translation"]))
    else:                   # closed track: only the raw transcription is available
        prompts.append(_zero_shot_prompt(ex["transcription"], False, ex["translation"]))

batch_size = 4
predicted_glosses = []

for i in range(0, len(prompts), batch_size):        # predict
    batch = prompts[i:i + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, max_length=1024, num_beams=1)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted_glosses.extend(decoded)

# Map each prediction back to the block it was generated for. Zipping against
# igt_blocks directly would shift every gloss after a skipped block.
gloss_for_block = dict(zip(cleaned_block_ids, predicted_glosses))

results = []        # output lines: input blocks with the predicted gloss filled in
for block_id, block in enumerate(igt_blocks):
    if block_id not in gloss_for_block:
        results.extend(block)       # nothing was predicted: copy through unchanged
        continue
    gloss_line = f"\\g {gloss_for_block[block_id]}\n"
    # If the block has no "\g" line to overwrite, put the gloss before "\l".
    needs_new_line = not any(line.startswith("\\g") for line in block)
    for line in block:
        if line.startswith("\\g"):
            results.append(gloss_line)
            continue
        if needs_new_line and line.startswith("\\l"):
            results.append(gloss_line)
            needs_new_line = False
        results.append(line)

with open(OUTPUT_FILE, "w", encoding="utf-8") as f:     # store glosses in a file
    f.writelines(results)


Code for the one-shot experiment (one retrieved training example is added to the prompt)

In [None]:
LANGUAGE = "Lezgian"    # language name used in the prompt; change per experiment

retriever = Retriever.stock("max_word_coverage", n_examples=1, dataset=train_dataset, seed=42)
to_gloss = "polygloss/data/raw/sigmorphon_st/Lezgi/lez-test-track2-covered"
test_igts = parse_igt(to_gloss)

model = T5ForConditionalGeneration.from_pretrained("lecslab/glosslm")
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-base", use_fast=False)

prompts = []
for test_igt in test_igts:
    ex = retriever.retrieve(test_igt)[0]

    # Open track records carry a morpheme line and are prompted with the
    # segmented form; closed track records (morpheme is None) fall back to the
    # plain transcription. This replaces the manual "change this line" edits.
    segmented = test_igt.morpheme is not None
    if segmented:
        example_text, query_text = ex.morpheme, test_igt.morpheme
        segmented_flag = "true"
    else:
        example_text, query_text = ex.transcription, test_igt.transcription
        segmented_flag = "false"

    # The prompt is assembled line by line so that it contains neither the
    # source indentation nor the editing notes that used to be inside the
    # string, and so that "\\t" is the literal marker rather than a TAB.
    prompt = (
        "Here are some complete glossed examples:\n"
        "\n"
        f"\\t {example_text}\n"
        f"\\g {ex.glosses}\n"
        f"\\l {ex.translation}\n"
        "\n"
        f"Provide the glosses for the following transcription in {LANGUAGE}.\n"
        "\n"
        f"Transcription in {LANGUAGE}: {query_text}\n"
        f"Transcription segmented: {segmented_flag}\n"
        f"Translation in English: {test_igt.translation}\n"
        "\n"
        "Glosses:\n"
    )
    prompts.append(prompt)

batch_size = 4
predicted_glosses = []

print("start glossing")

for i in range(0, len(prompts), batch_size):
    batch = prompts[i:i + batch_size]
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask,
                             max_length=1024, num_beams=1)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted_glosses.extend(decoded)

output_path = "/content/lez_open_1ex.txt"
with open(output_path, "w", encoding="utf-8") as f:
    for test_igt, gloss in zip(test_igts, predicted_glosses):
        f.write(f"\\t {test_igt.transcription}\n")
        # Closed track records have no morpheme line; do not write "\m None".
        if test_igt.morpheme is not None:
            f.write(f"\\m {test_igt.morpheme}\n")
        f.write(f"\\g {gloss}\n")
        f.write(f"\\l {test_igt.translation}\n\n")