In [None]:
!pip install unsloth torch transformers datasets accelerate sentencepiece pandas regex

Collecting unsloth
  Downloading unsloth-2026.1.4-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting unsloth_zoo>=2026.1.4 (from unsloth)
  Downloading unsloth_zoo-2026.1.4-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.5-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.34-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from unsloth)
  Downloading trl-0.24.0-py3-none-any.whl.metadata (

# Lexicon + Rule Layer

In [None]:
import torch
# Report whether CUDA is usable and, if so, the name of device 0.
has_gpu = torch.cuda.is_available()
print(has_gpu)
if has_gpu:
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU")

True
Tesla T4


In [None]:
import os
import re
import csv
import pandas as pd
import torch

from datasets import Dataset, concatenate_datasets
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer


In [None]:
# Column headers of the replacement-pair CSVs, keyed by language code.
# Each value is (source-term column, inclusive-term column). Every language
# except English carries a " (<Language>)" suffix on both headers.
_LEXICON_COLUMN_SUFFIXES = {
    "en": "",
    "es": " (Spanish)",
    "ta": " (Tamil)",
    "kn": " (Kannada)",
}

LANG_COLUMNS = {
    code: ("Original Terms" + suffix, "Inclusive Terms" + suffix)
    for code, suffix in _LEXICON_COLUMN_SUFFIXES.items()
}

## Lexicon builder

In [None]:
def normalize(text):
    """Return ``text`` as a lowercase string with whitespace canonicalized.

    The value is converted with ``str()``, stripped at both ends, lowercased,
    and every run of whitespace is collapsed to a single space.
    """
    lowered = str(text).strip().lower()
    # Splitting on whitespace runs and re-joining with one space is the same
    # as substituting each run with " ".
    return " ".join(re.split(r"\s+", lowered))

def load_replacement_lexicon(path, lang):
    """Load the term-replacement lexicon for one language from a CSV file.

    Args:
        path: Location of the CSV file, read with ``pd.read_csv``.
        lang: Language code; must be a key of ``LANG_COLUMNS``, which names
            the source-term and inclusive-term columns to read.

    Returns:
        dict mapping each normalized source term (see ``normalize``) to its
        stripped replacement, ordered from the longest source term to the
        shortest so that callers iterating in order try long phrases first.

    Raises:
        KeyError: if ``lang`` is not in ``LANG_COLUMNS``.
        ValueError: if the CSV lacks either expected column.
    """
    src_col, tgt_col = LANG_COLUMNS[lang]
    df = pd.read_csv(path)

    # Fail loudly on a schema mismatch instead of silently returning an empty
    # lexicon (which would turn the whole rule layer into a no-op).
    missing = [col for col in (src_col, tgt_col) if col not in df.columns]
    if missing:
        raise ValueError(
            f"Expected columns ({src_col}, {tgt_col}) in {path}, "
            f"found {list(df.columns)}"
        )

    lexicon = {}
    for src, tgt in zip(df[src_col], df[tgt_col]):
        if pd.isna(src) or pd.isna(tgt):
            continue

        key = normalize(src)
        if not key:
            # A blank source term would match everywhere; ignore it.
            continue

        # str() guards against cells pandas parsed as numbers or booleans.
        lexicon[key] = str(tgt).strip()

    # longest phrase first (prevents partial replacement bugs)
    return dict(sorted(lexicon.items(), key=lambda item: -len(item[0])))


## Safe rule-based replacement layer

In [None]:
def apply_lexicon(sentence, lexicon):
    """Replace whole-word occurrences of lexicon terms in ``sentence``.

    Args:
        sentence: Text to rewrite.
        lexicon: Mapping of source term -> replacement text. Terms are applied
            in the mapping's iteration order and matched case-insensitively.

    Returns:
        The sentence with every standalone occurrence of each source term
        replaced by its replacement text, inserted literally.

    Notes:
        * Word edges are detected with ``(?<!\\w)`` / ``(?!\\w)`` plus a check
          for adjacent Unicode combining marks. ``\\b`` is unsuitable here:
          marks such as Tamil/Kannada vowel signs and viramas are not ``\\w``,
          so ``\\b`` rejects a standalone term ending in a mark and accepts the
          same term inside a longer word.
        * Whitespace inside a term matches any run of whitespace in the
          sentence.
        * Empty or whitespace-only terms are skipped.
    """
    import unicodedata  # local import keeps this cell self-contained

    def _is_combining_mark(text, index):
        # True when text[index] exists and is a Unicode mark (category M*).
        return 0 <= index < len(text) and unicodedata.category(text[index]).startswith("M")

    result = sentence
    for src, tgt in lexicon.items():
        parts = src.split()
        if not parts:
            continue

        pattern = re.compile(
            r"(?<!\w)" + r"\s+".join(re.escape(part) for part in parts) + r"(?!\w)",
            flags=re.IGNORECASE,
        )

        def _substitute(match, replacement=tgt):
            text = match.string
            # A combining mark directly before or after the match means the
            # match sits inside a longer word; leave it untouched.
            if _is_combining_mark(text, match.start() - 1) or _is_combining_mark(text, match.end()):
                return match.group(0)
            # Returning from a function inserts the text literally, so
            # backslashes in the replacement are not parsed as a template.
            return replacement

        result = pattern.sub(_substitute, result)

    return result


In [None]:
# Replacement-pair CSV for each language code. The file prefixes do not
# always equal the language code ("sp" for es, "ka" for kn).
_LEXICON_FILES = {
    "en": "/en_replacement_pairs.csv",
    "es": "/sp_replacement_pairs.csv",
    "ta": "/ta_replacement_pairs.csv",
    "kn": "/ka_replacement_pairs.csv",
}

# Language code -> loaded replacement lexicon.
LEXICONS = {
    code: load_replacement_lexicon(csv_path, code)
    for code, csv_path in _LEXICON_FILES.items()
}


In [None]:
# Sentence-pair CSV headers per language code, as
# (non-inclusive column, inclusive column).
SENTENCE_COLUMNS = dict(
    # English and Spanish files share the same headers.
    en=("non-inclusive", "inclusive"),
    es=("non-inclusive", "inclusive"),
    # German headers carry the language name without parentheses.
    de=("non-inclusive German", "inclusive German"),
    # Tamil and Kannada use language-specific, parenthesized headers.
    ta=("non-inclusive (Tamil)", "inclusive (Tamil)"),
    kn=("non-inclusive (Kannada)", "inclusive (Kannada)"),
)


In [None]:
def load_sentence_pairs(path, lang):
    """Load one language's sentence pairs as a ``Dataset``.

    Args:
        path: CSV file to read with ``pd.read_csv``.
        lang: Language code; must be a key of ``SENTENCE_COLUMNS``.

    Returns:
        A ``Dataset`` with string columns ``input``, ``output`` and ``lang``.
        ``input`` has already been passed through ``apply_lexicon`` with the
        lexicon registered for ``lang`` in ``LEXICONS`` (an empty lexicon when
        none is registered, e.g. for "de"). Rows missing either sentence are
        dropped.

    Raises:
        ValueError: if ``lang`` has no column mapping or the CSV lacks the
            mapped columns.
    """
    df = pd.read_csv(path)

    if lang not in SENTENCE_COLUMNS:
        raise ValueError(f"No column mapping defined for language: {lang}")

    src_col, tgt_col = SENTENCE_COLUMNS[lang]

    if src_col not in df.columns or tgt_col not in df.columns:
        raise ValueError(
            f"Expected columns ({src_col}, {tgt_col}) in {path}, "
            f"found {list(df.columns)}"
        )

    pairs = (
        df[[src_col, tgt_col]]
        .rename(columns={src_col: "input", tgt_col: "output"})
        # Drop incomplete rows; str() on a missing cell would otherwise put
        # the literal text "nan" into the training data.
        .dropna(subset=["input", "output"])
        .astype(str)
        .assign(lang=lang)
        .reset_index(drop=True)
    )

    # Rule-first cleanup using the lexicon for this language
    lexicon = LEXICONS.get(lang, {})
    pairs["input"] = pairs["input"].map(
        lambda sentence: apply_lexicon(sentence, lexicon)
    )

    # preserve_index=False keeps the pandas index out of the dataset columns.
    return Dataset.from_pandas(pairs, preserve_index=False)


In [None]:
import pandas as pd

# Inspect the English sentence-pair file: column names first, then the
# leading rows as the cell's displayed value.
en_pairs_path = "/content/en_sentence_pairs.csv"
df = pd.read_csv(en_pairs_path)
print(df.columns)
df.head()

Index(['non-inclusive', 'inclusive', 'Category'], dtype='object')


Unnamed: 0,non-inclusive,inclusive,Category
0,How many man-hours will this project take?,How many person-hours will this project take?,Gender
1,"If a manager picks up this topic, he should go...","If a manager picks up this topic, they should ...",Gender
2,Each salesman must meet his quota by Friday,Each salesperson must meet their quota by Friday,Gender
3,We require manpower to get this done,We require workforce to get this done,Gender
4,"If an engineer finds a flaw, he must report it","If an engineer finds a flaw, they must report it",Gender


In [None]:
import pandas as pd

# Inspect the German sentence-pair file: column names first, then the
# leading rows as the cell's displayed value.
de_pairs_path = "/content/de_sentence_pairs.csv"
df = pd.read_csv(de_pairs_path)
print(df.columns)
df.head()

Index(['ID', 'non-inclusive German', 'inclusive German'], dtype='object')


Unnamed: 0,ID,non-inclusive German,inclusive German
0,1,Wie viele Mannmonate wird dieses Projekt in An...,Wie viele Personenmonate wird dieses Projekt i...
1,2,"Wenn ein Manager dieses Thema aufgreift, sollt...","Wenn das Management dieses Thema aufgreift, so..."
2,3,Jeder Verkäufer muss seine Quote bis Freitag e...,Jede Verkaufskraft muss ihre Quote bis Freitag...
3,5,"Wenn ein Ingenieur einen Fehler findet, muss e...","Wenn das Ingenieurwesen einen Fehler findet, m..."
4,6,Jeder Wissenschaftler musste seinen Vorschlag ...,Das gesamte Wissenschaftliche Personal musste ...


In [None]:
import pandas as pd

# Preview the Tamil and Kannada sentence-pair files. A notebook only renders
# the last expression of a cell automatically, so a bare `df.head()` in the
# middle of the cell is never shown; display() (provided by the notebook
# kernel) renders each preview explicitly.
for preview_path in (
    "/content/ta_sentence_pairs.csv",
    "/content/kn_sentence_pairs.csv",
):
    df = pd.read_csv(preview_path)
    print(df.columns)
    display(df.head())

Index(['non-inclusive', 'inclusive', 'Category', 'non-inclusive (Tamil)',
       'inclusive (Tamil)'],
      dtype='object')
Index(['non-inclusive', 'inclusive', 'Category', 'non-inclusive (Kannada)',
       'inclusive (Kannada)'],
      dtype='object')


Unnamed: 0,non-inclusive,inclusive,Category,non-inclusive (Kannada),inclusive (Kannada)
0,How many man-hours will this project take?,How many person-hours will this project take?,Gender,ಈ ಯೋಜನೆಯು ಎಷ್ಟು ಮಾನವ-ಗಂಟೆಗಳನ್ನು ತೆಗೆದುಕೊಳ್ಳುತ್...,ಈ ಯೋಜನೆಯು ಎಷ್ಟು ಮಾನವ-ಗಂಟೆಗಳನ್ನು ತೆಗೆದುಕೊಳ್ಳುತ್...
1,"If a manager picks up this topic, he should go...","If a manager picks up this topic, they should ...",Gender,"ಮ್ಯಾನೇಜರ್ ಈ ವಿಷಯವನ್ನು ಎತ್ತಿಕೊಂಡರೆ, ಅದನ್ನು ಪರಿಹ...","ಮ್ಯಾನೇಜರ್ ಈ ವಿಷಯವನ್ನು ಎತ್ತಿಕೊಂಡರೆ, ಅದನ್ನು ಪರಿಹ..."
2,Each salesman must meet his quota by Friday,Each salesperson must meet their quota by Friday,Gender,ಪ್ರತಿಯೊಬ್ಬ ಮಾರಾಟಗಾರನು ತನ್ನ ಕೋಟಾವನ್ನು ಶುಕ್ರವಾರದ...,ಪ್ರತಿ ಮಾರಾಟಗಾರರು ಶುಕ್ರವಾರದೊಳಗೆ ತಮ್ಮ ಕೋಟಾವನ್ನು ...
3,We require manpower to get this done,We require workforce to get this done,Gender,ಇದನ್ನು ಮಾಡಲು ನಮಗೆ ಮಾನವಶಕ್ತಿಯ ಅಗತ್ಯವಿದೆ,ಇದನ್ನು ಮಾಡಲು ನಮಗೆ ಉದ್ಯೋಗಿಗಳ ಅಗತ್ಯವಿದೆ
4,"If an engineer finds a flaw, he must report it","If an engineer finds a flaw, they must report it",Gender,ಇಂಜಿನಿಯರ್ ಲೋಪ ಕಂಡು ಬಂದರೆ ಅದನ್ನು ಅವನು ವರದಿ ಮಾಡಬೇಕು,ಇಂಜಿನಿಯರ್ ಲೋಪ ಕಂಡು ಬಂದರೆ ಅದನ್ನು ಅವರು ವರದಿ ಮಾಡಬೇಕು


In [None]:
# Languages with a "/content/<code>_sentence_pairs.csv" file, in load order.
_SENTENCE_PAIR_LANGS = ("en", "es", "de", "ta", "kn")

datasets = [
    load_sentence_pairs(f"/content/{code}_sentence_pairs.csv", code)
    for code in _SENTENCE_PAIR_LANGS
]

# Merge all languages into one dataset and shuffle with a fixed seed.
combined = concatenate_datasets(datasets)
train_dataset = combined.shuffle(seed=42)


In [None]:
# Spot-check one row of the shuffled training set.
sample_index = 10
print(train_dataset[sample_index])

{'input': 'எதிர்வரும் காலத்திற்கான புதிய தலைவர்களை தெரிவு செய்ய வேண்டும்.', 'output': 'எதிர்வரும் காலத்திற்கான புதிய தலைவர்களை நாம் தெரிவு செய்ய வேண்டும்.', 'lang': 'ta'}


In [None]:
def format_example(example):
    """Add the instruction-formatted training string to ``example``.

    Reads ``example["input"]`` and ``example["output"]`` and the optional
    ``"lang"`` key (default ``"en"``), stores the assembled prompt plus target
    under ``example["text"]``, and returns the same mapping.
    """
    lang_tag = f"<lang={example.get('lang', 'en')}>"
    prompt_lines = [
        "### Instruction:",
        "Rewrite the sentence using gender-inclusive language.",
        "Do not change meaning.",
        "Do not add new information.",
        "Make the smallest possible change.",
        "",
        "### Input:",
        f"{lang_tag} {example['input']}",
        "",
        "### Output:",
        f"{lang_tag} {example['output']}",
    ]
    example["text"] = "\n".join(prompt_lines)
    return example

# Add the formatted "text" column row by row, then list the resulting columns.
train_dataset = train_dataset.map(function=format_example, batched=False)
print(train_dataset.column_names)

Map:   0%|          | 0/4419 [00:00<?, ? examples/s]

['input', 'output', 'lang', 'text']


In [None]:
# Formatting is deterministic, so mapping again over a dataset that already
# has the "text" column only repeats work. Map only when the column is absent,
# which also keeps this cell safe to re-run.
if "text" not in train_dataset.column_names:
    train_dataset = train_dataset.map(format_example)


Map:   0%|          | 0/4419 [00:00<?, ? examples/s]

In [None]:
# Load Llama-3.1-8B-Instruct in 4-bit with Unsloth.
# dtype=None lets Unsloth choose the compute dtype the GPU supports instead of
# hard-coding bfloat16; the recorded run on a Tesla T4 reports
# "Bfloat16 = FALSE", so bfloat16 is not available there.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3.1-8b-instruct",
    max_seq_length=512,
    dtype=None,
    load_in_4bit=True,
)


==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Attach LoRA adapters to the attention and MLP projections.
# lora_dropout is 0 because Unsloth's fast LoRA patching only supports zero
# dropout; with 0.05 the recorded log warned of a performance hit and reported
# "0 QKV layers, 0 O layers and 0 MLP layers" patched.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2026.1.4 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
def tokenize(batch):
    """Tokenize a batch of formatted examples for causal-LM fine-tuning.

    Args:
        batch: Mapping with a ``"text"`` list, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an added
        ``"labels"`` entry: a copy of ``input_ids`` in which every position
        whose ``attention_mask`` is 0 is replaced by -100.
    """
    # Terminate each example with the end-of-sequence token so the model can
    # learn where an answer stops.
    eos = tokenizer.eos_token or ""
    texts = [text + eos for text in batch["text"]]

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    # -100 is the label value the loss ignores; without it the loss is also
    # computed over the padding that fills each row up to max_length.
    tokenized["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize in batches and drop every pre-existing column, leaving only the
# fields produced by tokenize().
columns_to_drop = train_dataset.column_names
train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=columns_to_drop)

print(train_dataset.column_names)

Map:   0%|          | 0/4419 [00:00<?, ? examples/s]

['input_ids', 'attention_mask', 'labels']


In [None]:
# Choose mixed precision from the hardware rather than hard-coding fp16:
# bf16 where the GPU supports it, fp16 otherwise (e.g. Tesla T4).
from unsloth import is_bfloat16_supported

use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="./gild_subtaskA_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch size 2 x 8 = 16
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    optim="adamw_8bit",
    report_to="none",
)


In [None]:
# Fine-tune the LoRA adapters on the tokenized dataset.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()

# Save the adapter weights and the tokenizer into the same directory.
adapter_dir = "gild_subtaskA_lora"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,419 | Num Epochs = 3 | Total steps = 831
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss
50,19.3709


In [None]:
# Switch the fine-tuned model to Unsloth's inference mode.
FastLanguageModel.for_inference(model)

def generate(sentence, lang, max_new_tokens=64):
    """Rewrite ``sentence`` inclusively: lexicon rules first, then the model.

    Args:
        sentence: Sentence to rewrite.
        lang: Language code used for the ``<lang=...>`` tag and to select the
            lexicon from ``LEXICONS`` (empty lexicon when none is registered).
        max_new_tokens: Upper bound on generated tokens (default 64). Raise it
            if long sentences come back truncated.

    Returns:
        The generated rewrite as a stripped string, without the leading
        ``<lang=...>`` tag and without any text after a following ``###``
        section marker.
    """
    sentence = apply_lexicon(sentence, LEXICONS.get(lang, {}))

    prompt = f"""### Instruction:
Rewrite the sentence using gender-inclusive language.
Do not change meaning.
Do not add new information.
Make the smallest possible change.

### Input:
<lang={lang}> {sentence}

### Output:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Deterministic beam search. temperature/top_p only apply when sampling,
    # so they are not passed alongside do_sample=False.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=4,
        do_sample=False,
    )

    # Decode only the newly generated tokens, so the result cannot be confused
    # with prompt text or with a later "### Output:" the model may emit.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Keep the first answer only: cut at the next section marker, if any.
    completion = completion.split("###", 1)[0]

    # Training targets begin with "<lang=..> ", so the model emits that tag
    # before the rewrite; remove it from the prediction.
    completion = re.sub(r"^\s*<lang=[^>]*>\s*", "", completion)
    return completion.strip()


In [None]:
# Run the English test sentences (column "text") through generate() and
# write the predictions, one per row, as the submission file.
test_df = pd.read_csv("data/test/en_test.csv")  # column: text
test_df["prediction"] = [generate(sentence, "en") for sentence in test_df["text"]]

submission = test_df[["prediction"]]
submission.to_csv("submission_en.csv", index=False)
