# Setting Up

In [1]:
from transformers import AutoTokenizer, Gemma3ForConditionalGeneration

import torch

  from .autonotebook import tqdm as notebook_tqdm


# Loading the model and tokenizer

In [2]:
GEMMA_PATH = "google/gemma-3-12b-it"

model = Gemma3ForConditionalGeneration.from_pretrained(
    GEMMA_PATH,
    torch_dtype=torch.bfloat16,
    attn_implementation='eager',
    # load_in_8bit=True,
    device_map="auto",
).eval()

tokenizer = AutoTokenizer.from_pretrained(GEMMA_PATH)

Loading checkpoint shards: 100%|██████████| 5/5 [01:23<00:00, 16.62s/it]


# Loading and processing the dataset

In [3]:
from datasets import load_from_disk, Dataset

train_dataset = load_from_disk("processed_data/train")
val_dataset = load_from_disk("processed_data/val")

In [4]:
train_dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 1095
})

In [5]:
print(train_dataset['prompt'][0])

print("=================")

print(train_dataset['completion'][0])

You are an assistant that answers questions about meeting transcripts.

Meeting Transcript:
Grad E: Right .
Postdoc A: Mm - hmm .
Professor C: and then it 's IBM .
Postdoc A: Mm - hmm , mm - hmm .
Grad E: Right .
Professor C: OK , so you might as well ha run the automatic thing over the entire meeting , and then {disfmarker} and then , uh , you would give IBM whatever was fixed .
Postdoc A: And have them fix it over the entire meeting too ?
Grad E: Right .
Professor C: Well , yeah , but start from the beginning and go to the end , right ? So if they were only half way through then that 's what you 'd give IBM .
Postdoc A: OK .
Professor C: Right ?
PhD B: As of what point ? I mean . The {disfmarker} I guess the question on my mind is do we wait for the transcribers to adjust the marks for the whole meeting before we give anything to IBM , or do we go ahead and send them a sample ? Let their {disfmarker}
Professor C: Why wouldn't we s @ @ w i if they were going sequentially through it , 

In [6]:
train_prompt_style="""{}


Answer: {}
"""

def formatting_prompts_func(examples):
    prompts = examples["prompt"]
    completions = examples["completion"]
    texts = []
    for prompt, completion in zip(prompts, completions):
        # Append the EOS token to the response if it's not already there
        if not completion.endswith(tokenizer.eos_token):
            completion += tokenizer.eos_token
        text = train_prompt_style.format(prompt, completion)
        texts.append(text)
    return {"text": texts}

In [7]:
train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)
val_dataset = val_dataset.map(formatting_prompts_func, batched = True,)
print(train_dataset["text"][0])

You are an assistant that answers questions about meeting transcripts.

Meeting Transcript:
Grad E: Right .
Postdoc A: Mm - hmm .
Professor C: and then it 's IBM .
Postdoc A: Mm - hmm , mm - hmm .
Grad E: Right .
Professor C: OK , so you might as well ha run the automatic thing over the entire meeting , and then {disfmarker} and then , uh , you would give IBM whatever was fixed .
Postdoc A: And have them fix it over the entire meeting too ?
Grad E: Right .
Professor C: Well , yeah , but start from the beginning and go to the end , right ? So if they were only half way through then that 's what you 'd give IBM .
Postdoc A: OK .
Professor C: Right ?
PhD B: As of what point ? I mean . The {disfmarker} I guess the question on my mind is do we wait for the transcribers to adjust the marks for the whole meeting before we give anything to IBM , or do we go ahead and send them a sample ? Let their {disfmarker}
Professor C: Why wouldn't we s @ @ w i if they were going sequentially through it , 

In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # we're doing causal LM, not masked LM
)

# Model inference before fine-tuning

In [9]:
print(train_dataset["prompt"][0] + "\n\nAnswer:" + tokenizer.eos_token)

You are an assistant that answers questions about meeting transcripts.

Meeting Transcript:
Grad E: Right .
Postdoc A: Mm - hmm .
Professor C: and then it 's IBM .
Postdoc A: Mm - hmm , mm - hmm .
Grad E: Right .
Professor C: OK , so you might as well ha run the automatic thing over the entire meeting , and then {disfmarker} and then , uh , you would give IBM whatever was fixed .
Postdoc A: And have them fix it over the entire meeting too ?
Grad E: Right .
Professor C: Well , yeah , but start from the beginning and go to the end , right ? So if they were only half way through then that 's what you 'd give IBM .
Postdoc A: OK .
Professor C: Right ?
PhD B: As of what point ? I mean . The {disfmarker} I guess the question on my mind is do we wait for the transcribers to adjust the marks for the whole meeting before we give anything to IBM , or do we go ahead and send them a sample ? Let their {disfmarker}
Professor C: Why wouldn't we s @ @ w i if they were going sequentially through it , 

In [10]:
inputs = tokenizer(
    [train_dataset["prompt"][0] + "\n\nAnswer: " + tokenizer.eos_token],
    return_tensors="pt"
).to("cuda")

In [11]:
outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

In [12]:
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0])

You are an assistant that answers questions about meeting transcripts.

Meeting Transcript:
Grad E: Right .
Postdoc A: Mm - hmm .
Professor C: and then it 's IBM .
Postdoc A: Mm - hmm , mm - hmm .
Grad E: Right .
Professor C: OK , so you might as well ha run the automatic thing over the entire meeting , and then {disfmarker} and then , uh , you would give IBM whatever was fixed .
Postdoc A: And have them fix it over the entire meeting too ?
Grad E: Right .
Professor C: Well , yeah , but start from the beginning and go to the end , right ? So if they were only half way through then that 's what you 'd give IBM .
Postdoc A: OK .
Professor C: Right ?
PhD B: As of what point ? I mean . The {disfmarker} I guess the question on my mind is do we wait for the transcribers to adjust the marks for the whole meeting before we give anything to IBM , or do we go ahead and send them a sample ? Let their {disfmarker}
Professor C: Why wouldn't we s @ @ w i if they were going sequentially through it , 

# Setting up the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import LoraConfig

# LoRA Configuration
peft_config = LoraConfig(
    lora_alpha=64,                           # Scaling factor for LoRA
    lora_dropout=0.05,                       # Add slight dropout for regularization
    r=32,                                    # Rank of the LoRA update matrices
    bias="none",                             # No bias reparameterization
    task_type="CAUSAL_LM",                   # Task type: Causal Language Modeling
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Target modules for LoRA
)


# Training Arguments
training_arguments = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    logging_steps=0.1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="none"
)

# Initialize the Trainer
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    data_collator=data_collator,
)

Converting eval dataset to ChatML: 100%|██████████| 237/237 [00:00<00:00, 7966.61 examples/s]
Adding EOS to eval dataset: 100%|██████████| 237/237 [00:00<00:00, 12426.71 examples/s]
Tokenizing eval dataset: 100%|██████████| 237/237 [00:00<00:00, 237.90 examples/s]
Truncating eval dataset: 100%|██████████| 237/237 [00:00<00:00, 12126.85 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


# Model training

In [14]:
torch.cuda.empty_cache()
trainer_stats = trainer.train()

Step,Training Loss
274,3.9322
548,3.5901
822,2.8561
1096,2.6111
1370,1.6005
1644,1.5101
1918,0.7471
2192,0.6758
2466,0.2811


# Saving the model and tokenizer

In [15]:
new_model_online = "TWongsamut/Gemma-3-12B-QMSum-QA-v3"
new_model_local = "Gemma-3-12B-QMSum-QA-v3"
model.save_pretrained(new_model_local) # Local saving
tokenizer.save_pretrained(new_model_local)

('Gemma-3-12B-QMSum-QA-v3/tokenizer_config.json',
 'Gemma-3-12B-QMSum-QA-v3/special_tokens_map.json',
 'Gemma-3-12B-QMSum-QA-v3/tokenizer.model',
 'Gemma-3-12B-QMSum-QA-v3/added_tokens.json',
 'Gemma-3-12B-QMSum-QA-v3/tokenizer.json')

In [29]:
model.push_to_hub(new_model_online) # Online saving
tokenizer.push_to_hub(new_model_online) # Online saving

model-00006-of-00006.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]
[A

[A[A


[A[A[A



model-00006-of-00006.safetensors:   2%|▏         | 3.65M/241M [00:00<00:06, 36.0MB/s]

[A[A


[A[A[A
[A


[A[A[A

[A[A

model-00006-of-00006.safetensors:   7%|▋         | 16.0M/241M [00:00<00:11, 18.8MB/s]
model-00006-of-00006.safetensors:  10%|█         | 24.6M/241M [00:01<00:10, 20.8MB/s]


[A[A[A


[A[A[A
[A

[A[A

[A[A
[A

[A[A
model-00006-of-00006.safetensors:  11%|█         | 27.0M/241M [00:02<00:24, 8.81MB/s]
model-00006-of-00006.safetensors:  13%|█▎        | 31.8M/241M [00:02<00:18, 11.4MB/s]


[A[A[A


[A[A[A

[A[A


model-00006-of-00006.safetensors:  14%|█▍        | 33.7M/241M [00:02<00:29, 7.06MB/s]
[A
[A

[A[A
model-00006-of-00006.safetensors:  23%|██▎       | 54.2M/241M [00:03<00:10, 17.4MB/s]


[A[A[A

[A[A


[A[A[A
[A

[A[A
model-00006-of-00006.safetensors:  24%|██▍       | 57.9M/241M [00:04<00:18, 9.75MB/s]

model-00006-

CommitInfo(commit_url='https://huggingface.co/TWongsamut/Gemma-3-12B-QMSum-QA-v3/commit/88c35a30b57fc1e9374b4eecba541cc2c4a8b402', commit_message='Upload tokenizer', commit_description='', oid='88c35a30b57fc1e9374b4eecba541cc2c4a8b402', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TWongsamut/Gemma-3-12B-QMSum-QA-v3', endpoint='https://huggingface.co', repo_type='model', repo_id='TWongsamut/Gemma-3-12B-QMSum-QA-v3'), pr_revision=None, pr_num=None)

# Model inference after fine-tuning

In [17]:
# from transformers import AutoTokenizer, Gemma3ForConditionalGeneration

# import torch

In [None]:
# GEMMA_PATH = "Gemma-3-12B-QMSum-QA-v2"

# model = Gemma3ForConditionalGeneration.from_pretrained(
#     GEMMA_PATH,
#     torch_dtype=torch.bfloat16,
#     attn_implementation='eager',
#     # load_in_8bit=True,
#     device_map="auto",
# ).eval()

# tokenizer = AutoTokenizer.from_pretrained(GEMMA_PATH)

In [19]:
# from datasets import load_from_disk

# train_dataset = load_from_disk("processed_data/train")
# val_dataset = load_from_disk("processed_data/val")

In [28]:
inputs = tokenizer(
    [train_dataset["prompt"][1] + "\n\nAnswer: "],
    return_tensors="pt"
).to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=50,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(response[0])

You are an assistant that answers questions about meeting transcripts.

Meeting Transcript:
Grad E: Whereas I think it it 's probably something pathologic and actually Stephane 's results , I think confirm that . He s he did the Aurora system also got very lousy average error , like fifteen or {disfmarker} or , uh , fifteen to twenty percent average ? But then he ran it just on the lapel , and got about five or six percent word error ? So that {disfmarker} that means to me that somewhere in the other recordings there are some pathological cases . But , you know , we {disfmarker} th that may not be true . It may be just some of the segments they 're just doing a lousy job on . So I 'll {disfmarker} I 'll listen to it and find out since you 'd actually split it up by segment .
Professor C: Right .
Grad E: So I can actually listen to it .
PhD F: Yeah .
PhD B: Did you run the {disfmarker} Andreas {disfmarker} the r SRI recognizer on the digits ?
Grad E: Oh , I thought he had sent that arou