# Loading Dataset

In [1]:
from datasets import load_dataset

token = "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"

dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2",use_auth_token=token)
tokenizer.pad_token = tokenizer.eos_token

def tokenizer_fn(examples):
    encodings = tokenizer(examples["text"], padding="max_length", truncation=True,max_length=512)
    encodings["labels"] = encodings["input_ids"].copy()
    return encodings

count = 0
for example in dataset["train"]:
    if example["text"].strip(): 
        count = count + 1
        print(example)
        if(count == 10):
            break
        



{'text': ' = Valkyria Chronicles III = \n'}
{'text': ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n'}
{'text': " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making t

In [2]:
import accelerate
print(accelerate.__version__)

1.8.1


# Model Training

In [3]:
tokenized_train = dataset["train"].map(tokenizer_fn, batched=True, remove_columns=["text"])
tokenized_val = dataset["validation"].map(tokenizer_fn, batched=True, remove_columns=["text"])
tokenized_test = dataset["test"].map(tokenizer_fn, batched=True, remove_columns=["text"])

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=2,
    fp16=True,
    num_train_epochs=3,
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_train,eval_dataset=tokenized_val,data_collator=data_collator)
trainer.train()

model.save_pretrained("./fine_tuned_model")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,0.4331,0.422343
2,0.408,0.418494
3,0.4085,0.417911


# Model Evaluation

In [4]:
import math
from evaluate import load

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 1.52


In [19]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
from transformers import default_data_collator

def evaluate_top_k_accuracy(model, dataset, k=5, batch_size=8):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()

    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=default_data_collator)  
    total_tokens = 0
    matched_predictions = 0

    for batch in tqdm(loader, desc=f"Evaluating Top-{k} Accuracy"):
        input_ids = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)

        targets = input_ids[:, 1:]
        inputs = input_ids[:, :-1]
        mask = attn_mask[:, :-1]

        with torch.no_grad():
            logits = model(input_ids=inputs, attention_mask=mask).logits  # [B, T, V]

        top_k_indices = logits.topk(k=k, dim=-1).indices  # [B, T, K]
        hits = (top_k_indices == targets.unsqueeze(-1)).any(dim=-1)  # [B, T]

        matched_predictions += hits.sum().item()
        total_tokens += hits.numel()

    accuracy = matched_predictions / total_tokens if total_tokens else 0.0
    return round(accuracy * 100, 2)


In [21]:
top5_acc = evaluate_top_k_accuracy(model, tokenized_test, k=5)
print(f"Top-5 Accuracy: {top5_acc}%")


Evaluating Top-5 Accuracy: 100%|█████████████████████████████████████████████████████| 545/545 [02:08<00:00,  4.23it/s]

Top-5 Accuracy: 95.23%





In [29]:
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.json',
 './fine_tuned_model\\merges.txt',
 './fine_tuned_model\\added_tokens.json',
 './fine_tuned_model\\tokenizer.json')

# GUI INTERFACE (DO RUN IT)

In [4]:
import tkinter as tk
from tkinter import font as tkFont
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

window = tk.Tk()
window.title("Next Word Predictor")
window.geometry("700x500")
window.configure(bg="#FFF3C7")

font_title = tkFont.Font(family="Comic Sans MS", size=24, weight="bold")
font_label = tkFont.Font(family="Comic Sans MS", size=16)
font_button = tkFont.Font(family="Comic Sans MS", size=12)

header = tk.Label(window, text="Next Word Predictor", font=font_title, fg="#FF6F61", bg="#FFF3C7")
header.pack(pady=20)

entry = tk.Entry(window, font=font_label, width=40, bg="#FFF6F1", fg="#333")
entry.pack(pady=10)
entry.focus_set()

result_labels = []

def get_top_k_predictions(text, k=3):
    input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(input_ids)
        next_token_logits = outputs.logits[:, -1, :]
        top_k_ids = torch.topk(next_token_logits, k).indices[0].tolist()
        predictions = [tokenizer.decode([i]).strip() for i in top_k_ids]
    return predictions

def on_change(event):
    text = entry.get()
    if not text.strip():
        return
    predictions = get_top_k_predictions(text)
    for lbl in result_labels:
        lbl.destroy()
    result_labels.clear()
    for pred in predictions:
        lbl = tk.Label(window, text=pred, font=font_button, bg="#FFDD94", fg="#000", padx=10, pady=5, cursor="hand2",
                       relief="raised", bd=2)
        lbl.bind("<Button-1>", lambda e, p=pred: (entry.insert(tk.END, " " + p.strip()), on_change(None)))
        lbl.pack(pady=5)
        result_labels.append(lbl)


entry.bind("<KeyRelease>", on_change)

window.mainloop()
