In [None]:
# Install the libraries this notebook uses (-q keeps pip's output short).
# NOTE(review): versions are unpinned, so a rerun may pick up different releases;
# consider pinning them and using `%pip` so the install targets the kernel's environment.
!pip install -q transformers datasets accelerate torch


In [None]:
import torch
# Displayed as the cell output: True when PyTorch can see a CUDA device.
torch.cuda.is_available()


In [None]:
from datasets import load_dataset

# Fetch the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

# Bare expression so the notebook renders a summary of what was loaded.
dataset


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch



In [None]:
# Load the tokenizer that matches the distilgpt2 checkpoint used below.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
# Reuse the end-of-sequence token for padding so padded batches can be built
# (the tokenization cell pads with padding="max_length").
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Load the pretrained distilgpt2 checkpoint with its language-modelling head.
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
# Keep the embedding matrix in step with the tokenizer's vocabulary size.
# NOTE(review): no tokens are added to the tokenizer in this notebook (the pad
# token aliases EOS), so this presumably leaves the size unchanged — confirm.
model.resize_token_embeddings(len(tokenizer))


In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)


In [None]:

def tokenize_function(example, max_length=128):
    """Tokenize the "text" entry of a row (or batch of rows) to a fixed length.

    Uses the module-level ``tokenizer``. Every sequence is truncated to, and
    padded up to, ``max_length`` tokens.

    Args:
        example: Mapping with a "text" entry holding the string(s) to encode.
            When used with ``dataset.map(..., batched=True)`` this is a batch.
        max_length: Number of tokens each encoded sequence is cut or padded
            to. Defaults to 128, the value that used to be hard-coded here;
            override it via ``dataset.map(..., fn_kwargs={"max_length": n})``.

    Returns:
        Whatever ``tokenizer`` returns for the call.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )


In [None]:
# Make sure a padding token is set before encoding (padding reuses EOS); this
# keeps the cell usable even if the tokenizer cell's assignment was skipped.
tokenizer.pad_token = tokenizer.eos_token

# Encode in batches and drop the raw "text" column from the result.
tokenized_dataset = dataset.map(tokenize_function, remove_columns=["text"], batched=True)

In [None]:
# Build the causal-LM targets. GPT2LMHeadModel shifts labels internally, so they
# start as a copy of input_ids. Padded positions (attention_mask == 0) are set
# to -100, the index the loss ignores: padding reuses the EOS id in this
# notebook, so copying input_ids verbatim would also train on the padding.
# Rows with fewer than two real tokens are dropped first, because after the
# internal shift they would not contribute a single target.
tokenized_dataset = tokenized_dataset.filter(
    lambda row: sum(row["attention_mask"]) > 1
).map(
    lambda x: {
        "labels": [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(x["input_ids"], x["attention_mask"])
        ]
    },
    batched=True
)


In [None]:
from transformers import Trainer, TrainingArguments


In [None]:
# Training configuration: 3 epochs, batch size 4 per device, a log line every
# 100 steps, one checkpoint per epoch under ./results, no external reporting.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_steps=100,
    save_strategy="epoch",
    # Mixed precision is only requested when a CUDA device is present; the
    # notebook falls back to CPU elsewhere, where fp16 training is rejected.
    fp16=torch.cuda.is_available(),
    report_to="none"
)


In [None]:
# Wire the model, the training configuration and the "train" split together.
trainer = Trainer(
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    model=model,
)


In [None]:
# Run fine-tuning with the trainer built above; the call's return value is
# rendered as this cell's output.
trainer.train()


In [None]:
# Colab-only: mount Google Drive at /content/drive so the model can be saved there.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Destination folder, located under the Drive mount point.
SAVE_PATH = "/content/drive/MyDrive/llm_next_word_project"

# Write model and tokenizer into the same folder so that folder can later be
# passed to from_pretrained() for both.
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Folder the model and tokenizer were saved to.
SAVE_PATH = "/content/drive/MyDrive/llm_next_word_project"

# Pick the GPU when one is visible to PyTorch, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Restore tokenizer and model from the saved folder.
tokenizer = GPT2Tokenizer.from_pretrained(SAVE_PATH)
model = GPT2LMHeadModel.from_pretrained(SAVE_PATH)

model.to(device)
# Put the module in evaluation mode for generation; as the last expression its
# return value is what the cell displays.
model.eval()


In [None]:
def predict_next_words_llm(
    prompt,
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8,
):
    """Continue ``prompt`` with text generated by the loaded model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: Sample from the distribution (True, the default) instead of
            decoding deterministically. With sampling on and no seed set, the
            output may differ between calls.
        top_k: Passed to ``generate`` as the top-k sampling cutoff.
        top_p: Passed to ``generate`` as the nucleus-sampling threshold.
        temperature: Passed to ``generate`` as the sampling temperature.

    Returns:
        The decoded first generated sequence (prompt included), with special
        tokens stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Gradients are not needed for generation.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            # State the padding id explicitly (EOS, matching how the tokenizer
            # was configured) rather than relying on generate()'s own fallback.
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
# Sample a continuation for a short prompt; generation samples by default, so
# the printed text may differ from run to run.
print(predict_next_words_llm("India is"))


In [None]:
# Second sample prompt; output may vary between runs because sampling is on.
print(predict_next_words_llm("Technology will"))
