<a href="https://colab.research.google.com/github/VeeraAdhi/NM-projects/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import Dataset  # Import the Dataset class

# Inline sample dataset: six product reviews, two for each of three products.
# Every record carries the same keys:
#   product_id, product_name   - identify the reviewed product
#   user_id                    - identifier of the reviewer
#   rating                     - integer score (values in this sample span 1..5)
#   title, review_text         - headline and body of the review
#   timestamp                  - numeric string; looks like Unix epoch seconds (TODO confirm)
#   helpful_votes, total_votes - integer vote counts
# The fine-tuning code further down reads only a subset of these fields; the
# remaining ones are kept for reference.
data = [
    {
        "product_id": "B00004R99U",
        "product_name": "Logitech Wireless Mouse M325",
        "user_id": "A141J4181G1999",
        "rating": 5,
        "title": "Great mouse for the price!",
        "review_text": "I've been using this mouse for over a year now and it's still going strong. The wireless connection is reliable, and it fits comfortably in my hand. Battery life is also excellent. Highly recommended!",
        "timestamp": "1364899200",
        "helpful_votes": 10,
        "total_votes": 12
    },
    {
        "product_id": "B00004R99U",
        "product_name": "Logitech Wireless Mouse M325",
        "user_id": "A3SPW1H9WWGY5",
        "rating": 3,
        "title": "Decent, but could be better",
        "review_text": "It works as expected, but the scroll wheel feels a bit cheap and sometimes gets stuck. The size is a little small for my large hands. Okay for basic use.",
        "timestamp": "1375267200",
        "helpful_votes": 3,
        "total_votes": 5
    },
    {
        "product_id": "B00171APVA",
        "product_name": "Kindle Paperwhite (Previous Generation - 5th)",
        "user_id": "A2R6PLI49H436",
        "rating": 4,
        "title": "Still a great e-reader",
        "review_text": "Even though it's an older model, the Kindle Paperwhite is still fantastic for reading. The e-ink screen is easy on the eyes, and the battery lasts for weeks. Misses some of the newer features, but it does the job well.",
        "timestamp": "1403913600",
        "helpful_votes": 25,
        "total_votes": 28
    },
    {
        "product_id": "B00171APVA",
        "product_name": "Kindle Paperwhite (Previous Generation - 5th)",
        "user_id": "A1U36F9XGF3Y8",
        "rating": 1,
        "title": "Screen died after 3 months",
        "review_text": "Very disappointed with this purchase. The screen stopped working completely after only three months of light use. Customer service was unhelpful. Would not recommend.",
        "timestamp": "1411257600",
        "helpful_votes": 1,
        "total_votes": 15
    },
    {
        "product_id": "B00GUF1E8G",
        "product_name": "Anker PowerCore 10000 Portable Charger",
        "user_id": "AWGJL3894797F",
        "rating": 5,
        "title": "Best portable charger I've owned",
        "review_text": "This power bank is a lifesaver! It's compact, charges my phone multiple times, and feels very well-built. A must-have for travel.",
        "timestamp": "1459296000",
        "helpful_votes": 18,
        "total_votes": 20
    },
    {
        "product_id": "B00GUF1E8G",
        "product_name": "Anker PowerCore 10000 Portable Charger",
        "user_id": "A29B991XLSY45",
        "rating": 4,
        "title": "Good capacity, a bit bulky",
        "review_text": "It holds a lot of charge, which is great for long trips. However, it's a bit heavier and bulkier than I expected. Still a good product overall.",
        "timestamp": "1467331200",
        "helpful_votes": 7,
        "total_votes": 8
    }
]

# Build the fine-tuning corpus from the review bodies. Each example is prefixed
# with "Review for <product name>: ", the exact prompt format the review
# generator sends at inference time, so the model is trained on the same text
# layout it is later asked to continue (training on bare review text would
# leave the prompt format unseen until serving).
texts = [
    f"Review for {item['product_name']}: {item['review_text']}"
    for item in data
]

# Create a Hugging Face Dataset object with a single "text" column
raw_dataset = Dataset.from_dict({'text': texts})

# Load tokenizer and model
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Padding needs a pad token; when the tokenizer does not define one, reuse the
# end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Take the id from the tokenizer so model and tokenizer cannot disagree.
    model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the "text" column of a batch of examples.

    Sequences longer than 128 tokens are truncated and shorter ones are padded
    up to 128, so every encoded example has the same fixed length.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    fixed_length_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(examples["text"], **fixed_length_options)
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

# Prepare data collator
# mlm=False selects causal language modeling: labels are built from the input
# ids rather than from randomly masked tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments
output_dir = "./gpt2-finetuned-reviews"
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # The corpus is tiny, so the whole run is only a handful of optimizer
    # steps. With the library's default logging interval (500 steps) no
    # training loss would ever be reported; log every step instead.
    logging_steps=1,
    # Far beyond the length of this run, so no intermediate checkpoint is
    # written; the final weights are saved explicitly after training.
    save_steps=500,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs-reviews',
    report_to="none",  # do not send metrics to any external tracker
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the finetuned model and tokenizer side by side so both can be reloaded
# from the same directory.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Finetuned model saved to {output_dir}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Finetuned model saved to ./gpt2-finetuned-reviews


In [None]:
!pip install gradio
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# Directory the fine-tuning run wrote its model and tokenizer files to; it
# must exist before this cell is executed.
finetuned_model_path = "./gpt2-finetuned-reviews"

# Reload the fine-tuned weights and the matching tokenizer from that directory.
model = AutoModelForCausalLM.from_pretrained(finetuned_model_path)
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)

def generate_review(product_name, max_length=200, temperature=0.8, top_k=50, top_p=0.95):
    """Generate a product review for the given product name.

    Builds the prompt "Review for <product_name>: ", samples one continuation
    from the module-level ``model`` and decodes it with the module-level
    ``tokenizer``.

    Args:
        product_name: Name of the product, inserted into the prompt.
        max_length: Value forwarded as ``max_length`` to ``model.generate``;
            it bounds the total sequence length, prompt tokens included.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_k: Top-k sampling cutoff forwarded to ``model.generate``.
        top_p: Nucleus (top-p) sampling cutoff forwarded to ``model.generate``.

    Returns:
        The decoded text of the sampled sequence with special tokens removed.
        The text starts with the prompt itself.
    """
    prompt = f"Review for {product_name}: "

    # Calling the tokenizer (instead of ``encode``) also yields the attention
    # mask. Passing it explicitly matters here because the pad token id equals
    # the end-of-sequence id, so ``generate`` cannot infer the mask on its own.
    # Moving the tensors to the model's device keeps the call working when the
    # model has been placed on an accelerator.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Exactly one sequence is requested, so take the first decoded entry.
    generated_review = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    return generated_review

def review_interface(product_name):
    """Return a generated review for ``product_name``.

    Thin adapter used as the UI callback: it forwards the product name to
    ``generate_review`` and leaves every sampling parameter at its default.
    """
    return generate_review(product_name)

# Assemble the web UI: a single text input (the product name) is passed to
# review_interface, and its return value is shown in a single text output.
iface = gr.Interface(
    title="Product Review Generator",
    description="Enter a product name and let the AI generate a review based on the finetuned model.",
    fn=review_interface,
    inputs=gr.Textbox(label="Enter Product Name"),
    outputs=gr.Textbox(label="Generated Review"),
)

# Start serving the interface.
iface.launch()

Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

