In [10]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re

In [2]:
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

# Set the default device for tensors
torch.cuda.set_device(0)  # Set the GPU you want to use if you have multiple GPUs

Using NVIDIA GeForce GTX 1650


## 1 - Dataset and LLM

In [3]:
dataset = load_dataset("imdb")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [11]:
# Function to clean text from HTML tags
def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Remove escape characters
    text = re.sub(r'\\', '', text)
    return text

In [12]:
# Clean the text in the dataset
for split in ['train', 'test', 'unsupervised']:
    dataset[split] = dataset[split].map(lambda example: {'text': clean_text(example['text']), 'label': example['label']})

In [5]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
from transformers import pipeline

pipe = pipeline(task="text2text-generation", model=original_model, tokenizer=original_tokenizer, device=device)

In [14]:
def calculate_summary(input_text):
    prompt = "Summarize the following movie review.: " + input_text + "\n\nSentiment:"
    output = pipe(prompt, max_length=512)
    return output[0]['generated_text']

In [15]:
input_text = original_tokenizer.decode(example['input_ids'], skip_special_tokens=True)
print(input_text)
# prediction = calculate_sentiment(input_text)

NameError: name 'example' is not defined

In [7]:
index = 200

text = dataset['test']['text'][index]
label = dataset['test']['label'][index]

prompt = f"""
Summarize the following movie review.

{text}

Summary: """

inputs = original_tokenizer(prompt, return_tensors='pt', max_length=512, truncation=True)
output = original_tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

# Convert the label to "negative" if 0, and "positive" if 1
sentiment_label = "positive" if label == 1 else "negative"

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE IMDB SENTIMENT:\n{sentiment_label}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following movie review.

He who fights with monsters might take care lest he thereby become a monster. And if you gaze for long into an abyss, the abyss gazes also into you.<br /><br />Yes, this is from Nietzsche's Aphorism 146 from "Beyond Good and Evil". And that's what you find at the start of this movie.<br /><br />If you watch the whole movie, you will doubt if it was the message that the Ram Gopal Varma Production wanted to pass on. As the scenes crop up one by one, quite violent and at times puke-raking, the viewer is expected to forget the Nietzsche quote and think otherwise. That to deal with few people you need dedicated people like Sadhu Agashe who will have the licence to kill anyone, not just writing FIRs (something unworthy of the police to do, as we are made to believe).<br /><br />When TADA was repealed and the government wanted to pass newer 