In [13]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
import time
import evaluate
import pandas as pd
import numpy as np

from peft import LoraConfig, get_peft_model, TaskType, LoraConfig, get_peft_model, TaskType, PeftModel
import torch.nn.functional as F

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Sentiment Model

In [4]:
model_name='google/flan-t5-base'

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, 
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

peft_model = PeftModel.from_pretrained(base_model, './peft-dialogue-summary-checkpoint-local/')
peft_model = peft_model.merge_and_unload()

tokenizer_peft = AutoTokenizer.from_pretrained("./peft-dialogue-summary-checkpoint-local/", trust_remote_code=True)
peft_model.pad_token_id = tokenizer_peft.pad_token_id
peft_model.config.pad_token_id = tokenizer_peft.pad_token_id
peft_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [16]:
# Adjust generation parameters
generation_config = GenerationConfig(
    max_new_tokens=1,
    num_beams=2,      # Increase the number of beams for better exploration but more computation need
    temperature=0.0,  # Set to 0 to eliminate randomness
    top_k=1,          # Setting top_k to 1 ensures only the most probable token is considered
    top_p=1.0         # Setting top_p to 1.0 effectively disables nucleus sampling
)

def get_sentiment(text):
    prompt = f"""
    classify sentiment: 
    {text}
    
    Sentiment:
    """
    # Start measuring time
    start_time = time.time()

    # Tokenize the input and move to the same device
    input_ids = tokenizer_peft(prompt, 
                               return_tensors='pt',
                               max_length=512,
                               truncation=True,
                               padding='max_length'
    ).input_ids.to(device)

    # Generate output with the PEFT model
    with torch.no_grad():
        peft_output = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
    
    peft_model_sentiment_output = tokenizer_peft.decode(peft_output[0], skip_special_tokens=True)
    
    # End measuring time
    end_time = time.time()
    elapsed_time = end_time - start_time

    return(peft_model_sentiment_output, elapsed_time)

In [37]:
def count_tokens(text, tokenizer):
    """
    Counts the number of tokens in the given text using the specified tokenizer.
    
    Parameters:
    text (str): The input text to tokenize.
    tokenizer: The tokenizer object used for encoding the text.
    
    Returns:
    int: The number of tokens in the input text.
    """
    # Tokenize the text and return the length of the token list
    tokens = tokenizer(text, return_tensors='pt', truncation=False, padding=False).input_ids.to(device)
    return len(tokens[0])

## Summarization Model

In [41]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

pegasus_sum = 'tuner007/pegasus_summarizer'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_sum)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_sum).to(torch_device)

In [64]:
def get_summarize(input_text):
  batch = pegasus_tokenizer([input_text],truncation=True,padding='longest',max_length=1024, return_tensors="pt").to(torch_device)
  gen_out = pegasus_model.generate(
        **batch,
        max_length=128,
        num_beams=3,
        temperature=0.7,        # Lower temperature to reduce randomness
        top_k=50,               # Use top_k sampling
        top_p=0.95,             # Use top_p sampling
        repetition_penalty=2.0  # Penalize repetition
    )
  output_text = pegasus_tokenizer.batch_decode(gen_out, skip_special_tokens=True)
  return output_text

## BS4 setup

In [17]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from imdb import IMDb

In [18]:
def scrape_imdb_reviews(imdb_id):
    # Construct the URL for the movie's reviews page
    url = f"https://www.imdb.com/title/tt{imdb_id}/reviews?sort=totalVotes"

    # Send a GET request to the URL
    response = requests.get(url)
    
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all review elements on the page
    reviews = soup.find_all("div", class_="text show-more__control")

    # Extract the text of each review
    review_texts = [review.get_text() for review in reviews]

    return review_texts

def get_imdb_id(movie_title):
    # Create an instance of the IMDb class
    ia = IMDb()

    # Search for a movie by title
    results = ia.search_movie(movie_title)

    # Return the IMDb ID of the first result, if available
    if results:
        return results[0].movieID
    else:
        return None

## Some testing

In [45]:
imdb_id = get_imdb_id("The Shawshank Redemption")
reviews = scrape_imdb_reviews(imdb_id)
init_sentiment, init_runtime = get_sentiment(reviews[0])

paragraph_sum = get_summarize(reviews[0])
sum_sentiment, sum_runtime = get_sentiment(paragraph_sum)

dash_line = '-' * 100
print(dash_line)
print("Initial paragraph: ", reviews[0])
print("Initial sentiment: ", init_sentiment)
print("Initial runtime: ", init_runtime)

print(dash_line)
print("Summary: ", paragraph_sum)
print("Sentiment Summary: ", sum_sentiment)
print("Runtime Summary: ", sum_runtime)

----------------------------------------------------------------------------------------------------
Initial paragraph:  Can Hollywood, usually creating things for entertainment purposes only, create art?  To create something of this nature, a director must approach it in a most meticulous manner, due to the delicacy of the process.  Such a daunting task requires an extremely capable artist with an undeniable managerial capacity and an acutely developed awareness of each element of art in their films, the most prominent; music, visuals, script, and acting. These elements, each equally important, must succeed independently, yet still form a harmonious union, because this mixture determines the fate of the artist's opus.  Though already well known amongst his colleagues for his notable skills at writing and directing, Frank Darabont emerges with his feature film directorial debut, The Shawshank Redemption.  Proving himself already a master of the craft, Darabont managed to create one of 

In [52]:
imdb_id = get_imdb_id("Plowjaifjafheauife")

In [54]:
print(imdb_id)

None


In [51]:
start_time = time.time()
try_sent = get_summarize(reviews[0])
end_time = time.time()
token_count = count_tokens(try_sent, tokenizer_peft)
elapsed_time = end_time - start_time
print(elapsed_time)
print(f"Number of tokens: {token_count}")

37.20534944534302
Number of tokens: 128


In [65]:
def show_info(title):
    imdb_id = get_imdb_id(title)
    if imdb_id == None:
        return 'Title not found'
    
    reviews = scrape_imdb_reviews(imdb_id)
    summary_reviews = []
    
    for review in reviews:
        summarize = get_summarize(review)
        sentiment = get_sentiment(summarize)
        summary_reviews.append((summarize, sentiment))

    return summary_reviews

In [66]:
x = show_info("Interstellar")

In [67]:
x

[(['\'Interstellar\' is "the best sci-fi movie of the decade, about mankind, about pioneers, about humanity, and probably the best movie of the decade ever made", wrote Bollywood Hungama. "It can only be a 10, because in itself, it is a beginning for a new kind of cinema," it added. The film also starred Matthew McConaughey, Anne Hathaway, Michael Caine and Ben Affleck.'],
  ('positive', 2.2364296913146973)),
 (['Christopher Nolan\'s \'Interstellar\' is the first film of a new stage in Nolan\'s filmography, Steven Soderbergh said. "It\'s the proof that Nolan has finally managed to create a masterpiece...despite all the expectations that were placed on him after the success of \'The Dark Knight\'," he added. "This is much more than just a sci-fi movie," Soderbergh further said.'],
  ('positive', 2.3036580085754395)),
 (['After watching this insane movie in the theatres back in 2014 I swore to god I will wait 5 years to watch it again so I get to forget it and experience the insanity it 

In [None]:
from collections import defaultdict

grouped_reviews = defaultdict(lambda: {"positive": [], "negative": []})

for review, sentiment in x:
    grouped_reviews[title][sentiment].append(review)