In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
import time
import evaluate
import pandas as pd
import numpy as np

from peft import LoraConfig, get_peft_model, TaskType, LoraConfig, get_peft_model, TaskType, PeftModel
import torch.nn.functional as F

## Load Model

In [2]:
model_name='google/flan-t5-base'

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, 
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

peft_model = PeftModel.from_pretrained(base_model, './peft-dialogue-summary-checkpoint-local/')
peft_model = peft_model.merge_and_unload()

tokenizer_peft = AutoTokenizer.from_pretrained("./peft-dialogue-summary-checkpoint-local/", trust_remote_code=True)

In [3]:
peft_model.pad_token_id = tokenizer_peft.pad_token_id
peft_model.config.pad_token_id = tokenizer_peft.pad_token_id

In [4]:
# Move models to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
peft_model.to(device)

# Adjust generation parameters
generation_config = GenerationConfig(
    max_new_tokens=1,
    num_beams=4,  # Increase the number of beams for better exploration
    temperature=0.7,  # Add temperature to control randomness
    top_k=50,  # Add top_k sampling
    top_p=0.95  # Add top_p sampling
)

def get_sentiment(text):
    prompt = f"""
    classify sentiment: 
    {text}
    
    Sentiment:
    """
    # Start measuring time
    start_time = time.time()

    # Tokenize the input and move to the same device
    input_ids = tokenizer_peft(prompt, return_tensors='pt').input_ids.to(device)

    # Generate output with the PEFT model
    with torch.no_grad():
        peft_output = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
    
    peft_model_sentiment_output = tokenizer_peft.decode(peft_output[0], skip_special_tokens=True)
    
    # End measuring time
    end_time = time.time()
    elapsed_time = end_time - start_time

    # dash_line = '-' * 100
    # print(dash_line)
    # print(f'INPUT PROMPT:\n{prompt}')
    # print(dash_line)
    # print(f'PEFT MODEL GENERATION - ZERO SHOT:\n{peft_model_sentiment_output}')
    # print(dash_line)
    # print(f'Running time: {elapsed_time:.2f} seconds')
    return(peft_model_sentiment_output, elapsed_time)

## Scrape movie reviews

In [5]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from imdb import IMDb

In [6]:
def scrape_imdb_reviews(imdb_id):
    # Construct the URL for the movie's reviews page
    url = f"https://www.imdb.com/title/tt{imdb_id}/reviews?sort=totalVotes"

    # Send a GET request to the URL
    response = requests.get(url)
    
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all review elements on the page
    reviews = soup.find_all("div", class_="text show-more__control")

    # Extract the text of each review
    review_texts = [review.get_text() for review in reviews]

    return review_texts

def get_imdb_id(movie_title):
    # Create an instance of the IMDb class
    ia = IMDb()

    # Search for a movie by title
    results = ia.search_movie(movie_title)

    # Return the IMDb ID of the first result, if available
    if results:
        return results[0].movieID
    else:
        return None

In [7]:
movie_titles = [
    "The Shawshank Redemption",
    "The Godfather",
    "The Dark Knight",
    "The Godfather: Part II",
    "12 Angry Men",
    "Schindler's List",
    "The Lord of the Rings: The Return of the King",
    "Pulp Fiction",
    "The Lord of the Rings: The Fellowship of the Ring",
    "The Good, the Bad and the Ugly",
    "Forrest Gump",
    "The Lord of the Rings: The Two Towers",
    "Fight Club",
    "Inception",
    "Star Wars: Episode V - The Empire Strikes Back",
    "The Matrix",
    "GoodFellas",
    "One Flew Over the Cuckoo's Nest",
    "Se7en",
    "Interstellar"
]

all_reviews = []

for title in movie_titles:
    imdb_id = get_imdb_id(title)
    reviews = scrape_imdb_reviews(imdb_id)[:25]  # Get 25 reviews per movie
    all_reviews.append((title, reviews))

In [10]:
sentiment_reviews = []

for title, reviews in all_reviews:
    for review in reviews:
        sentiment, _ = get_sentiment(review)  # Assuming get_sentiment returns sentiment and runtime
        sentiment_reviews.append((title, review, sentiment))

Token indices sequence length is longer than the specified maximum sequence length for this model (1343 > 512). Running this sequence through the model will result in indexing errors


In [11]:
from collections import defaultdict

grouped_reviews = defaultdict(lambda: {"positive": [], "negative": []})

for title, review, sentiment in sentiment_reviews:
    grouped_reviews[title][sentiment].append(review)

In [12]:
grouped_reviews["The Shawshank Redemption"]["positive"]

["Can Hollywood, usually creating things for entertainment purposes only, create art?  To create something of this nature, a director must approach it in a most meticulous manner, due to the delicacy of the process.  Such a daunting task requires an extremely capable artist with an undeniable managerial capacity and an acutely developed awareness of each element of art in their films, the most prominent; music, visuals, script, and acting. These elements, each equally important, must succeed independently, yet still form a harmonious union, because this mixture determines the fate of the artist's opus.  Though already well known amongst his colleagues for his notable skills at writing and directing, Frank Darabont emerges with his feature film directorial debut, The Shawshank Redemption.  Proving himself already a master of the craft, Darabont managed to create one of the most recognizable independent releases in the history of Hollywood.  The Shawshank Redemption defines a genre, defi

In [8]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

pegasus_sum = 'tuner007/pegasus_summarizer'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_sum)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_sum).to(torch_device)

def get_response(input_text):
  batch = pegasus_tokenizer([input_text],truncation=True,padding='longest',max_length=1024, return_tensors="pt").to(torch_device)
  gen_out = pegasus_model.generate(**batch,max_length=128,num_beams=5, num_return_sequences=1, temperature=1.5)
  output_text = pegasus_tokenizer.batch_decode(gen_out, skip_special_tokens=True)
  return output_text

tokenizer_config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

In [10]:
example = """I have never seen such an amazing film since I saw The Shawshank Redemption. Shawshank encompasses friendships, hardships, hopes, and dreams.  And what is so great about the movie is that it moves you, it gives you hope.  Even though the circumstances between the characters and the viewers are quite different, you don't feel that far removed from what the characters are going through.It is a simple film, yet it has an everlasting message.  Frank Darabont didn't need to put any kind of outlandish special effects to get us to love this film, the narration and the acting does that for him.  Why this movie didn't win all seven Oscars is beyond me, but don't let that sway you to not see this film, let its ranking on the IMDb's top 250 list sway you, let your friends recommendation about the movie sway you.Set aside a little over two hours tonight and rent this movie.  You will finally understand what everyone is talking about and you will understand why this is my all time favorite movie."""

In [11]:
get_response(example)

["Frank Darabont's 'The Shawshank Redemption' is one of the best films of all time. It's a simple film, yet it has an everlasting message. Darabont didn't need to put any kind of outlandish special effects to get us to love this film, the narration and the acting does that for him. Why this movie didn't win all seven Oscars is beyond me."]

In [None]:
summarized_reviews = []

for title, sentiments in grouped_reviews.items():
    for sentiment, reviews in sentiments.items():
        combined_text = " ".join(reviews)
        # Use a tool or ChatGPT to summarize combined_text
        summary = summarize_text(combined_text)  # Replace with your summarization method
        summarized_reviews.append((title, sentiment, summary))


## Langchain for summarization

In [78]:
# Move models to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
peft_model.to(device)

# Function to summarize text
def summarize_text(text):
    input_text = "Summarize the following movie review: " + text
    input_ids = tokenizer_peft(input_text, return_tensors="pt").input_ids.to(device)
    outputs = peft_model.generate(input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer_peft.decode(outputs[0], skip_special_tokens=True)
    
    return summary

# Function to chunk reviews respecting the token limit
def chunk_reviews(reviews, tokenizer, max_length):
    chunks = []
    current_chunk = ""
    current_length = 0

    for review in reviews:
        review_length = len(tokenizer.tokenize(review))
        if review_length > max_length:
            # If a single review exceeds the max length, split it
            review_chunks = [review[i:i + max_length] for i in range(0, len(review), max_length)]
            chunks.extend(review_chunks)
        else:
            # Check if adding this review to the current chunk would exceed the max length
            if current_length + review_length > max_length:
                # Save the current chunk and start a new one
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = review
                current_length = review_length
            else:
                # Add the review to the current chunk
                if current_chunk:
                    current_chunk += " " + review
                else:
                    current_chunk = review
                current_length += review_length

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

In [79]:
# Example input reviews
reviews = [
    "Can Hollywood, usually creating things for entertainment purposes only, create art? To create something of this nature, a director must approach it in a most meticulous manner, due to the delicacy of the process. Such a daunting task requires an extremely capable artist with an undeniable managerial capacity and an acutely developed awareness of each element of art in their films, the most prominent; music, visuals, script, and acting. These elements, each equally important, must succeed independently, yet still form a harmonious union, because this mixture determines the fate of the artist's opus. Though already well known amongst his colleagues for his notable skills at writing and directing, Frank Darabont emerges with his feature film directorial debut, The Shawshank Redemption. Proving himself already a master of the craft, Darabont managed to create one of the most recognizable independent releases in the history of Hollywood. The Shawshank Redemption defines a genre, defies the odds, compels the emotions, and brings an era of artistically influential films back to Hollywood.",
    "I have never seen such an amazing film since I saw The Shawshank Redemption. Shawshank encompasses friendships, hardships, hopes, and dreams. And what is so great about the movie is that it moves you, it gives you hope. Even though the circumstances between the characters and the viewers are quite different, you don't feel that far removed from what the characters are going through. It is a simple film, yet it has an everlasting message. Frank Darabont didn't need to put any kind of outlandish special effects to get us to love this film, the narration and the acting does that for him. Why this movie didn't win all seven Oscars is beyond me, but don't let that sway you to not see this film, let its ranking on the IMDb's top 250 list sway you, let your friends recommendation about the movie sway you. Set aside a little over two hours tonight and rent this movie. You will finally understand what everyone is talking about and you will understand why this is my all time favorite movie.",
    "In its Oscar year, Shawshank Redemption (written and directed by Frank Darabont, after the novella Rita Hayworth and the Shawshank Redemption, by Stephen King) was nominated for seven Academy Awards, and walked away with zero. Best Picture went to Forrest Gump, while Shawshank and Pulp Fiction were 'just happy to be nominated.' Of course hindsight is 20/20, but while history looks back on Gump as a good film, Pulp and Redemption are remembered as some of the all-time best. Pulp, however, was a success from the word 'go,' making a huge splash at Cannes and making its writer-director an American master after only two films. For Andy Dufresne and Co., success didn't come easy. Fortunately, failure wasn't a life sentence. After opening on 33 screens with take of $727,327, the $25M film fell fast from theatres and finished with a mere $28.3M. The reasons for failure are many. Firstly, the title is a clunker. While iconic to fans today, in 1994, people knew not and cared not what a 'Shawshank' was. On the DVD, Tim Robbins laughs recounting fans congratulating him on 'that 'Rickshaw' movie.' Marketing-wise, the film's a nightmare, as 'prison drama' is a tough sell to women, and the story of love between two best friends doesn't spell winner to men. Worst of all, the movie is slow as molasses. As Desson Thomson writes for the Washington Post, 'it wanders down subplots at every opportunity and ignores an abundance of narrative exit points before settling on its finale.' But it is these same weaknesses that make the film so strong. Firstly, its setting. The opening aerial shots of the prison are a total eye-opener. This is an amazing piece of architecture, strong and Gothic in design. Immediately, the prison becomes a character. It casts its shadow over most of the film, its tall stone walls stretching above every shot. It towers over the men it contains, blotting out all memories of the outside world. Only Andy (Robbins) holds onto hope. It's in music, it's in the sandy beaches of Zihuatanejo; 'In here's where you need it most,' he says. 'You need it so you don't forget. Forget that there are places in the world that aren't made out of stone. That there's a - there's a - there's something inside that's yours, that they can't touch.' Red (Morgan Freeman) doesn't think much of Andy at first, picking 'that tall glass o' milk with the silver spoon up his ass' as the first new fish to crack. Andy says not a word, and losing his bet, Red resents him for it. But over time, as the two get to know each other, they quickly become the best of friends. This again, is one of the film's major strengths. Many movies are about love, many flicks have a side-kick to the hero, but Shawshank is the only one I can think of that looks honestly at the love between two best friends. It seems odd that Hollywood would skip this relationship time and again, when it's a feeling that weighs so much into everyone's day to day lives. Perhaps it's too sentimental to seem conventional, but Shawshank's core friendship hits all the right notes, and the film is much better for it. It's pacing is deliberate as well. As we spend the film watching the same actors, it is easy to forget that the movie's timeline spans well over 20 years. Such a huge measure of time would pass slowly in reality, and would only be amplified in prison. And it's not as if the film lacks interest in these moments. It still knows where it's going, it merely intends on taking its sweet time getting there. It pays off as well, as the tedium of prison life makes the climax that much more exhilarating. For anyone who sees it, it is a moment never to be forgotten."
]

# Define the chunk size based on the model's token limit
chunk_size = 512  # Adjust this based on the tokenizer's behavior

# Chunk the reviews
chunks = chunk_reviews(reviews, tokenizer_peft, chunk_size)

# Summarize each chunk
summaries = [summarize_text(chunk) for chunk in chunks]

# Combine the summaries
final_summary = " ".join(summaries)

print("Final Summary:")
print(final_summary)

Final Summary:
Shawshank Redemption to Pulp Fiction, and Pulp Fiction to Shawshank Redemption, and Pulp Fiction to Shawshank Redemption, and Pulp Fiction to Shawshank Redemption. The Shawshank Redemption is a tad cheesy, but it's not a bad movie. It's not a bad movie, but it's not a bad movie. It's a shame that Robbins hasn't had a chance to make a film since 'Rickshaw', but he's still a talented director. 'The Prisoner of Azteca' is one of the most powerful films I've seen in a long time. It's a film that's not just about the prison, it's about the people who live there. It's about the people who live there. Red's best friends, and they're the best of the best in a long line of romantic comedies and comedies, and they're the best in a long line of romantic comedies and comedies. Shawshank is a heartwarming, heartwarming film about two best friends who share a love for each other. It's also a heartwarming, heartwarming, heartwarming film about friendship. It's a good movie, but it's al

In [84]:
last_summary = summarize_text(final_summary)
print(last_summary)

The Shawshank Redemption is a tad cheesy, but it's not a bad movie. It's not a bad movie, but it's not a bad movie. It's a shame that Robbins hasn't had a chance to make a film since 'Rickshaw', but he's still a talented director. The Prisoner of Azteca is one of the most powerful films I've seen in a long time. It's a heartwarming, heartwarming, heartwarming film about two best friends who share a love for each other. It's 


## ---------------------

In [46]:
# Example usage
imdb_id = get_imdb_id("Avengers: Endgame")
reviews = scrape_imdb_reviews(imdb_id)
dash_line = '-' * 100
# Print the first few reviews
for i, review in enumerate(reviews, 1):
    print(f"Review {i}:")
    print(review)
    sentiment, runtime = get_sentiment(review)
    print('Sentiment: ', sentiment)
    print('Elapsed Time: ', runtime)
    print(dash_line)

Review 1:
I have to say, my first reaction walking out of the cinema was that it was great. Probably an 8/10. You know there's so much fan service in this movie and I particularly loved the "I can do this all day" CA line from one CA to another. It was almost Toy Story 2, enlightened Buzz to naive Buzz banter.I loved Clever Hulk, found Thor hilarious, though a bit annoying at times, and loved the references to past movies. Cap swinging Mjollnir around was beautiful.The deaths in this movie were also pretty surprising but I agree with some of the more balanced reviews in here that it was bizarre that so much time was spent on Hawkeye - does anyone really care about him? Not really.Killing off Black Widow was a surprising touch, but I, like many others, probably felt more of an emotional reaction to Banner's relationship with her and not Hawkeye (because no one actually cares about him). Renner, as an actor, just doesn't cut it much, unfortunately.The real problem with End Game, is the t