In [1]:
import sys
import os
# Put the parent of the current working directory on the import path so that
# project-local packages (presumably where `utils` lives) can be imported.
# The membership check keeps re-runs of this cell from appending duplicates.
_project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
import pandas as pd
import numpy as np
from utils.constants import WEAVIATE_URL, WEAVIATE_API_KEY, WEAVIATE_INDEX_NAME

In [4]:
# Load the movie metadata table that every later cell works from.
METADATA_CSV_PATH = '../Data/final_metadata.csv'
df = pd.read_csv(METADATA_CSV_PATH)

In [5]:
# Build one descriptive "soup" string per movie from its title, genres,
# keywords, cast and director. These strings are later used as the query text
# for the vector-store similarity search.
def _make_soup(row):
    """Format one metadata row into its labelled, period-separated soup string."""
    return (
        f"Title: {row['title']}. Genres: {row['genres']}. "
        f"Keywords: {row['keywords']}. Cast: {row['cast']}. "
        f"Director: {row['director']}."
    )


df['soup'] = df.apply(_make_soup, axis=1)
# Show the soup of the row labelled 0 as a sanity check.
df.loc[0, 'soup']

'Title: Godzilla x Kong: The New Empire. Genres: Science Fiction Action Adventure. Keywords: giantmonster sequel dinosaur kaiju fantasyworld giantape godzilla kingkong mongkey. Cast: RebeccaHall BrianTyreeHenry DanStevens. Director: Adam Wingard.'

In [9]:
import weaviate
from langchain.vectorstores import Weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore

## Connecting to Weaviate Cloud


In [10]:
import weaviate
from weaviate.auth import AuthApiKey

# Open a connection to the Weaviate Cloud cluster, authenticating with the API
# key imported from utils.constants.
weaviate_credentials = AuthApiKey(WEAVIATE_API_KEY)
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=weaviate_credentials,
)

In [12]:
from langchain_huggingface import HuggingFaceEmbeddings

# Imported here because this cell runs before the notebook's later
# `import torch` cell on a fresh Restart & Run All.
import torch

# specify embedding model (using huggingface sentence transformer)
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
model_kwargs = {"device": embedding_device}

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs,
)

  from tqdm.autonotebook import tqdm, trange


In [13]:
# Wrap the open Weaviate client and the embedding model in a LangChain vector
# store bound to the configured index; this is the object queried for
# similarity searches below.
vector_db = WeaviateVectorStore(
    client=client,
    index_name=WEAVIATE_INDEX_NAME,
    text_key="text",
    embedding=embeddings,
)

In [14]:
# Title -> soup lookup: the soup strings re-indexed by movie title.
soups = pd.Series(data=df['soup'].values, index=df['title'])

In [15]:
soups.head()

title
Godzilla x Kong: The New Empire     Title: Godzilla x Kong: The New Empire. Genres...
Meg 2: The Trench                   Title: Meg 2: The Trench. Genres: Action Scien...
The Pope's Exorcist                 Title: The Pope's Exorcist. Genres: Horror Mys...
Transformers: Rise of the Beasts    Title: Transformers: Rise of the Beasts. Genre...
Dune: Part Two                      Title: Dune: Part Two. Genres: Science Fiction...
dtype: object

In [16]:
def get_recommendations(title, top_k=10):
    """Return the movies most similar to ``title`` according to the vector store.

    The soup text stored for ``title`` in the module-level ``soups`` Series is
    used as the query for ``vector_db.similarity_search``.

    Args:
        title: Movie title; must be a label in the index of ``soups``.
        top_k: Maximum number of recommendations to return (default 10).

    Returns:
        A ``(similarities, df_top_ten)`` tuple of DataFrames:
        ``similarities`` holds the columns movie/language/score/year sorted by
        score and then popularity, both descending; ``df_top_ten`` holds every
        metadata column in the order the search returned the hits.

    Raises:
        ValueError: If ``title`` is not in ``soups``.
    """
    if title not in soups:
        raise ValueError(f"Title '{title}' not found in indices")

    query = soups.get(title)
    # A title that occurs more than once in the index makes `.get` return a
    # Series of soups; use the first one so the search receives a single string.
    if isinstance(query, pd.Series):
        query = query.iloc[0]

    # Request one extra hit because the query movie itself is expected to be
    # among the results and is removed below.
    hits = vector_db.similarity_search(query, k=top_k + 1)

    # Drop the hit that is the query movie. If no hit carries that exact title,
    # fall back to dropping the first hit (assumed to be the query's own entry).
    self_pos = next(
        (pos for pos, hit in enumerate(hits) if hit.metadata['movie'] == title),
        0,
    )
    neighbours = (hits[:self_pos] + hits[self_pos + 1:])[:top_k]

    columns = ['movie', 'language', 'popularity', 'score', 'synopsis', 'year', 'poster_path']
    records = []
    for hit in neighbours:
        record = {key: hit.metadata[key] for key in columns}
        record['score'] = round(record['score'], 1)
        records.append(record)

    # Passing `columns` keeps the frame well-formed (and sortable) even when
    # the search returned nothing.
    df_top_ten = pd.DataFrame(records, columns=columns)
    similarities = df_top_ten.sort_values(
        by=['score', 'popularity'], ascending=[False, False]
    )[['movie', 'language', 'score', 'year']]

    return similarities, df_top_ten

In [17]:
# Try the standalone helper on one title; the trailing expression displays the
# view sorted by score and popularity.
similarities, result_df = get_recommendations('The Dark Knight Rises')
similarities

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Unnamed: 0,movie,language,score,year
0,The Dark Knight,English,8.5,2008.0
2,Batman Begins,English,7.7,2005.0
5,The Batman,English,7.6,2022.0
4,"Batman: The Dark Knight Returns, Part 2",English,7.5,2013.0
3,"Batman: The Dark Knight Returns, Part 1",English,7.3,2012.0
8,Batman,English,7.2,1989.0
1,Batman: The Dark Knight Returns,English,6.7,2013.0
6,Batman: Gotham Knight,English,6.6,2008.0
9,Batman: Gotham by Gaslight,English,6.6,2018.0
7,Knights of Badassdom,English,6.2,2013.0


In [18]:
result_df

Unnamed: 0,movie,language,popularity,score,synopsis,year,poster_path
0,The Dark Knight,English,126.226,8.5,Batman raises the stakes in his war on crime. ...,2008.0,/qJ2tW6WMUDux911r6m7haRef0WH.jpg
1,Batman: The Dark Knight Returns,English,15.758,6.7,Batman has not been seen for ten years. A new ...,2013.0,/9uydeANO6PLpD6Irh7qLizaEMjk.jpg
2,Batman Begins,English,93.893,7.7,Driven by tragedy billionaire Bruce Wayne dedi...,2005.0,/4MpN4kIEqUjW8OPtOQJXlTdHiJV.jpg
3,"Batman: The Dark Knight Returns, Part 1",English,32.236,7.3,Batman has not been seen for ten years. A new ...,2012.0,/kkjTbwV1Xnj8wBL52PjOcXzTbnb.jpg
4,"Batman: The Dark Knight Returns, Part 2",English,29.118,7.5,Batman has stopped the reign of terror that Th...,2013.0,/arEZYd6uMOFTILne9Ux0A8qctMe.jpg
5,The Batman,English,165.673,7.6,In his second year of fighting crime Batman un...,2022.0,/74xTEgt7R36Fpooo50r9T25onhq.jpg
6,Batman: Gotham Knight,English,26.504,6.6,A chronicle of Bruce Wayne's establishment and...,2008.0,/3i1o0sHBP0VUpuSVmkdCRKYoDBC.jpg
7,Knights of Badassdom,English,7.841,6.2,Three best friends and dedicated roleplayers t...,2013.0,/7TP8E4nTqT0SFkwlj2kYRfMRwfH.jpg
8,Batman,English,56.752,7.2,Batman must face his most ruthless nemesis whe...,1989.0,/cij4dd21v2Rk2YtUQbV5kW69WB2.jpg
9,Batman: Gotham by Gaslight,English,17.596,6.6,In an alternative Victorian Age Gotham City Ba...,2018.0,/7souLi5zqQCnpZVghaXv0Wowi0y.jpg


## Generate Summaries

In [19]:
# fixing unicode error in google colab
import locale

# Force the reported preferred encoding to UTF-8. The replacement keeps the
# stdlib signature (`do_setlocale=True`) so callers that pass the flag, e.g.
# `locale.getpreferredencoding(False)`, do not hit a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

# import dependencies
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)

### Setting up the LLM for inference using LangChain

In [20]:
from langchain.llms import HuggingFacePipeline
from langchain import HuggingFacePipeline

In [22]:
# Load the causal language model and its tokenizer from the local
# saved-model directory.
QUANTIZED_MODEL_DIR = "../saved-model/quantized_model"
QUANTIZED_TOKENIZER_DIR = "../saved-model/quantized_tokenizer"

model = AutoModelForCausalLM.from_pretrained(QUANTIZED_MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(QUANTIZED_TOKENIZER_DIR)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
import transformers

# specify stop token ids
# NOTE(review): not referenced by any visible cell — confirm before removing.
stop_token_ids = [0]

tokenizer.model_max_length = 2048

# Build the Hugging Face text-generation pipeline around the loaded model.
# The factory is called through the `transformers` module on purpose: this cell
# rebinds the name `pipeline` to the built object (the next cell reads it), so
# calling the bare imported `pipeline` would fail when the cell is re-run.
# NOTE(review): the original comment named zephyr-7b-alpha; the model is loaded
# from a local directory, so confirm which model this actually is.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=1,  # sampling restricted to the single most likely token
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,  # EOS token reused as the padding id
    truncation=True
)

In [24]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline; the
# resulting `llm` object is what the recommender uses to generate summaries.
llm = HuggingFacePipeline(pipeline=pipeline)

  warn_deprecated(


## Pipeline

In [25]:

class MovieRecommender:
    """Recommend movies similar to a given title and summarise them with an LLM.

    The collaborators are stored exactly as passed in:

    * ``soups``: title-indexed pandas Series of descriptive text; the entry for
      a title is used as the similarity-search query.
    * ``vector_db``: object exposing ``similarity_search(query, k=...)`` whose
      hits carry a ``metadata`` mapping with the keys in ``_METADATA_KEYS``.
    * ``llm``: text generator; either exposes ``invoke(prompt)`` or is directly
      callable with the prompt, and returns the generated text.
    """

    # Metadata fields copied from every search hit into the result frame.
    _METADATA_KEYS = ('movie', 'language', 'popularity', 'score', 'synopsis', 'year', 'poster_path')

    def __init__(self, soups, vector_db, llm):
        self.soups = soups
        self.vector_db = vector_db
        self.llm = llm

    def get_recommendations(self, title, top_k=10):
        """Return the movies most similar to ``title``.

        Args:
            title: Movie title; must be a label in the index of ``self.soups``.
            top_k: Maximum number of recommendations to return (default 10).

        Returns:
            A ``(similarities, df_top_ten)`` tuple. ``df_top_ten`` holds all
            metadata columns sorted by score and then popularity (both
            descending); ``similarities`` is its movie/language/score/year view.

        Raises:
            ValueError: If ``title`` is not in ``self.soups``.
        """
        if title not in self.soups:
            raise ValueError(f"Title '{title}' not found in indices")

        query = self.soups.get(title)
        # A title that occurs more than once in the index makes `.get` return a
        # Series; use the first soup so the search receives a single string.
        if isinstance(query, pd.Series):
            query = query.iloc[0]

        # Request one extra hit because the query movie itself is expected to
        # be among the results and is removed below.
        hits = self.vector_db.similarity_search(query, k=top_k + 1)

        # Drop the hit that is the query movie. If no hit carries that exact
        # title, fall back to dropping the first hit (assumed to be the query's
        # own entry).
        self_pos = next(
            (pos for pos, hit in enumerate(hits) if hit.metadata['movie'] == title),
            0,
        )
        neighbours = (hits[:self_pos] + hits[self_pos + 1:])[:top_k]

        records = []
        for hit in neighbours:
            record = {key: hit.metadata[key] for key in self._METADATA_KEYS}
            record['score'] = round(record['score'], 1)
            records.append(record)

        # Passing `columns` keeps the frame sortable even with zero hits.
        df_top_ten = pd.DataFrame(records, columns=list(self._METADATA_KEYS))
        df_top_ten = df_top_ten.sort_values(
            by=['score', 'popularity'], ascending=[False, False]
        )
        similarities = df_top_ten[['movie', 'language', 'score', 'year']]

        return similarities, df_top_ten

    def _get_summary(self, movie: str, language: str, score: str, synopsis: str, year: str):
        """Ask the LLM for a short summary of one movie and return the cleaned text.

        The arguments are only interpolated into the prompt, so any value that
        formats sensibly is accepted.
        """
        # NOTE(review): "providedinformation" is missing a space; the prompt is
        # left untouched so summaries stay comparable with earlier runs.
        prompt_template = """
        Write a brief summary based on the providedinformation. Do not repeat the question in the output.
        Movie: {movie}
        Language: {language}
        Weighted score: {score}
        Plot Overview: {synopsis}
        Year: {year}
        Summary: 
        """

        prompt = prompt_template.format(
            movie=movie,
            language=language,
            score=score,
            synopsis=synopsis,
            year=year,
        )

        # Use the instance's LLM, not a module-level global. Prefer `invoke`
        # when the object provides it, otherwise call the object directly.
        invoke = getattr(self.llm, 'invoke', None)
        response = invoke(prompt) if callable(invoke) else self.llm(prompt)

        # Keep the text that follows the first "Summary:" marker (this discards
        # an echoed prompt, if any).
        if 'Summary:' in response:
            response = response.split('Summary:')[1].strip()

        # If the model added an "Explanation:" section, keep only what follows
        # the last such marker.
        response = response.split("Explanation:")[-1].strip()

        return response

    def generate_summaries(self, df_top_ten):
        """
        Generate summaries for each row in the provided DataFrame.

        Rows are processed in frame order; each row must provide the columns
        movie, language, score, synopsis and year. Returns a list of strings.
        """
        return [
            self._get_summary(
                row['movie'], row['language'], row['score'], row['synopsis'], row['year']
            )
            for _, row in df_top_ten.iterrows()
        ]

In [26]:
# Wire the title->soup lookup, the vector store and the LLM into one recommender.
recommender = MovieRecommender(soups, vector_db, llm)

In [27]:
# Title to look up; it must be present in the index of `soups`.
title = "Interstellar"

In [28]:
# Fetch the recommendations for the chosen title and display the compact
# movie/language/score/year view.
similarities, df_top_ten = recommender.get_recommendations(title)
similarities

Unnamed: 0,movie,language,score,year
9,2001: A Space Odyssey,English,8.0,1968.0
1,The Martian,English,7.7,2015.0
5,Arrival,English,7.5,2016.0
8,Passengers,English,6.9,2016.0
0,Interstellar: Nolan's Odyssey,English,6.7,2014.0
4,Star Trek: The Motion Picture,English,6.5,1979.0
6,Millennium,English,6.3,1989.0
3,Capsule,English,6.3,2015.0
7,Approaching the Unknown,English,6.1,2016.0
2,Lost in Space,English,5.8,1998.0


In [29]:
df_top_ten

Unnamed: 0,movie,language,popularity,score,synopsis,year,poster_path
9,2001: A Space Odyssey,English,42.072,8.0,Humanity finds a mysterious object buried bene...,1968.0,/ve72VxNqjGM69Uky4WTo2bK6rfq.jpg
1,The Martian,English,48.685,7.7,During a manned mission to Mars Astronaut Mark...,2015.0,/5BHuvQ6p9kfc091Z8RiFNhCwL4b.jpg
5,Arrival,English,54.631,7.5,Taking place after alien crafts land around th...,2016.0,/x2FJsf1ElAgr63Y3PNPtJrcmpoe.jpg
8,Passengers,English,29.563,6.9,A spacecraft traveling to a distant colony pla...,2016.0,/jK9S6HANSf2no64v1x1HxfcpmcA.jpg
0,Interstellar: Nolan's Odyssey,English,9.453,6.7,A look behind the lens of Christopher Nolan's ...,2014.0,/i4PpBcuLvdcJwIf3hkcV9QDR1iH.jpg
4,Star Trek: The Motion Picture,English,22.086,6.5,When a destructive space entity is spotted app...,1979.0,/wfiAfNwH6CMKxz4vRaW8CPTabtk.jpg
6,Millennium,English,9.296,6.3,An investigator seeking the cause of an airlin...,1989.0,/mFGCK7RnFnRXR6CweAhQw030Zv7.jpg
3,Capsule,English,6.265,6.3,Guy is an experienced British fighter pilot wh...,2015.0,/9X01Tq2U9mFCdJSdkUkRKnbJQcV.jpg
7,Approaching the Unknown,English,8.039,6.1,Captain William Stanaforth is on a one-way sol...,2016.0,/zubKPqGvXog7LWMyVVZChCo4E8L.jpg
2,Lost in Space,English,19.636,5.8,The prospects for continuing life on Earth in ...,1998.0,/4miEpZmUOMqV8P0T6oq5HVBiVHw.jpg


In [30]:
# Generate one LLM summary per recommended movie (one LLM call per row) and
# display the resulting list.
summaries = recommender.generate_summaries(df_top_ten)
summaries

  warn_deprecated(
You are not running the flash-attention implementation, expect numerical differences.


["2001: A Space Odyssey, released in 1968, is an English-language science fiction film that explores humanity's quest to uncover the origins of a mysterious object discovered beneath the lunar surface. With a weighted score of 8.0, the movie features HAL 9000, the world's most advanced supercomputer, as a key element in the story. The film delves into themes of artificial intelligence, human evolution, and the unknown depths of space.",
 "The Martian, a 2015 English-language film, tells the story of astronaut Mark Watney, who is presumed dead after a severe storm during a manned mission to Mars. However, Watney miraculously survives and becomes stranded on the inhospitable planet. With limited resources, he must rely on his ingenuity, wit, and spirit to survive and find a way to communicate with Earth that he is alive. The movie, with a weighted score of 7.7, showcases Watney's resilience and determination in the face of adversity.",
 'Arrival, a 2016 English-language film, features an