In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('Data/final_metadata.csv')

In [4]:
df.shape

(19951, 14)

In [5]:
df.drop(columns=['release_date','vote_average', 'vote_count'], inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19951 entries, 0 to 19950
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 19951 non-null  int64  
 1   title              19951 non-null  object 
 2   genres             19951 non-null  object 
 3   original_language  19951 non-null  object 
 4   overview           19951 non-null  object 
 5   popularity         19951 non-null  float64
 6   keywords           19951 non-null  object 
 7   year               19951 non-null  int64  
 8   cast               19951 non-null  object 
 9   director           19951 non-null  object 
 10  score              19951 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 1.7+ MB


In [7]:
# Build the per-movie "soup": title, genres, keywords, cast and director joined into one
# labelled string. The overview/synopsis is not part of the soup.
# Vectorised string concatenation is used instead of a row-wise apply; astype(str) mirrors
# the str() conversion an f-string would perform on each value.
df['soup'] = (
    "Title: " + df['title'].astype(str)
    + ". Genres: " + df['genres'].astype(str)
    + ". Keywords: " + df['keywords'].astype(str)
    + ". Cast: " + df['cast'].astype(str)
    + ". Director: " + df['director'].astype(str)
    + "."
)
# Explicit label-based lookup of the first soup (avoids chained indexing).
df.loc[0, 'soup']

'Title: Godzilla x Kong: The New Empire. Genres: Science Fiction Action Adventure. Keywords: giantmonster sequel dinosaur kaiju fantasyworld giantape godzilla kingkong mongkey. Cast: RebeccaHall BrianTyreeHenry DanStevens. Director: Adam Wingard.'

In [8]:
# pip install -U langchain
# pip install -U langchain-community

In [9]:
from langchain.docstore.document import Document

# Metadata field stored on each Document -> DataFrame column it is read from.
METADATA_COLUMNS = {
    "movie": "title",
    "language": "original_language",
    "popularity": "popularity",
    "year": "year",
    "synopsis": "overview",
    "score": "score",
}

# One Document per movie: the soup is the searchable text, everything else rides along as metadata.
movies = [
    Document(
        page_content=row['soup'],
        metadata={field: row[column] for field, column in METADATA_COLUMNS.items()},
    )
    for _, row in df.iterrows()
]

In [10]:
len(movies)

19951

In [11]:
import weaviate
from langchain.vectorstores import Weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore

In [12]:
import os

import weaviate
from weaviate.auth import AuthApiKey

# Credentials are read from the environment so they never end up in the saved notebook.
# A missing variable raises KeyError here instead of failing later with an empty URL/key.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=os.environ["WEAVIATE_URL"],                        # your Weaviate cluster URL
    auth_credentials=AuthApiKey(os.environ["WEAVIATE_API_KEY"]),   # your Weaviate API key
)

In [13]:
import torch
from langchain_huggingface import HuggingFaceEmbeddings

# specify embedding model (using huggingface sentence transformer)
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Use the GPU when one is available, otherwise fall back to CPU so the cell still runs.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
embeddings = HuggingFaceEmbeddings(
  model_name=embedding_model_name, 
  model_kwargs=model_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [14]:
df.columns

Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'keywords', 'year', 'cast', 'director', 'score', 'soup'],
      dtype='object')

In [15]:
# Wrap a Weaviate collection as a LangChain vector store, using `embeddings` as its embedding model.
# NOTE(review): no visible cell writes `movies` into this collection, so the hard-coded
# index name presumably refers to data ingested in an earlier run — confirm it still exists.
vector_db = WeaviateVectorStore(
    client = client,
    index_name = "LangChain_2adbdf1cd46e49c0ae3561cd1aa07cbe",
    text_key = "text",
    embedding = embeddings
)

In [16]:
# Title -> soup lookup, used to turn a movie title into the query text for similarity search.
# Titles are not guaranteed to be unique; for a repeated title `soups.get(title)` returns a
# Series of soups rather than a single string.
soups = pd.Series(df['soup'].values, index=df['title'])

In [17]:
soups.head()

title
Godzilla x Kong: The New Empire     Title: Godzilla x Kong: The New Empire. Genres...
Meg 2: The Trench                   Title: Meg 2: The Trench. Genres: Action Scien...
The Pope's Exorcist                 Title: The Pope's Exorcist. Genres: Horror Mys...
Transformers: Rise of the Beasts    Title: Transformers: Rise of the Beasts. Genre...
Dune: Part Two                      Title: Dune: Part Two. Genres: Science Fiction...
dtype: object

In [18]:
def get_recommendations(title):
    """
    Find up to ten movies most similar to the given title.

    The title's soup text is used as the query of a vector similarity search
    against ``vector_db``. Hits whose text equals the query (the movie itself)
    are discarded, wherever they appear in the ranking.

    :param title: Movie title; must be a label of the module-level ``soups`` Series.
    :return: Tuple ``(similarities, df_top_ten)``. ``similarities`` holds the
        movie/language/score/year columns sorted by score, then popularity
        (both descending); ``df_top_ten`` holds every metadata column in
        search-rank order. Both are empty frames when the search returns nothing.
    :raises ValueError: If ``title`` is not a label of ``soups``.
    """
    if title not in soups:
        raise ValueError(f"Title '{title}' not found in indices")

    query = soups.get(title)
    # A repeated title yields a Series of soups; query with the first one.
    if isinstance(query, pd.Series):
        query = query.iloc[0]

    # Ask for one extra hit because the query movie is normally among the results.
    results = vector_db.similarity_search(query, k=11)

    # Drop the query movie by matching its text instead of assuming it is ranked first.
    neighbours = [doc for doc in results if doc.page_content != query][:10]

    top_ten = [
        {
            'movie': doc.metadata['movie'],
            'language': doc.metadata['language'],
            'popularity': doc.metadata['popularity'],
            'score': round(doc.metadata['score'], 1),
            'synopsis': doc.metadata['synopsis'],
            'year': doc.metadata['year'],
        }
        for doc in neighbours
    ]

    # Explicit columns keep the frame well-formed even when there are no hits.
    df_top_ten = pd.DataFrame(
        top_ten,
        columns=['movie', 'language', 'popularity', 'score', 'synopsis', 'year'],
    )

    ranked = df_top_ten
    if not ranked.empty:
        ranked = ranked.sort_values(by=['score', 'popularity'], ascending=[False, False])
    similarities = ranked[['movie', 'language', 'score', 'year']]

    return similarities, df_top_ten

In [19]:
similarities, result_df = get_recommendations('The Dark Knight Rises')
similarities

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Unnamed: 0,movie,language,score,year
0,The Dark Knight,English,8.5,2008.0
2,Batman Begins,English,7.7,2005.0
5,The Batman,English,7.6,2022.0
4,"Batman: The Dark Knight Returns, Part 2",English,7.5,2013.0
3,"Batman: The Dark Knight Returns, Part 1",English,7.3,2012.0
8,Batman,English,7.2,1989.0
1,Batman: The Dark Knight Returns,English,6.7,2013.0
6,Batman: Gotham Knight,English,6.6,2008.0
9,Batman: Gotham by Gaslight,English,6.6,2018.0
7,Knights of Badassdom,English,6.2,2013.0


## Generate Summaries

In [20]:
# pip install transformers peft accelerate bitsandbytes safetensors sentencepiece streamlit

In [21]:
# pip install llama-index llama-index-llms-huggingface llama-index-embeddings-huggingface transformers accelerate bitsandbytes llama-index-readers-web matplotlib flash-attn

In [22]:
# fixing unicode error in google colab
import locale

locale.getpreferredencoding = lambda: "UTF-8"

# import dependencies
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

### Quantizing the LLM

In [23]:
# load a causal language model with 4-bit quantization
def load_quantized_model(model_name: str):
    """
    Load a causal language model with a 4-bit NF4 quantization config.

    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    # bitsandbytes settings: 4-bit NF4 weights, double quantization, bfloat16 compute.
    quantization_settings = dict(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=BitsAndBytesConfig(**quantization_settings),
        device_map="auto",
    )

# build the tokenizer that matches the model
def initialize_tokenizer(model_name: str):
    """
    Create the tokenizer for ``model_name``.

    :param model_name: Name or path of the model for tokenizer initialization.
    :return: Initialized tokenizer.
    """
    tok = AutoTokenizer.from_pretrained(
        model_name,
        return_token_type_ids=False,
    )
    # Pin the beginning-of-sentence token id to 1.
    tok.bos_token_id = 1
    return tok

In [24]:
# Hugging Face model name, used for both the tokenizer and the model
model_name = "microsoft/Phi-3-mini-4k-instruct"

# initialize tokenizer
tokenizer = initialize_tokenizer(model_name)
# load the 4-bit quantized model
model = load_quantized_model(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.81s/it]


### Setting up the LLM for inference using Langchain

In [25]:
from langchain.llms import HuggingFacePipeline
from langchain import HuggingFacePipeline

In [26]:

import transformers

# specify stop token ids
# NOTE(review): not referenced by any visible cell — confirm before removing.
stop_token_ids = [0]

tokenizer.model_max_length = 2048

# Build the Hugging Face text-generation pipeline around the loaded model (`model_name`).
# The factory is called through the module on purpose: the result is bound to the name
# `pipeline` (the following cell reads it), which shadows the `pipeline` function imported
# from transformers, so calling the bare name would fail as soon as this cell is re-run.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    truncation=True
)

In [27]:
# specify the llm
llm = HuggingFacePipeline(pipeline=pipeline)

  warn_deprecated(


In [28]:
result_df.columns

Index(['movie', 'language', 'popularity', 'score', 'synopsis', 'year'], dtype='object')

In [64]:
# Define a prompt template
prompt_template = """
Based on the provided information write a summary of the information. Do not repeat the question in the output.
Movie: {movie}
Language: {language}
Popularity: {popularity}
Weighted score: {score}
Plot Overview: {synopsis}
Year: {year}
Summary: 
"""

def get_summary(movie: str, language: str, popularity: float, score: float, synopsis: str, year: float) -> str:
    """
    Ask the module-level ``llm`` for a summary of one movie.

    :param movie: Movie title.
    :param language: Language of the movie.
    :param popularity: Popularity value inserted into the prompt.
    :param score: Weighted score inserted into the prompt.
    :param synopsis: Plot overview inserted into the prompt.
    :param year: Release year inserted into the prompt.
    :return: The text following the first ``Summary:`` marker in the LLM
        response (up to the next marker, if any), stripped; the whole stripped
        response when the marker is absent.
    """
    prompt = prompt_template.format(movie=movie,
                                    language=language,
                                    popularity=popularity,
                                    score=score,
                                    synopsis=synopsis,
                                    year=year
                                    )
    response = llm(prompt)
    # The response normally echoes the prompt, which ends with 'Summary:'. If the marker is
    # missing (e.g. the echo was dropped), return the response as-is instead of raising.
    pieces = response.split('Summary:')
    answer = pieces[1] if len(pieces) > 1 else response
    return answer.strip()

In [65]:
# Look fields up by column name so the call does not depend on result_df's column order.
for index, row in result_df.iterrows():
    summary = get_summary(
        row['movie'], row['language'], row['popularity'],
        row['score'], row['synopsis'], row['year'],
    )
    print(summary)

- Response: The Dark Knight, an English-language film released in 2008, is a popular superhero movie with a weighted score of 8.5. The plot revolves around Batman, aided by Lt. Jim Gordon and District Attorney Harvey Dent, as they combat crime in Gotham City. Their efforts lead to the dismantling of criminal organizations, but they become victims of chaos unleashed by the Joker, a notorious criminal mastermind. This thrilling story showcases Batman's determination to protect the city and its citizens from the Joker's reign of terror.
- Response: Batman: The Dark Knight Returns is an English-language movie released in 2013, featuring a popularity score of 15.758. The film revolves around the iconic character of Batman, who has not been active in crime-fighting for a decade. A new wave of criminals threatens Gotham City, prompting the aging Bruce Wayne to don his Batman persona once again. The movie explores whether the seasoned hero can adapt and continue his fight against crime in a ch

## Pipeline

In [140]:

class MovieRecommender:
    """
    Recommend movies similar to a given title and summarise them with an LLM.

    :param soups: Series of soup strings indexed by movie title.
    :param vector_db: Vector store exposing ``similarity_search(query, k)``.
    :param llm: Text generator: either an object with an ``invoke(prompt)``
        method or a plain callable mapping a prompt string to generated text.
    """

    def __init__(self, soups, vector_db, llm):
        self.soups = soups
        self.vector_db = vector_db
        self.llm = llm

    def get_recommendations(self, title):
        """
        Find up to ten movies most similar to the given title.

        Hits whose text equals the query soup (the movie itself) are discarded,
        wherever they appear in the ranking.

        :param title: Movie title; must be a label of ``self.soups``.
        :return: Tuple ``(similarities, df_top_ten)``, both sorted by score, then
            popularity (descending). ``similarities`` holds the
            movie/language/score/year columns, ``df_top_ten`` every metadata
            column. Both are empty frames when the search returns nothing.
        :raises ValueError: If ``title`` is not a label of ``self.soups``.
        """
        if title not in self.soups:
            raise ValueError(f"Title '{title}' not found in indices")

        query = self.soups.get(title)
        # A repeated title yields a Series of soups; query with the first one.
        if isinstance(query, pd.Series):
            query = query.iloc[0]

        # Ask for one extra hit because the query movie is normally among the results.
        results = self.vector_db.similarity_search(query, k=11)

        # Drop the query movie by matching its text instead of assuming it is ranked first.
        neighbours = [doc for doc in results if doc.page_content != query][:10]

        top_ten = [
            {
                'movie': doc.metadata['movie'],
                'language': doc.metadata['language'],
                'popularity': doc.metadata['popularity'],
                'score': round(doc.metadata['score'], 1),
                'synopsis': doc.metadata['synopsis'],
                'year': doc.metadata['year'],
            }
            for doc in neighbours
        ]

        # Explicit columns keep the frame well-formed even when there are no hits.
        df_top_ten = pd.DataFrame(
            top_ten,
            columns=['movie', 'language', 'popularity', 'score', 'synopsis', 'year'],
        )
        if not df_top_ten.empty:
            df_top_ten = df_top_ten.sort_values(by=['score', 'popularity'], ascending=[False, False])
        similarities = df_top_ten[['movie', 'language', 'score', 'year']]

        return similarities, df_top_ten

    def _get_summary(self, movie: str, language: str, popularity: float, score: float, synopsis: str, year: float) -> str:
        """
        Ask ``self.llm`` for a brief summary of one movie.

        :param movie: Movie title.
        :param language: Language of the movie.
        :param popularity: Popularity value inserted into the prompt.
        :param score: Weighted score inserted into the prompt.
        :param synopsis: Plot overview inserted into the prompt.
        :param year: Release year inserted into the prompt.
        :return: The generated summary with the prompt echo and any leading
            ``Explanation:`` label removed.
        """
        # Define a prompt template
        prompt_template = """
        Write a brief summary based on the provided information. Do not repeat the question in the output.
        Movie: {movie}
        Language: {language}
        Popularity: {popularity}
        Weighted score: {score}
        Plot Overview: {synopsis}
        Year: {year}
        Summary: 
        """

        prompt = prompt_template.format(movie=movie,
                                        language=language,
                                        popularity=popularity,
                                        score=score,
                                        synopsis=synopsis,
                                        year=year
                                        )

        # Use the instance's LLM (not a module-level global). Prefer `invoke` when the
        # object offers it; fall back to calling the object directly otherwise.
        invoke = getattr(self.llm, "invoke", None)
        response = invoke(prompt) if callable(invoke) else self.llm(prompt)

        # Keep the text that follows the prompt's 'Summary:' marker, when present.
        if 'Summary:' in response:
            response = response.split('Summary:')[1].strip()

        # Keep only what follows the last "Explanation:" label, if the model emitted one.
        response = response.split("Explanation:")[-1].strip()

        return response

    def generate_summaries(self, df_top_ten):
        """
        Generate summaries for each row in the provided DataFrame.

        :param df_top_ten: Frame with movie, language, popularity, score,
            synopsis and year columns.
        :return: List of summary strings, one per row, in row order.
        """
        return [
            self._get_summary(
                row['movie'], row['language'], row['popularity'],
                row['score'], row['synopsis'], row['year'],
            )
            for _, row in df_top_ten.iterrows()
        ]

In [141]:
recommender = MovieRecommender(soups, vector_db, llm)

In [142]:
title = "Interstellar"

In [143]:
similarities, df_top_ten = recommender.get_recommendations(title)
similarities

Unnamed: 0,movie,language,score,year
9,2001: A Space Odyssey,English,8.0,1968.0
1,The Martian,English,7.7,2015.0
5,Arrival,English,7.5,2016.0
8,Passengers,English,6.9,2016.0
0,Interstellar: Nolan's Odyssey,English,6.7,2014.0
4,Star Trek: The Motion Picture,English,6.5,1979.0
6,Millennium,English,6.3,1989.0
3,Capsule,English,6.3,2015.0
7,Approaching the Unknown,English,6.1,2016.0
2,Lost in Space,English,5.8,1998.0


In [144]:
summaries = recommender.generate_summaries(df_top_ten)
summaries

["2001: A Space Odyssey, released in 1968, is an English-language science fiction film that achieved a popularity score of 42.072. With a weighted score of 8.0, the movie follows humanity's quest to uncover the origins of a mysterious object discovered beneath the lunar surface. Assisted by HAL 9000, the most advanced supercomputer of its time, the story explores themes of human evolution, artificial intelligence, and the unknown depths of space.",
 'The Martian, an English-language film released in 2015, is a popular science fiction movie with a weighted score of 7.7. The story revolves around astronaut Mark Watney, who is presumed dead after a severe storm during a manned mission to Mars. However, Watney miraculously survives and becomes stranded on the inhospitable planet. With limited resources, he must rely on his ingenuity, wit, and spirit to survive and find a way to communicate with Earth that he is alive.',
 'Arrival, an English-language film released in 2016, is a science fic