## Install Dependencies

In [1]:
# Install required packages
!pip install openai pandas numpy faiss-cpu python-dotenv -q

print("All dependencies installed successfully!")

All dependencies installed successfully!



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Set OpenAI API Key

**Important**: Replace `'api-key'` with actual OpenAI API key.

In [None]:
import os

# Set OpenAI API key here
os.environ["OPENAI_API_KEY"] = "api-key"
print("OpenAI API key set successfully!")


OpenAI API key set successfully!


## Import Libraries

Import all necessary libraries for the RAG system.

In [7]:
import json
import pandas as pd
import numpy as np
from typing import List, Dict, Any
import faiss
from openai import OpenAI

print("All libraries imported successfully!")

All libraries imported successfully!


## Define the MovieRAG Class

This is the core implementation of our RAG system. The `MovieRAG` class handles:
- Data loading and preprocessing
- Text chunking
- Embedding generation
- Vector store creation
- Semantic retrieval
- Answer generation with reasoning

In [22]:
class MovieRAG:
    """This RAG system for querying movie plot information."""
    
    def __init__(self, openai_api_key: str = None):
        """Initialize the RAG system with OpenAI client."""
        self.api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OpenAI API key is required. Set OPENAI_API_KEY environment variable.")
        
        self.client = OpenAI(api_key=self.api_key)
        self.chunks = []
        self.index = None
        self.embedding_dim = 1536  # OpenAI text-embedding-3-small dimension
        
    def load_and_preprocess_data(self, csv_path: str, num_rows: int = 500) -> pd.DataFrame:
        """Load and preprocess movie plot data."""
        print(f"Loading {num_rows} movies from dataset...")
        
        # Load subset of data
        df = pd.read_csv(csv_path, nrows=num_rows)
        
        # Keep only Title and Plot columns
        df = df[['Title', 'Plot', 'Release Year', 'Director']].copy()
        
        # Remove rows with missing plots
        df = df.dropna(subset=['Plot'])
        
        # Clean plot text
        df['Plot'] = df['Plot'].str.strip()
        
        print(f"Loaded {len(df)} movies with valid plots")
        return df
    
    def chunk_text(self, text: str, max_words: int = 300) -> List[str]:
        """Split text into chunks of approximately max_words."""
        words = text.split()
        chunks = []
        
        for i in range(0, len(words), max_words):
            chunk = ' '.join(words[i:i + max_words])
            chunks.append(chunk)
        
        return chunks
    
    def create_chunks(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
        """Create chunks from movie plots with metadata."""
        print("Creating text chunks...")
        chunks = []
        
        for idx, row in df.iterrows():
            title = row['Title']
            plot = row['Plot']
            year = row['Release Year']
            director = row['Director']
            
            # Chunk long plots
            plot_chunks = self.chunk_text(plot, max_words=300)
            
            for chunk_idx, chunk in enumerate(plot_chunks):
                chunks.append({
                    'title': title,
                    'year': year,
                    'director': director,
                    'text': chunk,
                    'chunk_id': f"{title}_{chunk_idx}",
                    'full_plot': plot
                })
        
        print(f"Created {len(chunks)} chunks from {len(df)} movies")
        return chunks
    
    def get_embedding(self, text: str) -> np.ndarray:
        """Get embedding for a text using OpenAI API."""
        response = self.client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return np.array(response.data[0].embedding)
    
    def build_vector_store(self, chunks: List[Dict[str, Any]]):
        """Build FAISS vector store from chunks."""
        print("Generating embeddings and building vector store...")
        self.chunks = chunks
        
        # Generate embeddings
        embeddings = []
        for i, chunk in enumerate(chunks):
            if i % 50 == 0:
                print(f"   Processing chunk {i}/{len(chunks)}...")
            embedding = self.get_embedding(chunk['text'])
            embeddings.append(embedding)
        
        # Convert to numpy array
        embeddings_array = np.array(embeddings).astype('float32')
        
        # Build FAISS index
        self.index = faiss.IndexFlatL2(self.embedding_dim)
        self.index.add(embeddings_array)
        
        print(f"Vector store built with {self.index.ntotal} vectors")
    
    def retrieve(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """Retrieve top-k relevant chunks for a query."""
        # Get query embedding
        query_embedding = self.get_embedding(query).reshape(1, -1).astype('float32')
        
        # Search in FAISS
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Get corresponding chunks
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            chunk = self.chunks[idx].copy()
            chunk['relevance_score'] = float(distance)
            results.append(chunk)
        
        return results
    
    def generate_answer(self, query: str, contexts: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate answer using LLM with retrieved contexts."""
        # Prepare context string
        context_str = "\n\n".join([
            f"Movie: {ctx['title']} ({ctx['year']})\nDirector: {ctx['director']}\nPlot: {ctx['text']}"
            for ctx in contexts
        ])
        
        # Create prompt
        prompt = f"""You are a movie expert assistant. Answer the user's question based on the provided movie plot information.

Context (Retrieved Movie Plots):
{context_str}

Question: {query}

Instructions:
1. Answer the question naturally and conversationally
2. Reference specific movies when relevant
3. If the context doesn't contain enough information, say so
4. Be concise but informative

Answer:"""
        
        # Call OpenAI
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful movie expert assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=500
        )
        
        answer = response.choices[0].message.content
        
        # Generate reasoning
        reasoning = self._generate_reasoning(query, contexts, answer)
        
        # Format output
        output = {
            "answer": answer,
            "contexts": [
                f"{ctx['title']} ({ctx['year']}): {ctx['text'][:200]}..."
                for ctx in contexts
            ],
            "reasoning": reasoning
        }
        
        return output
    
    def _generate_reasoning(self, query: str, contexts: List[Dict[str, Any]], answer: str) -> str:
        """Generate reasoning explanation."""
        movie_titles = [ctx['title'] for ctx in contexts]
        reasoning = (
            f"To answer the query about '{query}', I searched the movie plot database "
            f"and retrieved {len(contexts)} relevant plot chunks. "
            f"The most relevant movies found were: {', '.join(movie_titles[:3])}. "
            f"I used these plot summaries to construct a comprehensive answer."
        )
        return reasoning
    
    def query(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        """Main query method: retrieve + generate."""
        print(f"\n Query: {question}")
        print("=" * 80)
        
        # Retrieve relevant contexts
        contexts = self.retrieve(question, top_k=top_k)
        
        # Generate answer
        result = self.generate_answer(question, contexts)
        
        return result

print("MovieRAG class defined successfully!")

MovieRAG class defined successfully!


## Initialize the RAG System

Create an instance of the MovieRAG class and load the dataset.

In [9]:
# Initialize RAG system
print("Initializing Movie RAG System...\n")
rag = MovieRAG()

print("RAG system initialized!")

Initializing Movie RAG System...

RAG system initialized!


## Load and Preprocess Data

Load 500 movies from the Wikipedia Movie Plots dataset.

In [24]:
# Load data
csv_path = "wiki_movie_plots_deduped.csv"
df = rag.load_and_preprocess_data(csv_path, num_rows=500)

# Display sample
print("\n Sample of loaded data:")
print(df[['Title', 'Release Year', 'Director']].head(10))

Loading 500 movies from dataset...
âœ… Loaded 500 movies with valid plots

 Sample of loaded data:
                              Title  Release Year  \
0            Kansas Saloon Smashers          1901   
1     Love by the Light of the Moon          1901   
2           The Martyred Presidents          1901   
3  Terrible Teddy, the Grizzly King          1901   
4            Jack and the Beanstalk          1902   
5               Alice in Wonderland          1903   
6           The Great Train Robbery          1903   
7                   The Suburbanite          1904   
8          The Little Train Robbery          1905   
9        The Night Before Christmas          1905   

                             Director  
0                             Unknown  
1                             Unknown  
2                             Unknown  
3                             Unknown  
4  George S. Fleming, Edwin S. Porter  
5                      Cecil Hepworth  
6                     Edwin S. Porter

## Create Text Chunks

Split movie plots into chunks of approximately 300 words each.

In [13]:
# Create chunks
chunks = rag.create_chunks(df)

# Display sample chunk
print("\nSample chunk:")
print(f"Title: {chunks[0]['title']}")
print(f"Year: {chunks[0]['year']}")
print(f"Text: {chunks[0]['text'][:200]}...")

Creating text chunks...
Created 616 chunks from 500 movies

Sample chunk:
Title: Kansas Saloon Smashers
Year: 1901
Text: A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man...


## Build Vector Store

Generate embeddings for all chunks and build the FAISS index.

In [14]:
# Build vector store
rag.build_vector_store(chunks)

print("\n Vector store is ready for queries!")

Generating embeddings and building vector store...
   Processing chunk 0/616...
   Processing chunk 50/616...
   Processing chunk 100/616...
   Processing chunk 150/616...
   Processing chunk 200/616...
   Processing chunk 250/616...
   Processing chunk 300/616...
   Processing chunk 350/616...
   Processing chunk 400/616...
   Processing chunk 450/616...
   Processing chunk 500/616...
   Processing chunk 550/616...
   Processing chunk 600/616...
Vector store built with 616 vectors

 Vector store is ready for queries!


## Run Example Queries

Now let's test the system with example queries!

### Query 1: Artificial Intelligence

In [15]:
# Query about AI
result1 = rag.query("What movies feature artificial intelligence or robots?")

print("\n RESULT:")
print(json.dumps(result1, indent=2))


 Query: What movies feature artificial intelligence or robots?

 RESULT:
{
  "answer": "Based on the provided movie plots, none of them specifically feature artificial intelligence or robots. \"The Electric House\" does involve gadgets and electrical contraptions, but it does not delve into the realm of AI or robots in the way that might be typical in more modern films. If you're looking for movies that prominently feature artificial intelligence or robots, you might consider titles like \"Blade Runner,\" \"Ex Machina,\" or \"The Matrix.\" Let me know if you need information on those or any other films!",
  "contexts": [
    "The Electric House (1922): Keaton plays a botany student who is accidentally awarded an electrical engineering degree. He then attempts to wire a home using many gadgets. The man to whom the degree should have been awarded then...",
    "The Playhouse (1921): The film is set up as a series of humorous tricks on the audience, with constant doubling, and in which t

### Query 2: Space Exploration

In [16]:
# Query about space
result2 = rag.query("Are there any movies about space exploration?")

print("\n RESULT:")
print(json.dumps(result2, indent=2))


 Query: Are there any movies about space exploration?

 RESULT:
{
  "answer": "It looks like the provided movie plots don't include any films specifically about space exploration. The movies mentioned\u2014like \"The Balloonatic,\" \"Bumping Into Broadway,\" and \"Dangerous Hours\"\u2014focus on different themes such as comedy, romance, and political intrigue. If you're looking for films related to space exploration, you might want to check out classics like \"2001: A Space Odyssey\" or more recent titles like \"Interstellar\" and \"The Martian.\" Let me know if you need information on those or any other films!",
  "contexts": [
    "The Balloonatic (1923): A young man (Keaton) has a series of encounters in an amusement area, much like Coney Island, until happening upon a group of men preparing a hot air balloon for launch. The young man assists the grou...",
    "Bumping Into Broadway (1919): The film opens with a quick glimpse into the glamorous life of Broadway and the hubris often

### Query 3: Love Stories

In [17]:
# Query about romance
result3 = rag.query("Tell me about movies with love stories or romantic themes")

print("\n RESULT:")
print(json.dumps(result3, indent=2))


 Query: Tell me about movies with love stories or romantic themes

 RESULT:
{
  "answer": "There are several classic movies featuring love stories or romantic themes that capture the complexities of relationships. \n\nOne notable film is **\"Romance\" (1920)**, directed by Chester Withey. It tells the story of a priest who falls in love with an Italian opera singer. The film explores the tension between his spiritual aspirations and his romantic feelings, culminating in a poignant ending that emphasizes the emotional depth of love.\n\nAnother great example is **\"Bumping Into Broadway\" (1919)**, directed by Hal Roach. This film follows a struggling playwright and an aspiring actress who both face challenges in the glamorous yet tough world of Broadway. Their story is filled with humor and romance, as the playwright goes to great lengths to support the actress, leading to a sweet reunion at the end.\n\nLastly, **\"Amarilly of Clothes-Line Alley\" (1918)**, directed by Marshall Neilan,

### Query 4 is custom made without looking at dataset included data

In [18]:
# Query about horror
custom_query = "What horror or suspenseful movies are in the dataset?"

result = rag.query(custom_query)

print("\n RESULT:")
print(json.dumps(result, indent=2))


 Query: What horror or suspenseful movies are in the dataset?

 RESULT:
{
  "answer": "Based on the provided movie plots, the film that fits into the horror or suspenseful genre is **A Blind Bargain (1922)**. This movie involves a mad scientist, Dr. Lamb, who manipulates a young man, Robert, into agreeing to an experimental operation after he helps Robert's ailing mother. The atmosphere is filled with tension and fear, especially as Robert learns about the sinister nature of Dr. Lamb's experiments and the presence of strange prisoners, creating a suspenseful narrative.\n\nThe other movies mentioned, such as **The Other Side of the Door (1916)** and **Haunted Spooks (1920)**, do not seem to fit the horror or suspense categories as they focus more on drama and comedy, respectively. So, **A Blind Bargain** is the standout film for horror or suspense in this dataset.",
  "contexts": [
    "A Blind Bargain (1922): The film is a contemporary (1920s, though the book was published in 1897) pi

## Analyze Results

the structure of our output.

In [19]:
# Analyze a result
print("Output Structure Analysis:")
print("=" * 80)
print(f"\n1 ANSWER (Natural Language):")
print(f"   {result1['answer']}")
print(f"\n2 CONTEXTS (Retrieved Chunks): {len(result1['contexts'])} chunks")
for i, ctx in enumerate(result1['contexts'], 1):
    print(f"   Context {i}: {ctx[:100]}...")
print(f"\n3 REASONING (Explanation):")
print(f"   {result1['reasoning']}")

Output Structure Analysis:

1 ANSWER (Natural Language):
   Based on the provided movie plots, none of them specifically feature artificial intelligence or robots. "The Electric House" does involve gadgets and electrical contraptions, but it does not delve into the realm of AI or robots in the way that might be typical in more modern films. If you're looking for movies that prominently feature artificial intelligence or robots, you might consider titles like "Blade Runner," "Ex Machina," or "The Matrix." Let me know if you need information on those or any other films!

2 CONTEXTS (Retrieved Chunks): 3 chunks
   Context 1: The Electric House (1922): Keaton plays a botany student who is accidentally awarded an electrical e...
   Context 2: The Playhouse (1921): The film is set up as a series of humorous tricks on the audience, with consta...
   Context 3: The Haunted House (1921): Keaton plays a teller at a successful bank. Unbeknownst to him, the manage...

3 REASONING (Explanation):
  

## Inspect Retrieval Quality

How well the retrieval system is working.

In [20]:
# Test retrieval directly
test_query = "movies with time travel"
print(f" Testing retrieval for: '{test_query}'\n")

retrieved = rag.retrieve(test_query, top_k=5)

print(f"Retrieved {len(retrieved)} contexts:\n")
for i, chunk in enumerate(retrieved, 1):
    print(f"{i}. {chunk['title']} ({chunk['year']})")
    print(f"   Score: {chunk['relevance_score']:.4f}")
    print(f"   Text: {chunk['text'][:150]}...\n")

 Testing retrieval for: 'movies with time travel'

Retrieved 5 contexts:

1. The Ghost of Slumber Mountain (1918)
   Score: 1.3290
   Text: Most of the full plot is unknown. In the version available today, Holmes (Dawley) tells his nephews and children about an adventure he had in the wood...

2. Bound in Morocco (1918)
   Score: 1.3408
   Text: As described in a film magazine,[3] George Travelwell (Fairbanks), an American youth motoring in Morocco, discovers that the governor of El Harib (Cam...

3. Number, Please? (1920)
   Score: 1.3543
   Text: While at an amusement park, trying vainly to forget the girl he has lost, a young man (Lloyd) sees the girl (Mildred Davis) with her new boyfriend (Ro...

4. The Immigrant (1917)
   Score: 1.3728
   Text: The film begins aboard a steamer crossing the Atlantic Ocean, and initially showcases the misadventures of an unnamed immigrant, the Tramp (Chaplin) w...

5. A Blind Bargain (1922)
   Score: 1.3753
   Text: The film is a contemporary (1920s

## System Statistics

View statistics about the RAG system.

In [21]:
# System statistics
print(" RAG SYSTEM STATISTICS")
print("=" * 80)
print(f" Movies loaded: {len(df)}")
print(f" Total chunks: {len(rag.chunks)}")
print(f" Vectors in index: {rag.index.ntotal}")
print(f" Embedding dimension: {rag.embedding_dim}")
print(f" Default top-k: 3")
print(f" Average words per chunk: ~300")
print(f"\n Models Used:")
print(f"   Embeddings: text-embedding-3-small")
print(f"   Generation: gpt-4o-mini")
print(f"   Vector Store: FAISS (IndexFlatL2)")

 RAG SYSTEM STATISTICS
 Movies loaded: 500
 Total chunks: 616
 Vectors in index: 616
 Embedding dimension: 1536
 Default top-k: 3
 Average words per chunk: ~300

 Models Used:
   Embeddings: text-embedding-3-small
   Generation: gpt-4o-mini
   Vector Store: FAISS (IndexFlatL2)
