In [15]:
# !pip install pandas
# !pip install openai
import pandas as pd
movies_df = pd.read_csv('src/data/raw_data/movies_metadata.csv', low_memory=False)
movies_df

up_movie = movies_df[movies_df['title'] == 'Up']
if not up_movie.empty:
    print("Found 'Up' in the dataset:")
    print(up_movie[['title', 'vote_average', 'overview']])
else:
    print("Movie 'Up' not found in the filtered dataset")

Found 'Up' in the dataset:
      title  vote_average                                           overview
13724    Up           7.8  Carl Fredricksen spent his entire life dreamin...


In [13]:
# ... existing code ...

movies_df['vote_average'] = pd.to_numeric(movies_df['vote_average'], errors='coerce')
movies_df['vote_count'] = pd.to_numeric(movies_df['vote_count'], errors='coerce')

# Filter for movies with more than 50 votes, then get top 12 by rating
movies_df = movies_df[movies_df['vote_count'] > 100].nlargest(12, 'vote_average')
movies_df

# Save filtered movies to CSV
output_path = 'src/data/raw_data/movies_metadata_filtered.csv'
movies_df.to_csv(output_path, index=False)
print(f"Saved filtered movies to {output_path}")


Saved filtered movies to src/data/raw_data/movies_metadata_filtered.csv


In [45]:
num_records = 10
def prep_for_embeddings(name: str, description: str, keywords_list: str) -> str:
    # Get keywords for this movie if they exist
    try:
        keywords_str = ""
        if isinstance(keywords_list, str):
            keywords_list = eval(keywords_list)
    except:
        keywords_list = []
    if keywords_list != []:
        print(name, keywords_list)
        keyword_names = [k['name'] for k in keywords_list]
        if keyword_names:
            keywords_str = f" Keywords: {', '.join(keyword_names)}."
    else:
        pass  # Skip if keywords can't be parsed
            
    return f'Title: {name}. Description: {description}{keywords_str}'

documents = list(map(
    prep_for_embeddings, 
    movies_df["title"][:num_records], 
    movies_df["overview"][:num_records],
    keywords_df["keywords"][:num_records]
))

Toy Story [{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]
Jumanji [{'id': 10090, 'name': 'board game'}, {'id': 10941, 'name': 'disappearance'}, {'id': 15101, 'name': "based on children's book"}, {'id': 33467, 'name': 'new home'}, {'id': 158086, 'name': 'recluse'}, {'id': 158091, 'name': 'giant insect'}]
Grumpier Old Men [{'id': 1495, 'name': 'fishing'}, {'id': 12392, 'name': 'best friend'}, {'id': 179431, 'name': 'duringcreditsstinger'}, {'id': 208510, 'name': 'old men'}]
Waiting to Exhale [{'id': 818, 'name': 'based on novel'}, {'id': 10131, 'name': 'interracial relationship'}, {'id': 14768, 'name': 'single mother'}, {'id': 15160, 'name': 'divorce'}, {'id': 33455, 'name': 'chick flick'}]
Father of the Bride Part II [{'id':

In [46]:
for i in documents:
    print(i)

Title: Toy Story. Description: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. Keywords: jealousy, toy, boy, friendship, friends, rivalry, boy next door, new toy, toy comes to life.
Title: Jumanji. Description: When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures. Keywords: board game, disappearance, based on children's book, new home, recluse, giant insect.
Title: Grumpier Old Men. Description: A f

In [4]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

In [5]:
import numpy as np
from openai import OpenAI
from typing import List
import os

def get_embeddings_and_scores(queries: List[str], documents: List[str]) -> tuple:
    """
    Get embeddings using OpenAI's API and compute similarity scores.

    Args:
        queries: List of query strings
        documents: List of document strings

    Returns:
        tuple: (embeddings array, similarity scores matrix)
    """
    api_key = os.environ.get('OPENAI_API_KEY')

    client = OpenAI(api_key=api_key)

    # Get embeddings for all texts
    all_texts = queries + documents
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=all_texts,
        encoding_format="float"
    )

    # Extract embeddings from response
    embeddings = np.array([e.embedding for e in response.data])

    # Split into query and document embeddings
    query_embeddings = embeddings[:len(queries)]
    doc_embeddings = embeddings[len(queries):]

    # Compute similarity scores (same as your original computation)
    # Note: embeddings from OpenAI are already normalized
    scores = (query_embeddings @ doc_embeddings.T) * 100

    return embeddings, scores.tolist()

# Example usage:
task = 'Given a movie query, analyze the plot elements and themes to retrieve relevant movie names and descriptions that match the query'
searches = [
    "A children's animated movie about toys coming to life, perfect for family viewing",
    "An adventure movie featuring dangerous wild animals and a magical board game",
    "A comedy about elderly neighbors, fishing, and romance"
]
queries = list(map(get_detailed_instruct, [task]*3, searches))


embeddings, scores = get_embeddings_and_scores(queries, documents)
for row in scores:
    print(documents[np.argmax(row)])

Title: Toy Story. Description: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
Title: Jumanji. Description: When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.
Title: Grumpier Old Men. Description: A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the loc

In [None]:
# need to pass into claude 3.5 haiku to convert these into better prompts. Pay special attention to keywords
# filter by genre if genre detected
# add keywords to prep for embeddings function

# add perplexity like interpretability

searches2 = [
    "Animated fun movie",
    "An action comedy with animals",
    "A thriller about parental love and hockey"
]
queries = list(map(get_detailed_instruct, [task]*3, searches2))


embeddings, scores = get_embeddings_and_scores(queries, documents)
for row in scores:
    top_3_indices = np.argsort(row)[-3:][::-1]
    for idx in top_3_indices:
        print(idx, documents[idx])
    print("---")

124 Title: The Neverending Story III: Escape from Fantasia. Description: A young boy must restore order when a group of bullies steal the magical book that acts as a portal between Earth and the imaginary world of Fantasia.
28 Title: The City of Lost Children. Description: A scientist in a surrealist society kidnaps children to steal their dreams, hoping that they slow his aging process.
0 Title: Toy Story. Description: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
---
292 Title: Pulp Fiction. Description: A burger-loving hit man, his philosophical partner, a drug-addled gangster's moll and a washed-up boxer converge in this sprawling, comedic crime caper. Their adventures unfurl in three stories that ingeniously trip back and forth

In [None]:
# break into two functions: fetch document embeddings, or generate if they don't exist
# call separate api function first to improve prompt with more details
# filter by genre if necessary
# similarity search (improved prompt, documents) -> return top 3 movie objects
# update progress component on main page with each step
"""
import { OpenAI } from 'openai';
import { NextResponse } from 'next/server';
import { dot } from 'mathjs'; // You'll need to install this: npm install mathjs

const client = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY // Move API key to environment variable
});

function prepForEmbeddings(name: string, description: string): string {
  return `Title: ${name}. Description: ${description}`;
}

function getDetailedInstruct(taskDescription: string, query: string): string {
  return `Instruct: ${taskDescription}\nQuery: ${query}`;
}

export async function POST(req: Request) {
  try {
    const { search, documents } = await req.json();
    
    const task = 'Given a movie query, analyze the plot elements and themes to retrieve relevant movie names and descriptions that match the query';
    const query = getDetailedInstruct(task, search);

    // Get embeddings for query and documents
    const response = await client.embeddings.create({
      model: "text-embedding-ada-002",
      input: [query, ...documents],
      encoding_format: "float"
    });

    // Extract embeddings
    const embeddings = response.data.map(e => e.embedding);
    const queryEmbedding = embeddings[0];
    const docEmbeddings = embeddings.slice(1);

    // Calculate similarity scores
    const scores = docEmbeddings.map(docEmb => 
      dot(queryEmbedding, docEmb) * 100
    );

    // Get top 3 results
    const top3Indices = scores
      .map((score, idx) => ({ score, idx }))
      .sort((a, b) => b.score - a.score)
      .slice(0, 3)
      .map(item => item.idx);

    const results = top3Indices.map(idx => ({
      index: idx,
      document: documents[idx],
      score: scores[idx]
    }));

    return NextResponse.json({ results });
    
  } catch (error) {
    console.error('Error:', error);
    return NextResponse.json(
      { error: 'Failed to process request' },
      { status: 500 }
    );
  }
}
"""

In [44]:
"""
'use client';

import { useState } from 'react';

export default function MovieSearch() {
  const [results, setResults] = useState([]);
  const [search, setSearch] = useState('');

  const handleSearch = async () => {
    try {
      const response = await fetch('/api/movie-search', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          search,
          documents: // Your documents array here
        }),
      });

      const data = await response.json();
      setResults(data.results);
    } catch (error) {
      console.error('Error:', error);
    }
  };

  return (
    <div>
      <input
        type="text"
        value={search}
        onChange={(e) => setSearch(e.target.value)}
        placeholder="Search movies..."
      />
      <button onClick={handleSearch}>Search</button>
      
      <div>
        {results.map((result, idx) => (
          <div key={idx}>
            <p>{result.document}</p>
            <p>Score: {result.score.toFixed(2)}</p>
          </div>
        ))}
      </div>
    </div>
  );
}
"""

'\n\'use client\';\n\nimport { useState } from \'react\';\n\nexport default function MovieSearch() {\n  const [results, setResults] = useState([]);\n  const [search, setSearch] = useState(\'\');\n\n  const handleSearch = async () => {\n    try {\n      const response = await fetch(\'/api/movie-search\', {\n        method: \'POST\',\n        headers: {\n          \'Content-Type\': \'application/json\',\n        },\n        body: JSON.stringify({\n          search,\n          documents: // Your documents array here\n        }),\n      });\n\n      const data = await response.json();\n      setResults(data.results);\n    } catch (error) {\n      console.error(\'Error:\', error);\n    }\n  };\n\n  return (\n    <div>\n      <input\n        type="text"\n        value={search}\n        onChange={(e) => setSearch(e.target.value)}\n        placeholder="Search movies..."\n      />\n      <button onClick={handleSearch}>Search</button>\n      \n      <div>\n        {results.map((result, idx) => (