In [27]:
import requests
import urllib.parse
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
api_key="8265bd1679663a7ea12ac168da84d2e8"

In [3]:
def get_movie_id(movie_name):
    url = "https://api.themoviedb.org/3/search/movie?api_key=" + api_key + "&query=" + urllib.parse.quote(movie_name)
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        movies = [m for m in data["results"] if m["title"].lower() == movie_name.lower()]
        movies.sort(key=lambda x: x["popularity"], reverse=True)
        return movies[0]["id"]
    else:
        return None

In [4]:
def get_movie_data(movie_name=None, movie_id=None):
    if movie_id:  # Fetch by movie ID
        movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
    elif movie_name:  # Fetch by movie name
        search_url = f"https://api.themoviedb.org/3/search/movie?api_key={api_key}&query={urllib.parse.quote(movie_name)}"
        search_response = requests.get(search_url)
        if search_response.status_code == 200:
            movies = [m for m in search_response.json()['results'] if m['title'].lower() == movie_name.lower()]
            if not movies:
                return None
            movies.sort(key=lambda x: x['popularity'], reverse=True)
            movie_url = f"https://api.themoviedb.org/3/movie/{movies[0]['id']}?api_key={api_key}"
        else:
            return None
    else:
        return None  # If neither movie_name nor movie_id is provided

    response = requests.get(movie_url)
    return response.json() if response.status_code == 200 else None


In [5]:
def get_movie_genre(movie_id, genre_id = False, session=None):
    url = "https://api.themoviedb.org/3/movie/" + str(movie_id) + "?api_key=" + api_key
    req = session if session else requests
    response = req.get(url)
    if response.status_code == 200:
        data = response.json()["genres"]
        if genre_id:
            gerere_id_list = [genre["id"] for genre in data]
            return gerere_id_list
        else:
            gerere_name_list = [genre["name"] for genre in data]
            return gerere_name_list
    else:
        return None

In [6]:
def genre_id_to_name(genre_id):
    url = "https://api.themoviedb.org/3/genre/movie/list?api_key=" + api_key
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()["genres"]
        gerere_name_list = [genre["name"] for genre in data if genre["id"] in genre_id]
        return gerere_name_list
    else:
        return None

In [7]:
def get_movie_keywords(movie_id):
    url = "https://api.themoviedb.org/3/movie/" + str(movie_id) + "/keywords?api_key=" + api_key
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()["keywords"]
        keywords = [keyword["name"] for keyword in data]
        return keywords
    else:
        return None

In [8]:
def get_movie_credits(movie_id, session=None):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?api_key={api_key}"
    req = session if session else requests
    response = req.get(url)
    
    if response.status_code == 200:
        data = response.json()
        directors = [crew['id'] for crew in data['crew'] if crew['job'] == 'Director'][:3]
        writers = [crew['id'] for crew in data['crew'] if crew['job'] in ['Writer', 'Screenplay']][:3]
        actors = [actor['id'] for actor in data['cast'][:5]]

        return {
            "Directors": directors,
            "Writers": writers,
            "Actors": actors
        }
    
    return None

In [9]:
def id_to_name(person_id):
    url = f"https://api.themoviedb.org/3/person/{person_id}?api_key={api_key}"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        return data['name']
    
    return None

In [10]:
def find_related_movies(movie_id, session=None):
    req = session if session else requests
    movie_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
    recommendations_url = f"https://api.themoviedb.org/3/movie/{movie_id}/recommendations?api_key={api_key}&sort_by=popularity.desc"
    movie_response = req.get(movie_url)
    recommendations_response = req.get(recommendations_url)

    if movie_response.status_code == 200:
        movie_data = movie_response.json()
        collection = movie_data.get('belongs_to_collection')

        if collection:  # If movie belongs to a collection
            collection_id = collection['id']
            collection_url = f"https://api.themoviedb.org/3/collection/{collection_id}?api_key={api_key}"
            collection_response = requests.get(collection_url)
            if collection_response.status_code == 200:
                collection_movies = [m for m in collection_response.json().get('parts', [])]
                collection_movies = sorted(collection_movies, key=lambda x: x['popularity'], reverse=True)
                collection_movies = [cm['id'] for cm in collection_movies if cm['id'] != movie_id][:5]
            else:
                collection_movies = []
        else:
            collection_movies = []

    if recommendations_response.status_code == 200:
        recommendations = [m['id'] for m in recommendations_response.json().get('results', [])][:5]
    else:
        recommendations = []
    
    related_movies = list(set(collection_movies + recommendations))
    return related_movies

In [11]:
def search_movies(actor_id=None, director_id=None, writer_id=None, genre_id=None, limit=15):
    url = "https://api.themoviedb.org/3/discover/movie?api_key=" + api_key
    params = {}
    
    if actor_id:
        params['with_cast'] = actor_id
    if director_id or writer_id:
        params['with_crew'] = ','.join(filter(None, [str(director_id), str(writer_id)]))
    if genre_id:
        params['with_genres'] = genre_id
    params['sort_by'] = 'popularity.desc'
    
    response = requests.get(url, params=params)

    if response.status_code == 200:
        return [movie['id'] for movie in response.json().get("results", [])[:limit]]
    return None
    
    

In [12]:
# def find_similar_movies(movie_ids):
#     similar_movies = set()
#     genres = set()
#     directors = set()
#     writers = set()
#     actors = set()
#     for movie_id in movie_ids:
#         genres.update(get_movie_genre(movie_id, genre_id=True))
#         movies = find_related_movies(movie_id)
#         similar_movies.update(movies)
#         credits = get_movie_credits(movie_id)
#         if credits:
#             directors.update(credits["Directors"])
#             writers.update(credits["Writers"])
#             actors.update(credits["Actors"])
#     for genre in genres:
#         similar_movies.update(search_movies(genre_id=genre)[:10])
#     for director in directors:
#         similar_movies.update(search_movies(director_id=director)[:5])
#     for writer in writers:
#         similar_movies.update(search_movies(writer_id=writer)[:5])
#     for actor in actors:
#         similar_movies.update(search_movies(actor_id=actor)[:5])
    
#     for movie_id in similar_movies.copy():
#         if movie_id in movie_ids:
#             similar_movies.remove(movie_id)
#     return list(similar_movies)
            
    


In [13]:
import concurrent.futures
import functools
import requests
import urllib.parse
from typing import List, Dict, Optional, Any
import time

# Add caching decorator
def cache_result(ttl_seconds=3600):
    cache = {}
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            key = str(args) + str(kwargs)
            current_time = time.time()
            if key in cache and current_time - cache[key]['timestamp'] < ttl_seconds:
                return cache[key]['result']
            result = func(*args, **kwargs)
            cache[key] = {'result': result, 'timestamp': current_time}
            return result
        return wrapper
    return decorator

# Example of parallelized find_similar_movies
def find_similar_movies(movie_ids: List[int]) -> List[int]:
    similar_movies = set()
    
    # Create a session for connection pooling
    session = requests.Session()
    
    # Function to get all data for a single movie
    @cache_result(ttl_seconds=86400)  # Cache for 24 hours
    def get_movie_all_data(movie_id: int) -> Dict:
        results = {}
        results['genres'] = get_movie_genre(movie_id, genre_id=True, session=session)
        results['related'] = find_related_movies(movie_id, session=session)
        results['credits'] = get_movie_credits(movie_id, session=session)
        return results
    
    # Parallel fetch data for all movies
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        all_data = list(executor.map(get_movie_all_data, movie_ids))
    
    # Process the collected data
    genres = set()
    directors = set()
    writers = set()
    actors = set()
    
    for data in all_data:
        genres.update(data['genres'] or [])
        similar_movies.update(data['related'] or [])
        if data['credits']:
            directors.update(data['credits'].get("Directors", []))
            writers.update(data['credits'].get("Writers", []))
            actors.update(data['credits'].get("Actors", []))
    
    # Parallel search for similar movies by different criteria
    search_tasks = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        for genre in genres:
            search_tasks.append(executor.submit(search_movies, genre_id=genre, limit=20))
        for director in directors:
            search_tasks.append(executor.submit(search_movies, director_id=director, limit=6))
        for writer in writers:
            search_tasks.append(executor.submit(search_movies, writer_id=writer, limit=6))
        for actor in actors:
            search_tasks.append(executor.submit(search_movies, actor_id=actor, limit=5))
    
    # Collect results
    for task in concurrent.futures.as_completed(search_tasks):
        result = task.result()
        if result:
            similar_movies.update(result)
    
    # Remove original movies
    similar_movies = similar_movies - set(movie_ids)
    
    return list(similar_movies)


In [28]:
def get_combined_keywords(movie_id: int, api_key: str=api_key) -> list:
    """Returns combined keywords from TMDB and Wikidata with optimizations"""
    # Fetch TMDB data with combined request
    tmdb_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&append_to_response=keywords"
    
    try:
        # Single TMDB API call with connection pooling
        with requests.Session() as session:
            response = session.get(tmdb_url, timeout=10)
            if response.status_code != 200:
                return None
            data = response.json()
            
            # Extract keywords and basic movie info
            tmdb_keywords = [k["name"] for k in data.get("keywords", {}).get("keywords", [])]
            release_year = data.get('release_date', '')[:4] if data.get('release_date') else None

    except requests.exceptions.RequestException as e:
        print(f"TMDB API Error: {str(e)}")
        return None

    # Wikidata query using TMDB ID directly
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT DISTINCT ?propertyLabel WHERE {{
      ?film wdt:P4947 "{movie_id}";
            (wdt:P136|wdt:P921) ?property.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    
    wikidata_props = []
    max_retries = 3
    for attempt in range(max_retries + 1):
        try:
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            wikidata_props = [result["propertyLabel"]["value"] 
                            for result in results["results"]["bindings"]]
            break
        except Exception as e:
            if '429' in str(e) and attempt < max_retries:
                time.sleep(1 + attempt)
                continue
            print(f"Wikidata Error: {str(e)}")
            break

    # Clean Wikidata keywords
    def clean_keyword(kw: str) -> str:
        # Remove standalone 'film' but preserve hyphenated terms
        return ' '.join(
            word for word in kw.split() 
            if word.lower() != 'film' and not (word == 'Film' and '-' not in kw)
        ).strip()

    wikidata_props = [clean_keyword(kw) for kw in wikidata_props]
    wikidata_props = [kw for kw in wikidata_props if kw]  # Remove empty strings

    # Merge and deduplicate (only use TMDB if Wikidata failed)
    return list(set(tmdb_keywords + wikidata_props))


In [29]:
keywords = get_combined_keywords(550)

In [30]:
print(keywords)

['nihilism', 'self destructiveness', 'dissociative identity disorder', 'based on novel or book', 'based on a novel', 'dual identity', 'drama', 'breaking the fourth wall', 'flashback', 'alter ego', 'insomnia', 'support group', 'split personality', 'thriller', 'dystopia', 'fight', 'rage and hate', 'psychological thriller', 'quitting a job']


In [14]:
# get_movie_credits(movie_id=603, session=None)

In [15]:
# movies = search_movies(actor_id=6384)
# print(movies)
# for movie in movies:
#     print(get_movie_data(movie_id=movie)["title"])
#     # print(get_movie_genre(movie, genre_id=True))
#     # print(get_movie_credits(movie))
#     # print(get_movie_keywords(movie))


In [16]:
# print(get_movie_keywords(603))

In [17]:
# genre_id_to_name([53, 28, 12, 878])

In [18]:
movies = find_similar_movies_parallel([603, 1726,59967])
# for movie in movies:
#     print(get_movie_data(movie_id=movie)["title"])

NameError: name 'find_similar_movies_parallel' is not defined

In [None]:
# len(movies)