# Count number of movies

In [None]:
import requests

# Wikidata SPARQL endpoint and the headers sent with the request
# (a User-Agent carrying contact details, and an Accept header asking for JSON).
BASE_URL = "https://query.wikidata.org/sparql"
headers = {
    "User-Agent": "MoviesAnalysis/1.0 (martinsaski@gmail.com)",
    "Accept": "application/json"
}

# Counts distinct ?movie items typed as wd:Q11424, directly or through any
# chain of wdt:P279 links, that also have a wdt:P577 value (?releaseDate).
query = """
SELECT (COUNT(DISTINCT ?movie) AS ?totalMovies)
WHERE {
  ?movie wdt:P31/wdt:P279* wd:Q11424;
         wdt:P577 ?releaseDate.
}
"""

params = {'query': query, 'format': 'json'}
# Without a timeout requests waits forever on a stalled connection; with one,
# a stalled endpoint raises requests.exceptions.Timeout instead of hanging the cell.
response = requests.get(BASE_URL, params=params, headers=headers, timeout=60)

if response.status_code == 200:
    data = response.json()
    # An aggregate query yields a single binding holding the count as a string.
    total = data['results']['bindings'][0]['totalMovies']['value']
    print(f"Total de películas únicas en Wikidata: {total}")
else:
    print(f"Error: {response.status_code}")

Total de pelÃ­culas Ãºnicas en Wikidata: 361060


# Search movie descriptions (only a few lines)


In [None]:
import requests

import os

# The TMDB key is read from the environment so the secret never lives in the
# notebook. Export TMDB_API_KEY before starting the kernel; any key that was
# ever committed to a notebook should be revoked and replaced.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "")
if not TMDB_API_KEY:
    print("Warning: TMDB_API_KEY is not set; TMDB requests will fail authentication.")

# Root of the TMDB v3 REST API; endpoint paths are appended to it.
BASE_URL = "https://api.themoviedb.org/3"

def get_top_movies(year, num_movies=5):
    """Return up to `num_movies` of the most popular movies released in `year`.

    Queries `{BASE_URL}/discover/movie` sorted by descending popularity and
    walks the result pages until enough movies have been collected.

    Args:
        year: value sent as `primary_release_year`.
        num_movies: maximum number of movies to return.

    Returns:
        A list of the movie dicts found under the response's `results` key,
        most popular first. The list is shorter than `num_movies` when the API
        runs out of pages or a page request does not return HTTP 200 (movies
        collected before the failure are kept; a failure on the first page
        yields an empty list).
    """
    url = f"{BASE_URL}/discover/movie"
    movies = []
    page = 1

    # The endpoint is paginated (presumably 20 results per page - confirm against
    # the TMDB docs), so a single request cannot satisfy a large `num_movies`.
    while len(movies) < num_movies:
        params = {
            'api_key': TMDB_API_KEY,
            'primary_release_year': year,
            'sort_by': 'popularity.desc',
            'page': page
        }

        # Timeout so a stalled connection raises instead of blocking forever.
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            break

        data = response.json()
        results = data.get('results', [])
        if not results:
            break
        movies.extend(results)

        # If the response does not report `total_pages`, stop after this page.
        if page >= data.get('total_pages', page):
            break
        page += 1

    return movies[:num_movies]

def get_movie_imdb_id(movie_id):
    """Look up the IMDb id recorded for a TMDB movie.

    The notebook uses the IMDb id as the key that links a TMDB movie to Wikidata.

    Args:
        movie_id: TMDB movie id, interpolated into `{BASE_URL}/movie/{movie_id}`.

    Returns:
        The `imdb_id` field of the JSON response (None when the field is
        absent), or None when the request does not return HTTP 200.
    """
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': TMDB_API_KEY}

    # Timeout so a stalled connection raises instead of blocking the whole loop.
    response = requests.get(url, params=params, timeout=30)
    if response.status_code == 200:
        return response.json().get('imdb_id')
    return None

def main(year, num_movies=5):
    """Print a ranked listing of the most popular movies released in `year`.

    For every movie returned by `get_top_movies` the listing shows its rank,
    title, overview ('N/A' when the key is missing) and the IMDb id obtained
    from `get_movie_imdb_id`. Nothing is returned.
    """
    header_rule = "=" * 80
    entry_rule = "-" * 80

    print(f"\nTop {num_movies} movies from {year}:\n")
    print(header_rule)

    for rank, movie in enumerate(get_top_movies(year, num_movies), start=1):
        # One extra API call per movie to resolve its IMDb id.
        imdb_id = get_movie_imdb_id(movie['id'])

        entry_lines = (
            f"\n[{rank}] {movie['title']}",
            f"    Description: {movie.get('overview', 'N/A')}",
            f"    IMDb ID (Wikidata link): {imdb_id}",
            entry_rule,
        )
        print(*entry_lines, sep="\n")

if __name__ == "__main__":
    # Settings for this run: the release year to list and how many titles to show.
    YEAR, NUM_MOVIES = 2020, 100

    main(YEAR, NUM_MOVIES)


Top 5 movies from 2020:


[1] Demon Slayer -Kimetsu no Yaiba- The Movie: Mugen Train
    Description: Tanjiro Kamado, joined with Inosuke Hashibira, a boy raised by boars who wears a boar's head, and Zenitsu Agatsuma, a scared boy who reveals his true power when he sleeps, boards the Infinity Train on a new mission with the Fire Hashira, Kyojuro Rengoku, to defeat a demon who has been tormenting the people and killing the demon slayers who oppose it!
    IMDb ID (Wikidata link): tt11032374
--------------------------------------------------------------------------------

[2] Twittering Birds Never Fly: The Clouds Gather
    Description: In the hyper-masculine criminal underworld, a masochistic high-ranking yakuza and his newly-assigned bodyguard become increasingly drawn to each other.
    IMDb ID (Wikidata link): tt10675392
--------------------------------------------------------------------------------

[3] Fate/stay night: Heaven's Feel III. Spring Song
    Description: The battle r

# Search for movies and download csv

It makes an API call per year.

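You can set a maximum number of movies per year, but the limit is applied to the raw query rows before they are grouped by title. Because the query returns one row per genre of each movie (and per value of any other multi-valued field), each year yields only about a third of the requested number of movies.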

The API calls are made asynchronously in batches to avoid rate-limit errors.



In [5]:
import asyncio
import aiohttp
import csv
import json
import re

# Endpoint that receives every SPARQL query issued by this cell.
BASE_URL = "https://query.wikidata.org/sparql"

# Sent with each request: a User-Agent carrying contact details and an
# Accept header asking for a JSON response.
headers = dict([
    ("User-Agent", "MoviesAnalysis/1.0 (martinsaski@gmail.com)"),
    ("Accept", "application/json"),
])

def _build_movies_query(year, limit):
    """Return the SPARQL text that selects movies released in `year`, capped at `limit` rows."""
    return f"""
    SELECT DISTINCT ?movie ?movieLabel ?releaseDate ?imdbID ?countryLabel ?duration ?genreLabel
                    ?directorLabel ?directorGenderLabel ?directorNationalityLabel
                    ?actorLabel ?actorGenderLabel ?actorNationalityLabel
                    ?awardLabel ?budget ?boxOffice ?setPeriodLabel
                    ?article
    WHERE {{
      ?movie wdt:P31/wdt:P279* wd:Q11424;
             wdt:P577 ?releaseDate.
      FILTER(YEAR(?releaseDate) = {year})

      # Wikipedia article
      OPTIONAL {{
        ?article schema:about ?movie;
                 schema:isPartOf <https://en.wikipedia.org/>.
      }}

      # Genre
      OPTIONAL {{ ?movie wdt:P136 ?genre. }}

      # Director con género y nacionalidad
      OPTIONAL {{
        ?movie wdt:P57 ?director.
        OPTIONAL {{ ?director wdt:P21 ?directorGender. }}
        OPTIONAL {{ ?director wdt:P27 ?directorNationality. }}
      }}

      # Actor/Actriz con género y nacionalidad
      OPTIONAL {{
        ?movie wdt:P161 ?actor.
        OPTIONAL {{ ?actor wdt:P21 ?actorGender. }}
        OPTIONAL {{ ?actor wdt:P27 ?actorNationality. }}
      }}

      # Nuevos campos
      OPTIONAL {{ ?movie wdt:P166 ?award. }}        # Award received
      OPTIONAL {{ ?movie wdt:P2130 ?budget. }}      # Budget
      OPTIONAL {{ ?movie wdt:P2142 ?boxOffice. }}   # Box office
      OPTIONAL {{ ?movie wdt:P2408 ?setPeriod. }}   # Set in period

      # Otros campos opcionales
      OPTIONAL {{ ?movie wdt:P2047 ?duration. }}
      OPTIONAL {{ ?movie wdt:P345 ?imdbID. }}
      OPTIONAL {{ ?movie wdt:P495 ?country. }}

      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en".
        ?movie rdfs:label ?movieLabel.
        ?genre rdfs:label ?genreLabel.
        ?director rdfs:label ?directorLabel.
        ?directorGender rdfs:label ?directorGenderLabel.
        ?directorNationality rdfs:label ?directorNationalityLabel.
        ?actor rdfs:label ?actorLabel.
        ?actorGender rdfs:label ?actorGenderLabel.
        ?actorNationality rdfs:label ?actorNationalityLabel.
        ?award rdfs:label ?awardLabel.
        ?setPeriod rdfs:label ?setPeriodLabel.
        ?country rdfs:label ?countryLabel.
      }}
    }}
    LIMIT {limit}
    """


def _binding_value(row, key, default=None):
    """Return the 'value' of variable `key` in one result row, or `default` when it is unbound."""
    return row.get(key, {}).get('value', default)


def _group_rows_by_movie(rows, year):
    """Fold flat result rows into one record per movie URI, keyed by that URI."""
    records = {}

    for row in rows:
        movie_uri = _binding_value(row, 'movie')

        if movie_uri not in records:
            # Single-valued fields are taken from the first row seen for the movie.
            records[movie_uri] = {
                'title': _binding_value(row, 'movieLabel'),
                'release_date': _binding_value(row, 'releaseDate'),
                'duration': _binding_value(row, 'duration'),
                'imdb_id': _binding_value(row, 'imdbID'),
                'country': _binding_value(row, 'countryLabel'),
                'year': year,
                'wikipedia_url': _binding_value(row, 'article', ''),
                'budget': _binding_value(row, 'budget'),
                'box_office': _binding_value(row, 'boxOffice'),
                'genres': set(),
                'directors': {},
                'actors': {},
                'awards': set(),
                'set_periods': set()
            }
        record = records[movie_uri]

        # Multi-valued label fields: collect every distinct label.
        for field, variable in (('genres', 'genreLabel'),
                                ('awards', 'awardLabel'),
                                ('set_periods', 'setPeriodLabel')):
            label = _binding_value(row, variable)
            if label:
                record[field].add(label)

        # People are keyed by label; when several rows carry the same name the
        # gender/nationality of the last such row is the one that is kept.
        for field, prefix in (('directors', 'director'), ('actors', 'actor')):
            name = _binding_value(row, f'{prefix}Label')
            if name:
                record[field][name] = {
                    'gender': _binding_value(row, f'{prefix}GenderLabel', ''),
                    'nationality': _binding_value(row, f'{prefix}NationalityLabel', '')
                }

        # Budget and box office: keep the first non-empty value seen.
        for field, variable in (('budget', 'budget'), ('box_office', 'boxOffice')):
            amount = _binding_value(row, variable)
            if amount and not record[field]:
                record[field] = amount

    return records


def _format_people(people):
    """Render a {name: {'gender', 'nationality'}} mapping as name-sorted display strings."""
    return [
        f"{name} ({info['gender']}, {info['nationality']})"
        if info['gender'] or info['nationality']
        else name
        for name, info in sorted(people.items())
    ]


def _to_csv_row(record):
    """Flatten one grouped movie record into the dict of scalar columns written to the CSV."""
    return {
        'title': record['title'],
        'release_date': record['release_date'],
        'genre': ', '.join(sorted(record['genres'])),
        'director': ', '.join(_format_people(record['directors'])),
        'actors': ', '.join(_format_people(record['actors'])),
        'duration': record['duration'],
        'imdb_id': record['imdb_id'],
        'country': record['country'],
        'budget': record['budget'],
        'box_office': record['box_office'],
        'awards': ', '.join(sorted(record['awards'])),
        'set_in_period': ', '.join(sorted(record['set_periods'])),
        'year': record['year'],
        'wikipedia_url': record['wikipedia_url']
    }


async def get_movies_by_year(session, year, limit):
    """Fetch the movies released in `year` from Wikidata, one dict per movie.

    Args:
        session: object whose `get(...)` returns an async context manager that
            yields the HTTP response (an `aiohttp.ClientSession` in this notebook).
        year: release year interpolated into the query's FILTER.
        limit: value of the SPARQL LIMIT. It caps result *rows*, not movies:
            rows are grouped by movie URI afterwards, so the number of movies
            returned can be smaller than `limit`.

    Returns:
        A list of dicts with the keys title, release_date, genre, director,
        actors, duration, imdb_id, country, budget, box_office, awards,
        set_in_period, year and wikipedia_url. Multi-valued fields are joined
        with ', '. Returns [] after printing a one-line diagnostic when the
        response is not HTTP 200, the request times out, the body is not valid
        JSON, or any other exception is raised.
    """
    params = {'query': _build_movies_query(year, limit), 'format': 'json'}

    try:
        timeout = aiohttp.ClientTimeout(total=60)  # 60 seconds per request
        async with session.get(BASE_URL, params=params, headers=headers, timeout=timeout) as response:
            if response.status != 200:
                print(f"✗ {year}: HTTP {response.status}")
                return []
            # Read the body as text so it can be cleaned before parsing.
            text = await response.text()

        # Drop control characters that would make json.loads reject the body
        # (tab, newline and carriage return are kept).
        text = re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', text)

        try:
            data = json.loads(text)
        except json.JSONDecodeError as e:
            print(f"✗ {year}: JSON decode error - {e}")
            return []

        records = _group_rows_by_movie(data['results']['bindings'], year)
        movies = [_to_csv_row(record) for record in records.values()]

        print(f"✓ {year}: {len(movies)} movies")
        return movies

    except asyncio.TimeoutError:
        print(f"✗ {year}: Timeout")
    except Exception as e:
        # Best effort: one failing year must not abort the whole batch.
        print(f"✗ {year}: Error - {e}")

    return []

async def fetch_all_years(movies_per_year, start_year, end_year, batch_size=50, pause_seconds=2):
    """Fetch movies for every year in [start_year, end_year], processing the years in batches.

    The years of one batch are requested concurrently through a shared
    `aiohttp.ClientSession`; batches run one after another with a short pause
    in between so the endpoint is not overloaded.

    Args:
        movies_per_year: passed to `get_movies_by_year` as its `limit`.
        start_year: first year to fetch (inclusive).
        end_year: last year to fetch (inclusive).
        batch_size: number of years requested concurrently.
        pause_seconds: seconds to sleep between two consecutive batches
            (no pause follows the last batch).

    Returns:
        A single list with the movies of all years, in year order.
    """
    all_movies = []
    years = list(range(start_year, end_year + 1))
    total_years = len(years)

    async with aiohttp.ClientSession() as session:
        for offset in range(0, total_years, batch_size):
            batch_years = years[offset:offset + batch_size]
            print(f"\n📦 Processing batch: {batch_years[0]}-{batch_years[-1]}")

            # gather() returns results in the order the coroutines were passed,
            # so the movies stay in year order even though the log lines do not.
            results = await asyncio.gather(
                *(get_movies_by_year(session, year, movies_per_year) for year in batch_years)
            )

            batch_movies = [movie for year_movies in results for movie in year_movies]
            all_movies.extend(batch_movies)

            print(f"   Batch total: {len(batch_movies)} movies")

            # Short pause between batches; skipped after the final one.
            if offset + batch_size < total_years:
                await asyncio.sleep(pause_seconds)

    return all_movies

async def main(movies_per_year, start_year, end_year, filename='wikidata_movies.csv'):
    """Fetch movies for a range of years and save them to a CSV file.

    Args:
        movies_per_year: per-year limit forwarded to `fetch_all_years`.
        start_year: first year to fetch (inclusive).
        end_year: last year to fetch (inclusive).
        filename: path of the CSV file to write (UTF-8, overwritten if present).

    When `google.colab` can be imported, the file is also offered as a browser
    download; otherwise that step is skipped.
    """
    print(f"Fetching {movies_per_year} movies per year ({start_year}-{end_year})...\n")
    print(f"Total years to process: {end_year - start_year + 1}\n")

    all_movies = await fetch_all_years(movies_per_year, start_year, end_year, batch_size=10)

    # Column order of the CSV; the keys match the dicts built by get_movies_by_year.
    fieldnames = ['title', 'release_date', 'genre', 'director',
                  'actors', 'duration', 'imdb_id', 'country',
                  'budget', 'box_office', 'awards', 'set_in_period',
                  'year', 'wikipedia_url']

    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_movies)

    print(f"\n✅ Saved {len(all_movies)} movies to {filename}")

    try:
        from google.colab import files
    except ImportError:
        # google.colab is not importable here, so there is nothing to download through.
        return

    try:
        files.download(filename)
    except Exception as e:
        # The download is a convenience only; report the problem instead of hiding it.
        print(f"Download skipped: {e}")

if __name__ == "__main__":
    await main(movies_per_year=10, start_year=1900, end_year=2026)

Fetching 10 movies per year (1900-2026)...

Total years to process: 127


ðŸ“¦ Processing batch: 1900-1909
âœ“ 1905: 9 movies
âœ“ 1901: 9 movies
âœ“ 1900: 9 movies
âœ“ 1908: 10 movies
âœ“ 1903: 8 movies
âœ“ 1904: 9 movies
âœ“ 1902: 10 movies
âœ“ 1907: 10 movies
âœ“ 1906: 9 movies
âœ“ 1909: 9 movies
   Batch total: 92 movies

ðŸ“¦ Processing batch: 1910-1919
âœ“ 1915: 1 movies
âœ“ 1913: 1 movies
âœ“ 1918: 4 movies
âœ“ 1912: 4 movies
âœ“ 1917: 4 movies
âœ“ 1916: 5 movies
âœ“ 1910: 8 movies
âœ“ 1919: 2 movies
âœ“ 1914: 2 movies
âœ“ 1911: 2 movies
   Batch total: 33 movies

ðŸ“¦ Processing batch: 1920-1929
âœ“ 1927: 1 movies
âœ“ 1923: 2 movies
âœ“ 1929: 10 movies
âœ“ 1922: 1 movies
âœ“ 1921: 5 movies
âœ“ 1924: 1 movies
âœ“ 1926: 3 movies
âœ“ 1920: 2 movies
âœ“ 1928: 9 movies
âœ“ 1925: 1 movies
   Batch total: 35 movies

ðŸ“¦ Processing batch: 1930-1939
âœ“ 1931: 10 movies
âœ“ 1930: 5 movies
âœ“ 1939: 5 movies
âœ“ 1933: 9 movies
âœ“ 1935: 9 movies
âœ“ 1936: 9 movies
âœ“ 1938: 1 movies
âœ“ 1

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>