In [2]:
import asyncio
import aiohttp
import csv

BASE_URL = "https://query.wikidata.org/sparql"
headers = {
    "User-Agent": "MoviesAnalysis/1.0 (martinsaski@gmail.com)",
    "Accept": "application/json"
}

async def get_movies_by_year(session, year, limit):
    query = f"""
    SELECT ?movie ?movieLabel ?plotSummary ?releaseDate ?imdbID ?countryLabel ?duration ?sitelinks
           (GROUP_CONCAT(DISTINCT ?genreLabel; separator=", ") AS ?genres)
           (GROUP_CONCAT(DISTINCT ?directorLabel; separator=", ") AS ?directors)
    WHERE {{
      ?movie wdt:P31/wdt:P279* wd:Q11424;
             wdt:P577 ?releaseDate;
             wdt:P2437 ?plotSummary;
             wikibase:sitelinks ?sitelinks.
      FILTER(YEAR(?releaseDate) = {year})
      OPTIONAL {{ ?movie wdt:P136 ?genre. }}
      OPTIONAL {{ ?movie wdt:P57 ?director. }}
      OPTIONAL {{ ?movie wdt:P2047 ?duration. }}
      OPTIONAL {{ ?movie wdt:P345 ?imdbID. }}
      OPTIONAL {{ ?movie wdt:P495 ?country. }}
      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "en".
        ?genre rdfs:label ?genreLabel.
        ?director rdfs:label ?directorLabel.
      }}
    }}
    GROUP BY ?movie ?movieLabel ?plotSummary ?releaseDate ?imdbID ?countryLabel ?duration ?sitelinks
    ORDER BY DESC(?sitelinks)
    LIMIT {limit}
    """

    params = {'query': query, 'format': 'json'}

    try:
        async with session.get(BASE_URL, params=params, headers=headers) as response:
            if response.status == 200:
                data = await response.json()
                results = data['results']['bindings']
                movies = []
                seen_titles = set()

                for r in results:
                    title = r.get('movieLabel', {}).get('value')
                    if title in seen_titles:
                        continue
                    seen_titles.add(title)

                    movies.append({
                        'title': title,
                        'summary': r.get('plotSummary', {}).get('value'),
                        'release_date': r.get('releaseDate', {}).get('value'),
                        'genre': r.get('genres', {}).get('value'),
                        'director': r.get('directors', {}).get('value'),
                        'duration': r.get('duration', {}).get('value'),
                        'imdb_id': r.get('imdbID', {}).get('value'),
                        'country': r.get('countryLabel', {}).get('value'),
                        'sitelinks': r.get('sitelinks', {}).get('value'),
                        'year': year
                    })
                print(f"✓ {year}: {len(movies)} most popular movies")
                return movies
    except Exception as e:
        print(f"✗ {year}: Error - {e}")

    return []

async def fetch_all_years(movies_per_year, start_year, end_year):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for year in range(start_year, end_year + 1):
            tasks.append(get_movies_by_year(session, year, movies_per_year))

        results = await asyncio.gather(*tasks)
        all_movies = [movie for year_movies in results for movie in year_movies]
        return all_movies

async def main(movies_per_year, start_year=2000, end_year=2024):
    print(f"Fetching top {movies_per_year} most popular movies per year ({start_year}-{end_year})...\n")

    all_movies = await fetch_all_years(movies_per_year, start_year, end_year)

    filename = 'wikidata_movies.csv'
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'summary', 'release_date', 'genre', 'director',
                                                 'duration', 'imdb_id', 'country', 'sitelinks', 'year'])
        writer.writeheader()
        writer.writerows(all_movies)

    print(f"\nSaved {len(all_movies)} movies to {filename}")
    print(f"Movies ordered by popularity (sitelinks) per year")

    try:
        from google.colab import files
        files.download(filename)
    except:
        pass

if __name__ == "__main__":
    await main(movies_per_year=50, start_year=1950, end_year=2024)

Fetching top 50 most popular movies per year (1950-2024)...

✓ 1972: 1 most popular movies
✓ 1987: 1 most popular movies
✓ 2007: 1 most popular movies
✓ 2015: 1 most popular movies
✓ 1974: 1 most popular movies
✓ 2002: 1 most popular movies
✓ 2010: 1 most popular movies
✓ 2006: 1 most popular movies
✓ 2005: 1 most popular movies
✓ 1996: 1 most popular movies
✓ 1961: 0 most popular movies
✓ 2011: 1 most popular movies
✓ 1950: 0 most popular movies
✓ 1983: 1 most popular movies
✓ 2014: 1 most popular movies
✓ 1975: 1 most popular movies
✓ 1981: 1 most popular movies
✓ 1979: 1 most popular movies
✓ 1966: 0 most popular movies
✓ 1988: 1 most popular movies
✓ 1957: 1 most popular movies
✓ 2013: 1 most popular movies
✓ 1955: 0 most popular movies
✓ 1951: 0 most popular movies
✓ 1959: 1 most popular movies
✓ 1958: 1 most popular movies
✓ 1964: 1 most popular movies
✓ 1995: 1 most popular movies
✓ 2009: 1 most popular movies
✓ 1999: 1 most popular movies
✓ 1980: 1 most popular movies
✓ 1978: 0

In [3]:
import requests

TMDB_API_KEY = "5fc726095f165df90ed75e0a17e3e1c2"
BASE_URL = "https://api.themoviedb.org/3"

def get_top_movies(year, num_movies=5):
    """Get top N most popular movies from a year"""
    url = f"{BASE_URL}/discover/movie"
    params = {
        'api_key': TMDB_API_KEY,
        'primary_release_year': year,
        'sort_by': 'popularity.desc',
        'page': 1
    }

    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        return data.get('results', [])[:num_movies]
    return []

def get_movie_imdb_id(movie_id):
    """Get IMDb ID for a movie (link to Wikidata)"""
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {'api_key': TMDB_API_KEY}

    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json().get('imdb_id')
    return None

def main(year, num_movies=5):
    print(f"\nTop {num_movies} movies from {year}:\n")
    print("="*80)

    movies = get_top_movies(year, num_movies)

    for i, movie in enumerate(movies, 1):
        imdb_id = get_movie_imdb_id(movie['id'])

        print(f"\n[{i}] {movie['title']}")
        print(f"    Description: {movie.get('overview', 'N/A')}")
        print(f"    IMDb ID (Wikidata link): {imdb_id}")
        print("-"*80)

if __name__ == "__main__":
    YEAR = 2020
    NUM_MOVIES = 5

    main(YEAR, NUM_MOVIES)


Top 5 movies from 2020:


[1] Demon Slayer -Kimetsu no Yaiba- The Movie: Mugen Train
    Description: Tanjiro Kamado, joined with Inosuke Hashibira, a boy raised by boars who wears a boar's head, and Zenitsu Agatsuma, a scared boy who reveals his true power when he sleeps, boards the Infinity Train on a new mission with the Fire Hashira, Kyojuro Rengoku, to defeat a demon who has been tormenting the people and killing the demon slayers who oppose it!
    IMDb ID (Wikidata link): tt11032374
--------------------------------------------------------------------------------

[2] Josee, the Tiger and the Fish
    Description: With dreams of diving abroad, Tsuneo gets a job assisting Josee, an artist whose imagination takes her far beyond her wheelchair. But when the tide turns against them, they push each other to places they never thought possible, and inspire a love fit for a storybook.
    IMDb ID (Wikidata link): tt12879624
---------------------------------------------------------------

In [4]:
import requests
import csv
import re

TMDB_API_KEY = "5fc726095f165df90ed75e0a17e3e1c2"
TMDB_URL = "https://api.themoviedb.org/3"
WIKI_URL = "https://en.wikipedia.org/w/api.php"

def get_top_movies_by_year(year, num_movies=5):
    """Get top N most popular movies from TMDb for a specific year"""
    url = f"{TMDB_URL}/discover/movie"
    params = {
        'api_key': TMDB_API_KEY,
        'primary_release_year': year,
        'sort_by': 'popularity.desc',
        'page': 1
    }

    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        movies = data.get('results', [])[:num_movies]

        # Get IMDb ID for each
        detailed_movies = []
        for movie in movies:
            movie_id = movie['id']
            details = requests.get(f"{TMDB_URL}/movie/{movie_id}", params={'api_key': TMDB_API_KEY}).json()
            detailed_movies.append({
                'title': movie['title'],
                'imdb_id': details.get('imdb_id'),
                'year': year
            })
        return detailed_movies
    return []

def get_wikipedia_plot(title, year):
    """Extract plot section from Wikipedia"""
    # Search for the page
    search_params = {
        'action': 'query',
        'list': 'search',
        'srsearch': f'{title} {year} film',
        'format': 'json'
    }

    response = requests.get(WIKI_URL, params=search_params)
    if response.status_code != 200:
        return None

    search_results = response.json().get('query', {}).get('search', [])
    if not search_results:
        return None

    page_title = search_results[0]['title']

    # Get sections
    sections_params = {
        'action': 'parse',
        'page': page_title,
        'prop': 'sections',
        'format': 'json'
    }

    response = requests.get(WIKI_URL, params=sections_params)
    if response.status_code != 200:
        return None

    parse_data = response.json().get('parse', {})
    sections = parse_data.get('sections', [])

    # Find Plot section
    plot_section = None
    for section in sections:
        if 'plot' in section.get('line', '').lower():
            plot_section = section.get('index')
            break

    if not plot_section:
        return None

    # Get Plot section content
    content_params = {
        'action': 'parse',
        'page': page_title,
        'prop': 'text',
        'section': plot_section,
        'format': 'json'
    }

    response = requests.get(WIKI_URL, params=content_params)
    if response.status_code != 200:
        return None

    html_content = response.json().get('parse', {}).get('text', {}).get('*', '')

    # Strip HTML tags
    text = re.sub('<.*?>', '', html_content)
    text = text.replace('\n', ' ').strip()

    return text if text else None

def main(start_year, end_year, movies_per_year=5):
    print(f"Fetching top {movies_per_year} movies per year ({start_year}-{end_year})...\n")

    all_movies = []
    seen_titles = set()

    for year in range(start_year, end_year + 1):
        print(f"Processing year {year}...")
        movies = get_top_movies_by_year(year, movies_per_year)

        for movie in movies:
            title = movie['title']

            # Skip duplicates
            if title in seen_titles:
                continue

            print(f"  - Fetching plot for: {title}")
            plot = get_wikipedia_plot(title, movie['year'])

            if plot:
                seen_titles.add(title)
                all_movies.append({
                    'title': title,
                    'plot': plot,
                    'year': movie['year']
                })
                print(f"    ✓ Plot found ({len(plot)} chars)")
            else:
                print(f"    ✗ No plot found")

    # Save to CSV
    filename = 'movies_with_wikipedia_plots.csv'
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'plot', 'year'])
        writer.writeheader()
        writer.writerows(all_movies)

    print(f"\n✅ Saved {len(all_movies)} unique movies to {filename}")

    # Try download in Colab
    try:
        from google.colab import files
        files.download(filename)
    except:
        pass

if __name__ == "__main__":
    START_YEAR = 2020
    END_YEAR = 2022
    MOVIES_PER_YEAR = 5

    main(START_YEAR, END_YEAR, MOVIES_PER_YEAR)

Fetching top 5 movies per year (2020-2022)...

Processing year 2020...
  - Fetching plot for: Demon Slayer -Kimetsu no Yaiba- The Movie: Mugen Train
    ✗ No plot found
  - Fetching plot for: Josee, the Tiger and the Fish
    ✗ No plot found
  - Fetching plot for: Twittering Birds Never Fly: The Clouds Gather
    ✗ No plot found
  - Fetching plot for: Breach
    ✗ No plot found
  - Fetching plot for: Unhinged
    ✗ No plot found
Processing year 2021...
  - Fetching plot for: Jujutsu Kaisen 0
    ✗ No plot found
  - Fetching plot for: Spider-Man: No Way Home
    ✗ No plot found
  - Fetching plot for: No Time to Die
    ✗ No plot found
  - Fetching plot for: Wrath of Man
    ✗ No plot found
  - Fetching plot for: Dune
    ✗ No plot found
Processing year 2022...
  - Fetching plot for: The Black Phone
    ✗ No plot found
  - Fetching plot for: Hotel Dunsmuir
    ✗ No plot found
  - Fetching plot for: Avatar: The Way of Water
    ✗ No plot found
  - Fetching plot for: 365 Days: This Day
   