In [1]:
import pandas as pd
import requests

from typing import Optional

In [2]:
# Load the MovieLens link table. All three ID columns are read as strings so
# pandas does not parse them as numbers (which would drop any leading zeros).
links_path = 'dataset/ml-latest-small/links.csv'
dtype = dict.fromkeys(("movieId", "imdbId", "tmdbId"), str)
links_df = pd.read_csv(links_path, dtype=dtype)

### Getting the Wikidata URI for every movie in the dataset

In [3]:
def get_uri(imdb_id: str) -> Optional[str]:
    """Look up the Wikidata entity for a single IMDb ID.

    Sends one SPARQL query to the Wikidata query endpoint asking for entities
    whose IMDb ID (P345) equals ``tt<imdb_id>`` and which also carry a genre
    (P136) statement.

    Args:
        imdb_id: The IMDb ID without its ``tt`` prefix.

    Returns:
        The ``?movie`` value of the first binding, or ``None`` when the query
        returns no bindings or ``imdb_id`` is missing (``None``, NaN or empty).

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer in time.
    """
    # Missing IDs cannot match anything, so skip the network round trip.
    # NaN (pandas' missing-value marker) is the only value not equal to itself.
    if imdb_id is None or imdb_id == "" or imdb_id != imdb_id:
        return None

    url = 'https://query.wikidata.org/sparql'
    query = f'''
    select distinct ?movie where {{
    ?movie wdt:P345 ?imdb_id;
            wdt:P136 ?genre.
    FILTER (?imdb_id = "tt{imdb_id}")
    }}
    '''
    # requests has no default timeout; without one a stalled connection would
    # block the kernel indefinitely.
    r = requests.get(url, params={'format': 'json', 'query': query}, timeout=60)
    # Fail with a clear HTTP error instead of a JSON decoding error when the
    # endpoint returns a non-JSON error page.
    r.raise_for_status()
    results = r.json()['results']['bindings']
    return results[0]['movie']['value'] if results else None

In [None]:
# One SPARQL request per row. Only the imdbId column is needed, so map over
# that column directly instead of applying a lambda to every full row.
links_df['wikidataURI'] = links_df['imdbId'].map(get_uri)

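The per-movie lookup above is very slow because it sends one SPARQL request per row. It is better to fetch every Wikidata film that has an IMDb ID in a single query and then keep only the ones that appear in the dataset.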

In [4]:
# Maps the IMDb ID (without its "tt" prefix) to the Wikidata entity ID.
imdb_movie_dict: dict[str, str] = {}

url = 'https://query.wikidata.org/sparql'
# Every entity that is an instance of film (Q11424) or of one of its
# subclasses, together with its IMDb ID (P345).
query = '''
SELECT DISTINCT ?imdb_id ?movie WHERE {
  ?movie (wdt:P31/(wdt:P279*)) wd:Q11424;
    wdt:P345 ?imdb_id.
}
'''
# requests has no default timeout; without one a stalled connection would
# block the kernel indefinitely. This is a large query, so allow it some time.
r = requests.get(url, params={'format': 'json', 'query': query}, timeout=120)
# Surface HTTP errors directly instead of failing later while decoding a
# non-JSON error page.
r.raise_for_status()
data = r.json()
results = data['results']['bindings']

for result in results:
    imdb_id = result['imdb_id']['value']

    if imdb_id.startswith('tt'):
        imdb_id = imdb_id.removeprefix('tt')
    elif imdb_id[:2].isalpha():
        # Any other two-letter prefix is not a "tt" title ID; stripping it
        # could make it collide with an unrelated title's number.
        continue

    # Keep only the entity ID. removeprefix leaves an unexpected URI intact
    # instead of blindly cutting off its first 31 characters.
    imdb_movie_dict[imdb_id] = result['movie']['value'].removeprefix('http://www.wikidata.org/entity/')

In [5]:
# Look each IMDb ID up in the prefetched mapping. dict.get returns None for
# IDs that are absent from it.
links_df['wikidataURI'] = links_df['imdbId'].map(imdb_movie_dict.get)

In [8]:
# Save first, then leave the preview as the cell's last expression so Jupyter
# renders it; a bare expression in the middle of a cell is evaluated and
# discarded.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# pass index=False if downstream readers do not need it.
links_df.to_csv('dataset/ml-latest-small/links_with_wikidata_uri.csv')
links_df.head(100)

Get different attributes of the movie