In [1]:
# pandas for the CSV work; setup_django is a project-local bootstrap helper.
import pandas as pd
import setup_django

# NOTE(review): presumably configures Django (settings + app registry) so the
# model imports in later cells work outside manage.py — confirm in setup_django.
setup_django.main()

In [2]:
from ratings.models import Rating

# Gather the object_id of every rating whose content_object resolves to None.
# NOTE(review): content_object is read once per row, which likely means one
# extra query per rating — consider prefetching if this gets slow; verify
# against the Rating model definition.
qs = Rating.objects.all()
missing_movies_ids = [
    rating.object_id for rating in qs if rating.content_object is None
]

# `total` counts ratings with no target; `missing` is the distinct set of ids
# (order is arbitrary because it goes through a set).
total = len(missing_movies_ids)
missing = list(set(missing_movies_ids))
len(missing), total

(6392, 57175)

In [3]:
from django.conf import settings

# Location of the id-mapping CSV (movieId / imdbId / tmdbId) under BASE_DIR/data.
links_path = settings.BASE_DIR.joinpath("data", "links_small.csv")
# Fail fast here rather than with a less obvious error inside read_csv.
assert links_path.exists()

In [4]:
# Load the id-mapping table; per the preview it has movieId, imdbId and tmdbId
# columns (tmdbId is parsed as float).
links_df = pd.read_csv(links_path)
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
# Keep only the link rows whose movieId belongs to a movie referenced by an
# orphaned rating.
# Filter first, then copy: copying before filtering duplicates the whole frame
# for nothing, and the filtered result would still be a derived object, so the
# later `missing_df['tt'] = ...` assignment could raise SettingWithCopyWarning.
missing_df = links_df.loc[links_df['movieId'].isin(missing)].copy()
missing_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
3,4,114885,31357.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0


In [7]:
# Every distinct missing id must map to exactly one row of the links table.
assert len(missing_df.index) == len(missing)

In [8]:
def enrich_imdb_column(value: int) -> str:
    """Convert a numeric IMDb id into its ``tt``-prefixed string form.

    The digits are left-padded with zeros to a minimum width of seven, so
    ``114709`` becomes ``"tt0114709"``. Ids that already have seven or more
    digits are only prefixed.

    The previous implementation handled only 5-, 6- and 7-digit ids; anything
    shorter or longer was returned without the ``tt`` prefix and therefore
    could never match the ``imdb_id`` column it is merged against.

    Args:
        value: The numeric IMDb id (anything whose ``str()`` is the digits).

    Returns:
        The id as ``"tt"`` followed by at least seven digits.
    """
    return f"tt{str(value).zfill(7)}"

# Derive the string key ('tt…') that is later matched against imdb_id.
missing_df['tt'] = missing_df['imdbId'].map(enrich_imdb_column)

missing_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,tt
0,1,114709,862.0,tt0114709
3,4,114885,31357.0,tt0114885
6,7,114319,11860.0,tt0114319
7,8,112302,45325.0,tt0112302
8,9,114576,9091.0,tt0114576


In [9]:
# Location of the movie metadata CSV under BASE_DIR/data.
movies_path = settings.BASE_DIR.joinpath("data", "movies_metadata.csv")
# Fail fast here rather than with a less obvious error inside read_csv.
assert movies_path.exists()

In [10]:
# Read only a subset of the metadata columns; imdb_id is the join key used in
# the merge below.
movies_df = pd.read_csv(
    movies_path,
    usecols=['title', 'overview', 'release_date', 'imdb_id'],
)
movies_df.head(10)

Unnamed: 0,imdb_id,overview,release_date,title
0,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story
1,tt0113497,When siblings Judy and Peter discover an encha...,1995-12-15,Jumanji
2,tt0113228,A family wedding reignites the ancient feud be...,1995-12-22,Grumpier Old Men
3,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale
4,tt0113041,Just when George Banks has recovered from his ...,1995-02-10,Father of the Bride Part II
5,tt0113277,"Obsessive master thief, Neil McCauley leads a ...",1995-12-15,Heat
6,tt0114319,An ugly duckling having undergone a remarkable...,1995-12-15,Sabrina
7,tt0112302,"A mischievous young boy, Tom Sawyer, witnesses...",1995-12-22,Tom and Huck
8,tt0114576,International action superstar Jean Claude Van...,1995-12-22,Sudden Death
9,tt0113189,James Bond must unmask the mysterious head of ...,1995-11-16,GoldenEye


In [11]:
# Attach metadata to each missing movie by matching the derived 'tt' key to
# imdb_id. The join is an inner join (pandas' default), so link rows without a
# metadata match are dropped.
# NOTE(review): duplicate imdb_id rows in the metadata would duplicate rows
# here — verify whether the metadata file contains duplicates.
missing_movies_df = pd.merge(
    missing_df,
    movies_df,
    how='inner',
    left_on='tt',
    right_on='imdb_id',
)
missing_movies_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,tt,imdb_id,overview,release_date,title
0,1,114709,862.0,tt0114709,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story
1,4,114885,31357.0,tt0114885,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale
2,7,114319,11860.0,tt0114319,tt0114319,An ugly duckling having undergone a remarkable...,1995-12-15,Sabrina
3,8,112302,45325.0,tt0112302,tt0112302,"A mischievous young boy, Tom Sawyer, witnesses...",1995-12-22,Tom and Huck
4,9,114576,9091.0,tt0114576,tt0114576,International action superstar Jean Claude Van...,1995-12-22,Sudden Death


In [12]:
# 'id' is the id the movie should have (movieId); 'id_alt' is the tmdbId
# converted from float to integer so it can be compared with integer ids.
# Like the per-element int() call, the cast raises if any tmdbId is NaN.
missing_movies_df['id'] = missing_movies_df['movieId']
missing_movies_df['id_alt'] = missing_movies_df['tmdbId'].astype('int64')
missing_movies_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,tt,imdb_id,overview,release_date,title,id,id_alt
0,1,114709,862.0,tt0114709,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story,1,862
1,4,114885,31357.0,tt0114885,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale,4,31357
2,7,114319,11860.0,tt0114319,tt0114319,An ugly duckling having undergone a remarkable...,1995-12-15,Sabrina,7,11860
3,8,112302,45325.0,tt0112302,tt0112302,"A mischievous young boy, Tom Sawyer, witnesses...",1995-12-22,Tom and Huck,8,45325
4,9,114576,9091.0,tt0114576,tt0114576,International action superstar Jean Claude Van...,1995-12-22,Sudden Death,9,9091


In [28]:
# Keep just the columns needed to re-key movies: the target id, the id the
# movie is currently looked up by, and the title used as a sanity check.
# Project first, then copy, so only these three columns are duplicated instead
# of the whole frame (including the long 'overview' text).
final_df = missing_movies_df[['id', 'id_alt', 'title']].copy()
final_df.head()

Unnamed: 0,id,id_alt,title
0,1,862,Toy Story
1,4,31357,Waiting to Exhale
2,7,11860,Sabrina
3,8,45325,Tom and Huck
4,9,9091,Sudden Death


In [17]:
# Plain Python list of the id_alt values, used below as the id__in filter.
altered_movies_ids = final_df['id_alt'].tolist()
len(altered_movies_ids)

6349

In [20]:
from movies.models import Movie

# Movies whose current id equals one of the id_alt values. Note this rebinds
# `qs`, which earlier held the Rating queryset.
qs = Movie.objects.filter(id__in=altered_movies_ids)
qs.count()

6035

In [30]:
from django.db import transaction
from django.forms.models import model_to_dict

# Build the id_alt -> (correct id, title) lookup once, instead of copying and
# scanning the whole frame for every movie. keep=False discards every id_alt
# that occurs more than once, which mirrors the original "exactly one matching
# row" rule.
unambiguous_rows = final_df.drop_duplicates(subset='id_alt', keep=False)
corrections = {
    int(row.id_alt): (int(row.id), row.title)
    for row in unambiguous_rows.itertuples(index=False)
}

# Re-key each matching movie: delete the row stored under the alternate id and
# recreate it with the same field values under the correct id.
# NOTE(review): model_to_dict only returns editable fields, so non-editable
# values (e.g. auto timestamps) are not carried over — confirm that is
# acceptable for Movie. delete() may also cascade to related rows.
updated_movies = []
for instance in qs:
    correction = corrections.get(instance.id)
    if correction is None:
        # No unambiguous row for this movie; leave it untouched.
        continue
    correct_id, correct_title = correction
    if instance.title != correct_title:
        # The title mismatch suggests the id collision is a different movie.
        continue
    new_data = {**model_to_dict(instance), 'id': correct_id}
    # Delete and recreate atomically, so that a failing create (e.g. the target
    # id is already taken) rolls back the delete instead of losing the movie.
    with transaction.atomic():
        instance.delete()
        updated_movies.append(Movie.objects.create(**new_data))

len(updated_movies)

6030

In [32]:
from movies.tasks import update_movie_ratings_outdated

# apply() returned an EagerResult when this cell was run, i.e. the task was
# executed synchronously in this process rather than being queued.
# NOTE(review): presumably recomputes rating aggregates for the re-keyed
# movies — confirm in movies.tasks.
update_movie_ratings_outdated.apply()

<EagerResult: a555c055-6bbf-4169-ad79-0d19dfb1eb0a>