In [1]:
# Project-local bootstrap; it runs before the Django model imports in the next cell.
# NOTE(review): presumably this configures Django settings for notebook use — confirm in setup_django.
import setup_django
setup_django.init()

In [2]:
import pandas as pd
from django.conf import settings
from django.db import transaction
from movies.models import Movie
from ratings.models import Rating


# Location of the links CSV inside the configured data directory.
LINKS_SMALL_CSV = settings.DATA_DIR.joinpath('links_small.csv')
# Displayed as the cell result so a missing file is noticed before loading.
LINKS_SMALL_CSV.exists()

True

In [3]:
# Collect the object_id of every rating whose generic target no longer resolves.
# NOTE(review): `content_object` is read once per rating; this may cost one
# lookup per row — confirm against the Rating model.
qs = Rating.objects.all()
missing_movie_ids = [
    rating.object_id
    for rating in qs
    if rating.content_object is None
]

# _total counts affected ratings; total_missing holds the distinct ids behind them.
_total = len(missing_movie_ids)
total_missing = list(set(missing_movie_ids))

In [4]:
# Distinct missing ids, ratings pointing at them, and total ratings.
unique_missing = len(total_missing)
print(unique_missing, _total, qs.count())

6392 57175 100008


In [5]:
# Load the id-mapping file (columns: movieId, imdbId, tmdbId) and preview it.
links_df = pd.read_csv(filepath_or_buffer=LINKS_SMALL_CSV)
links_df.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
# Keep only the link rows whose movieId is one of the ids that ratings point at
# but no movie resolves for; copy after filtering so ms_df is an independent frame.
is_missing = links_df['movieId'].isin(total_missing)
ms_df = links_df[is_missing].copy()
ms_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
3,4,114885,31357.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0


In [7]:
# True when every missing id has exactly as many link rows as there are ids.
len(ms_df) == len(total_missing)

True

In [8]:
def enrich_imdb_col(val):
    """Convert a numeric IMDb id into its ``tt``-prefixed string form.

    The numeric part is left-padded with zeros to at least seven digits and
    prefixed with ``tt`` (e.g. ``114709`` -> ``"tt0114709"``), matching the
    ``imdb_id`` values this notebook merges against.

    Args:
        val: The raw id; it is converted with ``str()`` first.

    Returns:
        The ``tt``-prefixed id when ``val`` consists only of digits. Any other
        value (already prefixed, ``nan``, float-formatted, ...) is returned as
        its unchanged string form.
    """
    val = str(val)
    # Leave anything that is not a bare digit string alone instead of guessing.
    if not val.isdigit():
        return val
    # zfill pads short ids of any length and never truncates longer ones, so
    # ids with fewer than 5 or more than 7 digits are handled as well.
    return f"tt{val.zfill(7)}"

In [9]:
# Add the 'tt' column: the imdbId of each row run through enrich_imdb_col.
imdb_ids = ms_df['imdbId']
ms_df['tt'] = imdb_ids.map(enrich_imdb_col)
ms_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,tt
0,1,114709,862.0,tt0114709
3,4,114885,31357.0,tt0114885
6,7,114319,11860.0,tt0114319
7,8,112302,45325.0,tt0112302
8,9,114576,9091.0,tt0114576


In [10]:
# Location of the movie metadata CSV inside the configured data directory.
MOVIES_CSV = settings.DATA_DIR.joinpath('movies_metadata.csv')
# Displayed as the cell result so a missing file is noticed before loading.
MOVIES_CSV.exists()

True

In [11]:
# Only these metadata columns are read from the file.
movies_cols = [
    'title',
    'overview',
    'release_date',
    'imdb_id',
]
movies_df = pd.read_csv(filepath_or_buffer=MOVIES_CSV, usecols=movies_cols)
movies_df.head(n=5)

Unnamed: 0,imdb_id,overview,release_date,title
0,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story
1,tt0113497,When siblings Judy and Peter discover an encha...,1995-12-15,Jumanji
2,tt0113228,A family wedding reignites the ancient feud be...,1995-12-22,Grumpier Old Men
3,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale
4,tt0113041,Just when George Banks has recovered from his ...,1995-02-10,Father of the Bride Part II


In [12]:
# Inner join (the merge default): link rows with no metadata row sharing the
# same tt-prefixed IMDb id are dropped from the result.
missing_movies_df = pd.merge(
    ms_df,
    movies_df,
    how='inner',
    left_on='tt',
    right_on='imdb_id',
)
missing_movies_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,tt,imdb_id,overview,release_date,title
0,1,114709,862.0,tt0114709,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story
1,4,114885,31357.0,tt0114885,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale
2,7,114319,11860.0,tt0114319,tt0114319,An ugly duckling having undergone a remarkable...,1995-12-15,Sabrina
3,8,112302,45325.0,tt0112302,tt0112302,"A mischievous young boy, Tom Sawyer, witnesses...",1995-12-22,Tom and Huck
4,9,114576,9091.0,tt0114576,tt0114576,International action superstar Jean Claude Van...,1995-12-22,Sudden Death


In [13]:
# tmdbId is a float column, so it can hold NaN, and converting NaN to int
# raises ValueError. Rows without a tmdbId cannot produce an id_alt key, so
# drop them before casting.
missing_movies_df = missing_movies_df.dropna(subset=['tmdbId']).copy()

# 'id' is the movieId from the links file; 'id_alt' is the tmdbId as a digit string.
missing_movies_df['id'] = missing_movies_df['movieId']
missing_movies_df['id_alt'] = missing_movies_df['tmdbId'].astype('int64').astype(str)
missing_movies_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,tt,imdb_id,overview,release_date,title,id,id_alt
0,1,114709,862.0,tt0114709,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story,1,862
1,4,114885,31357.0,tt0114885,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale,4,31357
2,7,114319,11860.0,tt0114319,tt0114319,An ugly duckling having undergone a remarkable...,1995-12-15,Sabrina,7,11860
3,8,112302,45325.0,tt0112302,tt0112302,"A mischievous young boy, Tom Sawyer, witnesses...",1995-12-22,Tom and Huck,8,45325
4,9,114576,9091.0,tt0114576,tt0114576,International action superstar Jean Claude Van...,1995-12-22,Sudden Death,9,9091


In [14]:
# Narrow to the three columns used for re-keying; astype returns a new frame,
# and id_alt is forced to str so it compares equal to str(obj.id) later on.
final_df = (
    missing_movies_df[['id', 'id_alt', 'title']]
    .astype({'id_alt': str})
)
final_df.head()

Unnamed: 0,id,id_alt,title
0,1,862,Toy Story
1,4,31357,Waiting to Exhale
2,7,11860,Sabrina
3,8,45325,Tom and Huck
4,9,9091,Sudden Death


In [17]:
# Plain Python list of the id_alt strings, used for chunked id__in queries.
alt_id_list = final_df['id_alt'].tolist()

In [20]:
# Count how many Movie rows have an id that appears in alt_id_list.
# The ids are queried in chunks so each id__in list stays small.
chunk_size = 100  # number of ids per query
total_ids = len(alt_id_list)
matched_total = 0

for i in range(0, total_ids, chunk_size):
    chunk_ids = alt_id_list[i:i + chunk_size]
    movies_qs = Movie.objects.filter(id__in=chunk_ids)
    count = movies_qs.count()
    # Accumulate; previously each chunk's count was overwritten and never used.
    matched_total += count

# Shown as the cell result: number of existing movies that would be inspected.
matched_total

In [22]:
from django.forms.models import model_to_dict

In [25]:
# Re-key movies: a Movie whose current id equals a row's id_alt (the tmdbId
# string) and whose title matches is recreated under that row's 'id', with
# its old id preserved in tmdb_id.
chunk_size = 100  # number of ids per id__in query
total_ids = len(alt_id_list)

for i in range(0, total_ids, chunk_size):
    chunk_ids = alt_id_list[i:i + chunk_size]
    movies_qs = Movie.objects.filter(id__in=chunk_ids)

    for obj in movies_qs:
        data = final_df[final_df['id_alt'] == str(obj.id)]

        # Only act on an unambiguous match: exactly one candidate row and an
        # identical title. Anything else is left untouched.
        if data.shape[0] != 1 or obj.title != data.iloc[0]['title']:
            continue

        # NOTE(review): model_to_dict may not return every model field
        # (e.g. non-editable ones) — confirm nothing is lost on recreate.
        og_model_data = model_to_dict(obj)
        og_model_data['tmdb_id'] = og_model_data['id']
        # Plain int rather than a numpy scalar for the new primary key value.
        og_model_data['id'] = int(data.iloc[0]['id'])

        # Delete and recreate atomically: if create() fails (for example the
        # new id is already taken), the delete is rolled back instead of
        # permanently losing the movie.
        # NOTE(review): delete() may cascade to related rows — verify.
        with transaction.atomic():
            obj.delete()
            Movie.objects.create(**og_model_data)

In [None]:
# movies_qs = Movie.objects.filter(id__in=alt_id_list)
# for obj in movies_qs:
#     data = final_df.copy()[final_df['id_alt'] == str(obj.id)]
#     if data.shape[0] == 1:
#         og_model_data = model_to_dict(obj)
#         update_data = data.to_dict('records')[0]
#         if obj.title == update_data.get('title'): 
#             print(og_model_data)
#             og_model_data['id'] = update_data['id']
#             new_model_data = {**og_model_data}
#             print(new_model_data)
            # obj.delete()
            # Movie.objects.create(**new_model_data)

In [26]:
# Run the project's rating-update task directly in the notebook process.
# The recorded output of this cell reports a run time of roughly 11 minutes.
from ratings.tasks import task_update_movie_ratings
task_update_movie_ratings()

Rating update took 0:11:10 (670.3056628704071s)


In [5]:
# Back-fill: every movie that still has no tmdb_id gets its current id copied in.
movies_without_tmdb_id = Movie.objects.filter(tmdb_id__isnull=True)

for movie in movies_without_tmdb_id:
    title, current_id, old_tmdb_id = movie.title, movie.id, movie.tmdb_id
    # Log the row as it was before the change (tmdb_id is still None here).
    print(title, current_id, old_tmdb_id)
    movie.tmdb_id = current_id
    movie.save()

Shadows in Paradise 3 None
Four Rooms 5 None
Star Wars 11 None
The Dark 17 None
Metropolis 19 None
9 Songs 27 None
Magnetic Rose 30 None
Unforgiven 33 None
The Simpsons Movie 35 None
Amores perros 55 None
Pirates of the Caribbean: Dead Man's Chest 58 None
2001: A Space Odyssey 62 None
Walk the Line 69 None
Million Dollar Baby 70 None
Mars Attacks! 75 None
Before Sunrise 76 None
Memento 77 None
Blade Runner 78 None
The Elementary Particles 86 None
Indiana Jones and the Temple of Doom 87 None
Dirty Dancing 88 None
Land Without Bread 91 None
Megacities 92 None
Armageddon 95 None
Tron 97 None
All About My Mother 99 None
Leon: The Professional 101 None
Open Hearts 102 None
Taxi Driver 103 None
Run Lola Run 104 None
Predator 106 None
Snatch 107 None
Three Colors: Blue 108 None
Three Colors: White 109 None
Three Colors: Red 110 None
Italian for Beginners 112 None
Spring, Summer, Fall, Winter... and Spring 113 None
Pretty Woman 114 None
Charlie and the Chocolate Factory 118 None
The Lord of th