In [22]:
import pandas as pd
from tqdm import tqdm
from tmdbv3api import Collection

import sys
sys.path.insert(0, '../data_prep')
from sqlite_utils import select_statement_to_df, get_from_table

In [23]:
# SQL that returns one row per COLLECTION_ID in FILM_COLLECTIONS together with
# the number of rows recorded for it (FILM_COUNT).
# Rows with COLLECTION_ID = -1 are excluded — presumably the placeholder for
# "no collection"; confirm against the ingestion code.
all_collections_statement = """

SELECT COLLECTION_ID, COLLECTION_NAME, COUNT(*) AS FILM_COUNT
FROM FILM_COLLECTIONS
WHERE COLLECTION_ID != -1
GROUP BY COLLECTION_ID

"""

In [24]:
# Run the per-collection count query through the project's SQLite helper and
# display the result.
all_collections_df = select_statement_to_df(all_collections_statement)
all_collections_df

Unnamed: 0,COLLECTION_ID,COLLECTION_NAME,FILM_COUNT
0,10,Star Wars Collection,9
1,84,Indiana Jones Collection,5
2,119,The Lord of the Rings Collection,3
3,131,Three Colors Collection,3
4,151,Star Trek: The Original Series Collection,6
...,...,...,...
1063,1130274,Salem’s Lot Series,1
1064,1131062,Wong Kar-Wai’s Love Trilogy,3
1065,1135770,Showa collection,1
1066,1136060,Evil Dead reboot collection,1


In [25]:
def check_if_tmdb_id_ingested(tmdb_id):
    """Return 1 if `tmdb_id` matches at least one row of the TMDB_ID table, else 0.

    Parameters
    ----------
    tmdb_id : int or int-like
        TMDB identifier to look up. It is coerced with ``int()`` before being
        placed in the SQL text.

    Returns
    -------
    int
        1 when the lookup returns one or more rows, 0 when it returns none.

    Raises
    ------
    ValueError, TypeError
        If `tmdb_id` cannot be converted to an integer.
    """
    # Coerce to int so that only a numeric literal is ever interpolated into
    # the SQL statement.
    statement = 'SELECT FILM_ID FROM TMDB_ID WHERE TMDB_ID = {}'.format(int(tmdb_id))
    matches = select_statement_to_df(statement)
    # Any match counts as ingested. Requiring exactly one row would report an
    # id that appears on several rows as missing.
    return 1 if len(matches.values) > 0 else 0

In [26]:
def _release_year(part):
    """Return the first four characters of a part's release date, or '' if it has none."""
    try:
        release_date = part['release_date']
    except (KeyError, AttributeError):
        release_date = None
    # A None/absent date yields '' — the same value an empty date string gives.
    return release_date[:4] if release_date else ''


def get_missing_films(collection_id):
    """List the films of a TMDB collection and flag those not yet ingested.

    Fetches the collection through ``tmdbv3api.Collection.details`` and checks
    each part with ``check_if_tmdb_id_ingested`` (one lookup per film).

    Parameters
    ----------
    collection_id : int
        TMDB collection identifier; copied into the COLLECTION_ID column.

    Returns
    -------
    pandas.DataFrame
        Columns TMDB_ID, FILM_TITLE, FILM_YEAR, COLLECTION_ID and MISSING,
        one row per part. MISSING is 1 when the film is not ingested and 0
        when it is. A collection without parts yields an empty frame that
        still carries all five columns.
    """
    collection_details = Collection().details(collection_id)
    records = [
        {
            'TMDB_ID': part.id,
            'FILM_TITLE': part.title,
            'FILM_YEAR': _release_year(part),
            'COLLECTION_ID': collection_id,
        }
        for part in collection_details['parts']
    ]
    # Naming the columns keeps the schema stable when `records` is empty;
    # otherwise the TMDB_ID lookup below raises KeyError.
    films = pd.DataFrame(
        records,
        columns=['TMDB_ID', 'FILM_TITLE', 'FILM_YEAR', 'COLLECTION_ID'],
    )
    films['MISSING'] = [
        1 - check_if_tmdb_id_ingested(tmdb_id) for tmdb_id in films['TMDB_ID']
    ]
    return films

In [27]:
# Collect one films DataFrame per collection. Collections whose lookup fails
# are recorded instead of aborting the whole run.
all_collection_films = []
bad_collection_ids = []
collection_errors = {}  # collection_id -> repr of the exception that was raised
for collection_id in tqdm(all_collections_df['COLLECTION_ID']):
    try:
        all_collection_films.append(get_missing_films(collection_id))
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit can still stop the loop; keep the
        # reason so failures can be diagnosed afterwards.
        bad_collection_ids.append(collection_id)
        collection_errors[collection_id] = repr(exc)

  0%|          | 0/1068 [00:00<?, ?it/s]

100%|██████████| 1068/1068 [03:17<00:00,  5.41it/s] 


In [28]:
# Collection ids for which get_missing_films raised during the loop above.
bad_collection_ids

[101471,
 101688,
 102322,
 102452,
 102777,
 102782,
 103190,
 103291,
 103372,
 103573,
 103577,
 104716,
 104830,
 105075,
 105324,
 105625,
 105995,
 106000,
 106498,
 106768,
 107674,
 107725,
 107949,
 108043,
 108125,
 108170,
 108693,
 108804,
 109076,
 109609,
 110021,
 110127,
 111751,
 112399,
 112636,
 113589,
 114783,
 114915,
 115142,
 115225,
 115570,
 115575,
 115762,
 115776,
 115822,
 115838,
 116661,
 116669,
 116847,
 117693,
 117927,
 117952,
 118221,
 119240,
 119674,
 120794,
 121195,
 121938,
 122017,
 122922,
 122952,
 123203,
 123213,
 123218,
 123249,
 123255,
 123256,
 123717,
 123720,
 123724,
 123726,
 123800,
 123932,
 124188,
 124492,
 124881,
 124901,
 124916,
 124930,
 124934,
 124935,
 124949,
 124950,
 124951,
 125570,
 125574,
 126125,
 126209,
 126220,
 126221,
 126580,
 131292,
 131295,
 131296,
 131635,
 131780,
 133352,
 133830,
 133923,
 134897,
 135179,
 135416,
 135466,
 135468,
 135483,
 135489,
 135495,
 135498,
 135501,
 136214,
 136218,
 

In [29]:
# Stack the per-collection frames into one table with a fresh 0..n-1 row index.
all_collection_films_df = pd.concat(all_collection_films, ignore_index=True)
all_collection_films_df

Unnamed: 0,TMDB_ID,FILM_TITLE,FILM_YEAR,COLLECTION_ID,MISSING
0,11,Star Wars,1977,10,0
1,1891,The Empire Strikes Back,1980,10,0
2,1892,Return of the Jedi,1983,10,0
3,1893,Star Wars: Episode I - The Phantom Menace,1999,10,0
4,1894,Star Wars: Episode II - Attack of the Clones,2002,10,0
...,...,...,...,...,...
1104,13654,101 Dalmatians II: Patch's London Adventure,2002,100693,0
1105,10865,Atlantis: The Lost Empire,2001,100965,0
1106,8965,Atlantis: Milo's Return,2003,100965,0
1107,10948,The Fox and the Hound,1981,100970,0


In [30]:
# Keep only the films flagged MISSING == 1, renumbered from 0.
MISSING = (
    all_collection_films_df
    .loc[all_collection_films_df['MISSING'].eq(1)]
    .reset_index(drop=True)
)

In [31]:
# Number of films flagged as missing.
MISSING.shape[0]

70

In [32]:
# Missing-film count per collection, largest first (top 24 rows).
(
    MISSING
    .groupby('COLLECTION_ID')
    .agg({'FILM_TITLE': 'count'})
    .reset_index()
    .sort_values('FILM_TITLE', ascending=False)
    .head(24)
)

Unnamed: 0,COLLECTION_ID,FILM_TITLE
14,55428,8
2,12087,5
7,33381,5
9,47814,5
29,96676,3
25,89188,3
10,48188,3
1,8783,3
0,8647,2
32,98435,2


In [33]:
# Every film fetched for collection 55428, ingested or not.
all_collection_films_df.loc[all_collection_films_df['COLLECTION_ID'].eq(55428)]

Unnamed: 0,TMDB_ID,FILM_TITLE,FILM_YEAR,COLLECTION_ID,MISSING
730,9975,Curious George,2006,55428,0
731,23903,Curious George 2: Follow That Monkey!,2009,55428,1
732,338103,Curious George 3: Back to the Jungle,2015,55428,1
733,270029,Curious George Swings Into Spring,2013,55428,1
734,263945,Curious George: A Very Monkey Christmas,2009,55428,1
735,256051,Curious George: A Halloween Boo Fest,2013,55428,1
736,627093,Curious George: Royal Monkey,2019,55428,1
737,743904,"Curious George: Go West, Go Wild",2020,55428,1
738,918832,Curious George: Cape Ahoy,2021,55428,1


In [None]:
# Missing films that belong to collection 1102946.
MISSING.loc[MISSING['COLLECTION_ID'].eq(1102946)]

Unnamed: 0,TMDB_ID,FILM_TITLE,FILM_YEAR,COLLECTION_ID,MISSING
807,81980,2001 Arizona Diamondbacks: The Official World ...,2001,1102946,1
808,470108,2002 Anaheim Angels: The Official World Series...,2002,1102946,1
809,174849,2003 Florida Marlins: The Official World Serie...,2003,1102946,1
810,58017,2004 Boston Red Sox: The Official World Series...,2004,1102946,1
811,217659,2005 Chicago White Sox: The Official World Ser...,2005,1102946,1
812,715330,2006 St. Louis Cardinals: The Official World S...,2006,1102946,1
813,58021,2007 Boston Red Sox: The Official World Series...,2007,1102946,1
814,61969,2008 Philadelphia Phillies: The Official World...,2008,1102946,1
815,228548,2009 New York Yankees: The Official World Seri...,2009,1102946,1
816,79879,2010 San Francisco Giants: The Official World ...,2010,1102946,1


In [None]:
# Paging state for browsing MISSING: `round` is the current page index
# (starts at -1 so the first increment lands on page 0) and `increment` is the
# number of rows per page.
# NOTE(review): `round` shadows the builtin round() for the rest of the
# notebook; renaming it would have to be done together with the paging cell
# that reads it.
round = -1
increment = 50

In [None]:
# Advance one page on every run of this cell (stateful by design; `round` and
# `increment` are initialised in an earlier cell).
round += 1
# Slice by position so that a short final page shows only its own rows.
# head(k).tail(increment) would re-show rows from the previous page whenever
# fewer than `increment` rows remain, and would keep repeating the last rows
# once the end has been passed; this slice is simply empty there.
MISSING.iloc[increment * round:increment * (round + 1)]

Unnamed: 0,TMDB_ID,FILM_TITLE,FILM_YEAR,COLLECTION_ID,MISSING
0,8460,The Skulls II,2002.0,8647,1
1,10594,The Skulls III,2004.0,8647,1
2,10380,An American Tail: Fievel Goes West,1991.0,8783,1
3,31473,An American Tail: The Treasure of Manhattan Is...,1998.0,8783,1
4,27653,An American Tail: The Mystery of the Night Mon...,1999.0,8783,1
5,10869,Herbie Rides Again,1974.0,12087,1
6,12129,Herbie Goes Bananas,1980.0,12087,1
7,14140,Herbie Goes to Monte Carlo,1977.0,12087,1
8,14136,The Love Bug,1968.0,12087,1
9,32643,The Love Bug,1997.0,12087,1
