In [121]:
import pandas as pd
from tqdm import tqdm
from tmdbv3api import Collection

import sys
sys.path.insert(0, '../data_prep')
from sqlite_utils import select_statement_to_df, get_from_table

In [122]:
all_collections_statement = """

SELECT COLLECTION_ID, COLLECTION_NAME, COUNT(*) AS FILM_COUNT
FROM FILM_COLLECTIONS
WHERE COLLECTION_ID != -1
GROUP BY COLLECTION_ID

"""

In [123]:
all_collections_df = select_statement_to_df(all_collections_statement)
all_collections_df

Unnamed: 0,COLLECTION_ID,COLLECTION_NAME,FILM_COUNT
0,10,Star Wars Collection,9
1,84,Indiana Jones Collection,5
2,119,The Lord of the Rings Collection,3
3,131,Three Colors Collection,3
4,151,Star Trek: The Original Series Collection,6
...,...,...,...
859,1105385,Tom Cody Collection,1
860,1117396,Death Trilogy,2
861,1118142,Grindhouse Collection,2
862,1118596,The Pope's Exorcist,1


In [124]:
def check_if_tmdb_id_ingested(tmdb_id):
    film_ingested = 0
    if len(select_statement_to_df('SELECT FILM_ID FROM TMDB_ID WHERE TMDB_ID = {}'.format(tmdb_id)).values) == 1:
        film_ingested = 1
    return film_ingested

In [125]:
def get_missing_films(collection_id):
    collection = Collection()
    collection_details = collection.details(collection_id)
    tmp_df = pd.DataFrame([{'TMDB_ID': x.id, 'FILM_TITLE': x.title, 'FILM_YEAR': x['release_date'][:4]}for x in collection_details['parts']])
    tmp_df['MISSING'] = 1 - tmp_df['TMDB_ID'].apply(check_if_tmdb_id_ingested)
    return tmp_df

In [126]:
all_collection_films = []
bad_collection_ids = []
for collection_id in all_collections_df['COLLECTION_ID']:
    try:
        all_collection_films.append(get_missing_films(collection_id))
    except:
        bad_collection_ids.append(collection_id)

In [127]:
bad_collection_ids

[846090, 1117396, 1118142, 1118596]

In [128]:
all_collection_films_df = pd.concat(all_collection_films).reset_index(drop=True)
all_collection_films_df

Unnamed: 0,TMDB_ID,FILM_TITLE,FILM_YEAR,MISSING
0,11,Star Wars,1977,0
1,1891,The Empire Strikes Back,1980,0
2,1892,Return of the Jedi,1983,0
3,1893,Star Wars: Episode I - The Phantom Menace,1999,0
4,1894,Star Wars: Episode II - Attack of the Clones,2002,0
...,...,...,...,...
2767,1811,Nowhere,1997,0
2768,14746,Streets of Fire,1984,0
2769,105040,Road to Hell,2008,1
2770,619264,The Platform,2019,0


In [129]:
MISSING = all_collection_films_df[all_collection_films_df['MISSING'] == 1].reset_index(drop=True)

In [130]:
len(MISSING)

783

In [134]:
[print(x, y) for x, y in zip(MISSING['FILM_TITLE'], MISSING['FILM_YEAR'])]

Last Friday 
National Treasure 3 
Cinderella II: Dreams Come True 2002
Return to Never Land 2002
Return to the Blue Lagoon 1991
TRON: Ares 
Open Season 2 2008
Open Season 3 2010
Open Season: Scared Silly 2015
Ip Man 4: The Finale 2019
Kung Fu Panda 4 2024
The Blood of a Poet 1932
Testament of Orpheus 1960
Beverly Hills Cop II 1987
Beverly Hills Cop III 1994
Beverly Hills Cop: Axel Foley 
District 13: Ultimatum 2009
My Girl 2 1994
Legally Blonde 3 
The Return of Jafar 1994
Aladdin and the King of Thieves 1996
Dirty Dancing: Havana Nights 2004
Despicable Me 4 2024
Vacancy 2: The First Cut 2008
Terror Trap 2010
Open Water 2: Adrift 2006
Cage Dive 2017
Johnny English Reborn 2011
Johnny English Strikes Again 2018
The Hangover Part III 2013
The Hangover Part II 2011
The Saint in New York 1938
The Saint in London 1939
The Saint In Palm Springs 1941
The Saint Strikes Back 1939
The Saint Takes Over 1940
The Saint's Vacation 1941
The Saint Meets the Tiger 1941
Cocoon: The Return 1988
Basic Insti

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

0                                  Last Friday
1                          National Treasure 3
2              Cinderella II: Dreams Come True
3                         Return to Never Land
4                    Return to the Blue Lagoon
                        ...                   
778    Left Behind: Vanished - Next Generation
779                                    Smile 2
780                                     Fall 2
781                               Road to Hell
782                             The Platform 2
Name: FILM_TITLE, Length: 783, dtype: object

In [131]:
round = -1
increment = 50

In [132]:
round += 1
MISSING.head(increment*(round+1)).tail(increment)

Unnamed: 0,TMDB_ID,FILM_TITLE,FILM_YEAR,MISSING
0,487560,Last Friday,,1
1,983058,National Treasure 3,,1
2,14128,Cinderella II: Dreams Come True,2002.0,1
3,16690,Return to Never Land,2002.0,1
4,13888,Return to the Blue Lagoon,1991.0,1
5,533533,TRON: Ares,,1
6,13690,Open Season 2,2008.0,1
7,51170,Open Season 3,2010.0,1
8,382517,Open Season: Scared Silly,2015.0,1
9,449924,Ip Man 4: The Finale,2019.0,1
