In [1]:
import requests
import re
import json
import numpy as np
from datetime import datetime
from bs4 import BeautifulSoup
import sys
sys.path.insert(0, '../data_prep')
from sqlite_utils import get_from_table

In [2]:
# Choose the film to inspect, then look up its title and Letterboxd link with
# one get_from_table call per requested field, and echo what was found.
film_id = 'f_0tSXS'
film_title, letterboxd_url = (
    get_from_table('FILM_TITLE', film_id, field_name)
    for field_name in ('FILM_TITLE', 'LETTERBOXD_URL')
)
print(
    'Film ID is "{}", film title is "{}", and url is {}'.format(
        film_id, film_title, letterboxd_url
    )
)

Film ID is "f_0tSXS", film title is "Sudden Death", and url is https://boxd.it/tSXS


In [3]:
# Request the film's Letterboxd link; the bare `r` on the last line echoes the
# response object as the cell output.
r = requests.get(letterboxd_url)
r

<Response [404]>

True

In [62]:
# r.url is the URL the response was actually served from. When it differs
# from the stored link, request that URL again and replace `r`.
# NOTE(review): requests.get follows redirects by default, so `r` may already
# hold this page and the second request may be redundant — confirm.
redirected_url = r.url
if letterboxd_url != redirected_url:
    r = requests.get(redirected_url)

In [63]:
# Parse the response body with the lxml parser.
soup = BeautifulSoup(r.content, 'lxml')

In [67]:
# Find the anchor whose data-track-action attribute is "TMDb" and read its
# href; raises AttributeError if the page has no such anchor.
tmdb_url = soup.find('a', {'data-track-action': 'TMDb'}).get('href')
# Strip the movie URL prefix and every remaining slash, leaving the id text.
tmdb_id = tmdb_url.replace('https://www.themoviedb.org/movie/', '').replace('/', '')

'17473'

In [None]:
# Example of the anchor element that the TMDb lookup searches for:
# <a href="https://www.themoviedb.org/movie/104846/" class="micro-button track-event" data-track-action="TMDb">TMDb</a>

In [6]:
# Extract the film slug, release year, genre slugs and aggregate rating from
# the parsed film page held in `soup`.
og_url = soup.find('meta', {'property': 'og:url'}).get('content')
# Second-to-last '/'-separated piece of og:url; presumably the URL ends with a
# trailing slash so this is the film slug — confirm.
film = og_url.split('/')[-2]
try:
    # The year is read from the first parenthesised group in og:title.
    year = int(
        re.search(
            r'\((.*?)\)',
            soup.find('meta', {'property': 'og:title'}).get('content'),
        ).group(1)
    )
except (AttributeError, TypeError, ValueError):
    # Meta tag missing, no parentheses in the title, or non-numeric content:
    # fall back to two years after the current year. Only these lookup/parse
    # errors are handled so that KeyboardInterrupt etc. still propagate.
    year = datetime.now().year + 2
# Genre slugs come from the 'text-slug' links whose href mentions 'genre'.
genre_list = [
    x.get('href').replace('/films/genre/', '').replace('/', '')
    for x in soup.find_all('a', {'class': 'text-slug'})
    if 'genre' in str(x.get('href'))
]

# The JSON payload is taken from the third line of the ld+json script text.
# NOTE(review): this relies on the page's exact line layout — confirm.
rating_dict = json.loads(soup.find('script', {'type':"application/ld+json"}).string.split('\n')[2]).get('aggregateRating')
# A missing aggregateRating block and a missing key both fall back to NaN.
has_rating = isinstance(rating_dict, dict)
rating_mean = rating_dict.get('ratingValue', np.nan) if has_rating else np.nan
rating_count = rating_dict.get('ratingCount', np.nan) if has_rating else np.nan

In [7]:
# Echo the rating count as the cell output.
rating_count

426

In [8]:
# Request the film's members/rated/.5-5 page, built from the slug in `film`.
r = requests.get('https://letterboxd.com/film/{}/members/rated/.5-5/'.format(film))

In [9]:
# Re-bind `soup` to the parsed members page (replaces the film-page soup).
soup = BeautifulSoup(r.content, 'lxml')

In [10]:
# Collect one integer per metric from the 'tooltip' links on the page; a
# metric that cannot be found or parsed is recorded as 0.
metrics_dict = {}
for i in ['members', 'fans', 'likes', 'reviews', 'lists']:
    href_str = '/film/{}/{}/'.format(film, i)
    try:
        metric_string = soup.find('a', {'class': 'tooltip', 'href': href_str}).get('title')
        # Keep the text before the first non-breaking space. partition()
        # yields the whole string when there is none, whereas slicing with
        # find() == -1 would silently drop the last character.
        metric = int(metric_string.partition('\xa0')[0].replace(',', ''))
    except (AttributeError, ValueError):
        # Link or title attribute missing (None), or the text is not a number.
        metric = 0
    metrics_dict[i] = metric

In [11]:
# Echo the collected metrics as the cell output.
metrics_dict

{'members': 582, 'fans': 0, 'likes': 72, 'reviews': 80, 'lists': 432}

In [12]:
# Request the film's /esi/film/<slug>/stats/ URL and re-bind `soup` to its
# parsed content.
r = requests.get('https://letterboxd.com/esi/film/{}/stats/'.format(film))
soup = BeautifulSoup(r.content, 'lxml')

In [14]:
# Read the Top 250 position from the stats fragment, if the film has one, and
# assemble the per-film record from the values gathered in earlier cells.
try:
    top_ = int(soup.find('a', {'class': 'has-icon icon-top250 icon-16 tooltip'}).text)
except (AttributeError, ValueError):
    # No matching link (find returned None) or its text is not an integer.
    # Only these errors are handled so KeyboardInterrupt etc. still propagate.
    top_ = None
letterboxd_info_dict = {
    'FILM_ID': film_id,
    'FILM_WATCH_COUNT': metrics_dict['members'],
    'FILM_FAN_COUNT': metrics_dict['fans'],
    'FILM_LIKES_COUNT': metrics_dict['likes'],
    'FILM_REVIEW_COUNT': metrics_dict['reviews'],
    'FILM_LIST_COUNT': metrics_dict['lists'],
    'FILM_TOP_250': top_,
    'FILM_RATING': rating_mean,
    'FILM_RATING_COUNT': rating_count,
}
letterboxd_info_dict

{'FILM_ID': 'f_016QI',
 'FILM_WATCH_COUNT': 582,
 'FILM_FAN_COUNT': 0,
 'FILM_LIKES_COUNT': 72,
 'FILM_REVIEW_COUNT': 80,
 'FILM_LIST_COUNT': 432,
 'FILM_TOP_250': None,
 'FILM_RATING': 2.62,
 'FILM_RATING_COUNT': 426}

In [15]:
# Build the genre record: the first scraped genre plus all genres joined with
# '/'. When no genres were scraped, FILM_GENRE is None and ALL_FILM_GENRES is
# an empty string instead of the cell failing with IndexError.
film_genre_dict = {
    'FILM_ID': film_id,
    'FILM_GENRE': genre_list[0] if genre_list else None,
    'ALL_FILM_GENRES': '/'.join(genre_list),
}
film_genre_dict

{'FILM_ID': 'f_016QI', 'FILM_GENRE': 'horror', 'ALL_FILM_GENRES': 'horror'}

In [57]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell so a fresh kernel surfaces the dependency up front.
from justwatch import JustWatch
# Update streaming info function:
# Search JustWatch (country 'GB') for the film's URL title within one year
# either side of the stored release year, and keep the first hit.
just_watch = JustWatch(country='GB')
film_url_title = get_from_table('FILM_TITLE', film_id, 'FILM_URL_TITLE')
film_release_year = get_from_table('FILM_YEAR', film_id, 'FILM_YEAR')
results = just_watch.search_for_item(query=film_url_title, release_year_from=film_release_year-1, release_year_until=film_release_year+1)
# A search can return no items; use an empty dict in that case so the later
# first_result.get(...) checks simply do not match, instead of this cell
# raising IndexError.
first_result = next(iter(results.get('items') or []), {})


In [58]:
# Echo the first search hit as the cell output.
first_result

{'jw_entity_id': 'tm137445',
 'id': 137445,
 'title': 'The Room',
 'full_path': '/uk/movie/the-room-2003',
 'full_paths': {'MOVIE_DETAIL_OVERVIEW': '/uk/movie/the-room-2003'},
 'poster': '/poster/176218423/{profile}',
 'poster_blur_hash': 'TRBzg^tQ4Ug2t7MyaLV[x[DjWBx[',
 'original_release_year': 2003,
 'object_type': 'movie',
 'scoring': [{'provider_type': 'jw:popular_1_day', 'value': 3.34},
  {'provider_type': 'imdb:votes', 'value': 91334},
  {'provider_type': 'tmdb:score', 'value': 4.024},
  {'provider_type': 'jw:popular_7_day', 'value': 3.28},
  {'provider_type': 'jw:popular_30_day', 'value': 11.47},
  {'provider_type': 'imdb:multiplied', 'value': 328802.4},
  {'provider_type': 'tmdb:votes', 'value': 1219},
  {'provider_type': 'imdb:popularity', 'value': 1633},
  {'provider_type': 'tmdb:popularity', 'value': 10.898},
  {'provider_type': 'jw:trending_7_day', 'value': 3.04},
  {'provider_type': 'tmdb:id', 'value': 17473},
  {'provider_type': 'jw:popular_180_day', 'value': 23.41},
  {'

In [45]:
# Ad-hoc comparison search: query 'jumanji' restricted to release year 1995
# and keep the first hit.
# NOTE(review): indexing [0] raises IndexError if the search returns no items.
results2 = just_watch.search_for_item(query='jumanji', release_year_from=1995, release_year_until=1995)
first_result2 = results2['items'][0]


dict_keys(['jw_entity_id', 'id', 'title', 'full_path', 'full_paths', 'poster', 'poster_blur_hash', 'original_release_year', 'object_type', 'offers', 'scoring', 'cinema_release_date'])

[]

In [50]:
# When the top hit's title equals the stored film title, gather the distinct
# package short names of its offers whose monetization type is flatrate, free
# or ads. A hit with no 'offers' key yields an empty list.
if first_result.get('title') == get_from_table('FILM_TITLE', film_id, 'FILM_TITLE'):
    provider_abbreviations = list({
        offer['package_short_name']
        for offer in first_result.get('offers', [])
        if offer['monetization_type'] in ('flatrate', 'free', 'ads')
    })

In [24]:
# Echo which keys the first search hit carries.
first_result.keys()

dict_keys(['jw_entity_id', 'id', 'title', 'full_path', 'full_paths', 'poster', 'poster_blur_hash', 'original_release_year', 'object_type', 'scoring'])

In [None]:
# Record which of the tracked streaming services carry the film, but only
# when the top JustWatch hit's title equals the stored film title.
# NOTE(review): pd, my_streaming_services_abbr, abbr_to_full_dict,
# insert_record_into_table and df_to_table are not defined in the cells shown
# here — confirm they are imported/defined before this cell runs.
if first_result.get('title') == get_from_table('FILM_TITLE', film_id, 'FILM_TITLE'):
    # Not every search hit carries an 'offers' key, so default to no offers
    # rather than raising KeyError.
    provider_abbreviations = list(set([x['package_short_name'] for x in first_result.get('offers', []) if x['monetization_type'] in ['flatrate', 'free', 'ads']]))
    # Keep only the providers that appear in my_streaming_services_abbr.
    valid_abbr = [x for x in provider_abbreviations if x in my_streaming_services_abbr]
    if valid_abbr:
        insert_record_into_table({'FILM_ID':film_id}, 'FILMS_AVAILABLE_TO_STREAM')
        # Map each abbreviation to its full name (None when unmapped).
        valid_full = [abbr_to_full_dict.get(x) for x in valid_abbr]
        # One row per matching service, all carrying the same film id.
        film_streaming_services_df = pd.DataFrame(index=range(len(valid_abbr)))
        film_streaming_services_df['FILM_ID'] = film_id
        film_streaming_services_df['STREAMING_SERVICE_ABBR'] = valid_abbr
        film_streaming_services_df['STREAMING_SERVICE_FULL'] = valid_full
        df_to_table(film_streaming_services_df, 'FILM_STREAMING_SERVICES', replace_append='append', verbose=True)