In [10]:
import os
import shutil
import sqlite3 as sql
import sys

import numpy as np, pandas as pd
import requests
from bs4 import BeautifulSoup
# import seaborn as sns
# import plotly.graph_objs as go
from dotenv import load_dotenv

# The project helpers live outside the notebook directory, so extend the
# import path before importing them.
sys.path.insert(0, '../data_prep')
from sqlite_utils import get_list_of_tables, get_from_table

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is what supplies PROJECT_PATH, which a later
# cell reads with os.getenv — confirm.
load_dotenv()

def print_film_db_entries(film_id):
    """Print the entry stored for ``film_id`` in every table of the database.

    The film's title is looked up once from the ``FILM_TITLE`` table and used
    to label every printed line; if that lookup fails, the placeholder
    ``'MISSING!!!'`` is used instead. Each table name returned by
    ``get_list_of_tables()`` is then queried with ``get_from_table`` and the
    result printed. A table whose lookup raises is reported as having no
    entry.

    Args:
        film_id: Identifier passed through unchanged to ``get_from_table``.

    Returns:
        None. All output is written with ``print``.
    """
    # ``except Exception`` (not a bare ``except``) so that KeyboardInterrupt
    # and SystemExit still stop the loop instead of being reported as a
    # missing entry.
    try:
        film_title = get_from_table('FILM_TITLE', film_id, 'FILM_TITLE')
    except Exception:
        film_title = 'MISSING!!!'
    for table in get_list_of_tables():
        try:
            entry = get_from_table(table, film_id)
        except Exception:
            print('\nNo Entry for {} ({}) in the {} table.'.format(film_title, film_id, table))
        else:
            # Printing sits outside the try so only the lookup is guarded.
            print('\nEntry for {} ({}) in the {} table is:\n{}'.format(film_title, film_id, table, entry))

def compare_film_db_entries(film_ids):
    """Print, table by table, the entries of several films for comparison.

    For every table name returned by ``get_list_of_tables()`` the entry of
    each film in ``film_ids`` is printed in turn, so rows from the same table
    appear next to each other. Titles come from the ``FILM_TITLE`` table; a
    film whose title lookup fails is labelled ``'MISSING!!!'``. A table
    lookup that raises is reported as having no entry.

    Args:
        film_ids: Iterable of film identifiers. It is consumed once up
            front, so a one-shot iterator works as well as a list.

    Returns:
        None. All output is written with ``print``.
    """
    # Resolve each title once instead of once per (table, film) pair: the
    # title does not depend on the table being printed.
    labelled_films = []
    for film_id in film_ids:
        try:
            film_title = get_from_table('FILM_TITLE', film_id, 'FILM_TITLE')
        except Exception:
            # ``except Exception`` keeps KeyboardInterrupt/SystemExit fatal.
            film_title = 'MISSING!!!'
        labelled_films.append((film_id, film_title))

    for table in get_list_of_tables():
        for film_id, film_title in labelled_films:
            try:
                entry = get_from_table(table, film_id)
            except Exception:
                print('\nNo Entry for {} ({}) in the {} table.'.format(film_title, film_id, table))
            else:
                print('\nEntry for {} ({}) in the {} table is:\n{}'.format(film_title, film_id, table, entry))




# Spot-check: dump every table's entry for a single film ID.
print_film_db_entries('f_0hTha')


Entry for Parasite (f_0hTha) in the FILM_YEAR table is:
{'FILM_ID': 'f_0hTha', 'FILM_YEAR': 2019, 'FILM_DECADE': '2010s', 'CREATED_AT': '2023-05-25 18:41:56.053461'}

Entry for Parasite (f_0hTha) in the FILM_LETTERBOXD_STATS table is:
{'FILM_ID': 'f_0hTha', 'FILM_WATCH_COUNT': 3527322, 'FILM_FAN_COUNT': 83457, 'FILM_LIKES_COUNT': 2032650, 'FILM_REVIEW_COUNT': 295813, 'FILM_LIST_COUNT': 493539, 'FILM_RATING': 4.57, 'FILM_RATING_COUNT': 2600927, 'CREATED_AT': '2024-02-27 16:54:26.145539'}

Entry for Parasite (f_0hTha) in the FILM_STREAMING_SERVICES table is:
{'FILM_ID': 'f_0hTha', 'STREAMING_SERVICE_ABBR': 'netflix', 'STREAMING_SERVICE_FULL': 'Netflix', 'CREATED_AT': '2024-02-02 16:48:43.277617', 'PRICE': 0, 'VALID': 1}

Entry for Parasite (f_0hTha) in the FILM_GENRE table is:
{'FILM_ID': 'f_0hTha', 'FILM_GENRE': 'comedy', 'ALL_FILM_GENRES': 'comedy/drama/thriller', 'CREATED_AT': '2023-05-25 18:41:56.064533'}

Entry for Parasite (f_0hTha) in the INGESTED table is:
{'FILM_ID': 'f_0hTha',

In [14]:
# Fetch the Letterboxd "members who rated" page for one film and parse it.
# `requests` and `BeautifulSoup` are imported in the notebook's import cell.
film_id = 'f_012Ci'
film_url_title = get_from_table('FILM_URL_TITLE', film_id, 'FILM_URL_TITLE')
initial_url = 'https://letterboxd.com/film/{}/members/rated/.5-5/'.format(film_url_title)
# The timeout stops the cell hanging indefinitely on a stalled connection;
# raise_for_status() surfaces 4xx/5xx responses instead of silently parsing
# an error page.
r = requests.get(initial_url, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, 'lxml')

'dead-kids-1981'

In [8]:
# List every anchor carrying the `tooltip` class. In the output below, the
# members/fans/likes/reviews/lists links hold their counts in the `title`
# attribute.
soup.find_all('a', {'class': 'tooltip'})

[<a class="tooltip" href="/film/dead-kids-1981/members/" title="6,439 people">
 				Members
 			</a>,
 <a class="tooltip" href="/film/dead-kids-1981/fans/" title="1 fan">
 				Fans
 			</a>,
 <a class="tooltip" href="/film/dead-kids-1981/likes/" title="1,381 likes">
 				Likes
 			</a>,
 <a class="tooltip" href="/film/dead-kids-1981/reviews/" title="1,394 reviews">
 				Reviews
 			</a>,
 <a class="tooltip" href="/film/dead-kids-1981/lists/" title="4,721 lists">
 				Lists
 			</a>,
 <a class="has-icon icon-16 icon-review tooltip" data-username="rejectedslasher" href="/rejectedslasher/film/dead-kids-1981/" title="Read review">Review</a>,
 <a class="has-icon icon-16 icon-review tooltip" data-username="ghlewmer" href="/ghlewmer/film/dead-kids-1981/" title="Read review">Review</a>,
 <a class="has-icon icon-16 icon-review tooltip" data-username="shawnr60" href="/shawnr60/film/dead-kids-1981/" title="Read review">Review</a>,
 <a class="trigger tooltip" href="https://twitter.com/letterboxd" 

In [6]:
# Display the whole parsed document for manual inspection (very large output).
soup

<!DOCTYPE html>
<html class="no-mobile no-js" id="html" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=1024" name="viewport"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="Everyone who has watched Dead Kids" name="description"/>
<meta content="https://letterboxd.com/film/dead-kids-1981/members/rated/.5-5/" property="og:url"/>
<meta content="Everyone who has watched Dead Kids" property="og:title"/>
<meta content="Everyone who has watched Dead Kids" property="og:description"/>
<meta content="https://s.ltrbxd.com/static/img/default-share.e38c5d62.png" property="og:image"/>
<meta content="Letterboxd" name="application-name"/>
<meta content="#14181C" name="theme-color"/>
<meta content="#445566" name="msapplication-TileColor"/>
<meta content="app-id=1054271011, affiliate-data=11l5KW, app-argument=https://letterboxd.com/film/dead-kids-1981/members/rated/.5-5/" name="apple-itunes-app"/>
<meta content="yes" name="mobile-web-app-capable"/>
<title

In [None]:

def _leading_count(title_text):
    """Return the integer at the start of a tooltip title such as '6,439 people'."""
    # str.split() with no argument splits on any Unicode whitespace, so this
    # works whether the number is followed by a plain space or a non-breaking
    # space ('\xa0'). Slicing up to str.find('\xa0') would instead chop the
    # last character whenever the non-breaking space is absent (find -> -1).
    return int(title_text.split()[0].replace(',', ''))


# Collect the five headline counts from the tooltip links of the parsed page.
metrics_dict = {}
for metric_name in ['members', 'fans', 'likes', 'reviews', 'lists']:
    href_str = '/film/{}/{}/'.format(film_url_title, metric_name)
    try:
        metric_string = soup.find('a', {'class': 'tooltip', 'href': href_str}).get('title')
        metric = _leading_count(metric_string)
    except (AttributeError, IndexError, ValueError):
        # AttributeError: link not found, or it has no title attribute.
        # IndexError: empty title. ValueError: title does not start with a number.
        # Any of these falls back to 0, as before.
        metric = 0
    metrics_dict[metric_name] = metric

In [9]:
# Locate the raw Letterboxd export folders under <PROJECT_PATH>/db/raw_exports.
project_path = os.getenv('PROJECT_PATH')
if project_path is None:
    # Fail with a clear message rather than the TypeError that
    # os.path.join raises when handed None.
    raise RuntimeError('PROJECT_PATH is not set; define it in the environment or the .env file.')
letterboxd_exports_folder_dir = os.path.join(project_path, 'db/raw_exports')
# os.listdir returns names in arbitrary order. Sort them so that positional
# indexing in later cells is deterministic.
letterboxd_export_folders = sorted(os.listdir(letterboxd_exports_folder_dir))
len(letterboxd_export_folders)

34

In [34]:
# Scratch check: five evenly spaced fractions from 0 to 1.
[x/4 for x in range(5)]

[0.0, 0.25, 0.5, 0.75, 1.0]

In [23]:
# Map the fractions 0, 0.25, 0.5, 0.75 and 1 of the last valid index onto
# integer positions (truncated towards zero by int()).
ind_to_keep = [
    int((len(letterboxd_export_folders) - 1) * fraction)
    for fraction in [0, 0.25, 0.5, 0.75, 1]
]
ind_to_keep

[0, 8, 16, 24, 33]

In [53]:
# Choose evenly spaced exports (always including the first and last in sorted
# order) to keep; every other export is listed for deletion.
n_folders_to_keep = 5  # own name, so `folders_to_keep` only ever holds the list

# Equal steps from 0 to 1. max(..., 1) avoids a ZeroDivisionError when only
# one folder is to be kept.
raw_ind_to_keep = [x / max(n_folders_to_keep - 1, 1) for x in range(n_folders_to_keep)]
ind_to_keep = [int((len(letterboxd_export_folders) - 1) * x) for x in raw_ind_to_keep]
# Sort once rather than re-sorting the whole listing for every kept index.
sorted_export_folders = sorted(letterboxd_export_folders)
folders_to_keep = [sorted_export_folders[x] for x in ind_to_keep]
folders_to_delete = [x for x in letterboxd_export_folders if x not in folders_to_keep]
folders_to_keep

['letterboxd-td1138-2023-04-05-09-18-utc',
 'letterboxd-td1138-2023-06-13-15-54-utc',
 'letterboxd-td1138-2023-06-26-10-06-utc',
 'letterboxd-td1138-2023-07-04-14-41-utc',
 'letterboxd-td1138-2023-07-31-23-48-utc']

In [54]:
# Inspect the fractional positions computed in the selection cell.
raw_ind_to_keep

[0.0, 0.25, 0.5, 0.75, 1.0]

In [55]:
# Review the exports that are NOT being kept before anything is deleted.
folders_to_delete

['letterboxd-td1138-2023-06-02-14-42-utc',
 'letterboxd-td1138-2023-06-05-05-38-utc',
 'letterboxd-td1138-2023-06-05-07-52-utc',
 'letterboxd-td1138-2023-06-06-11-18-utc',
 'letterboxd-td1138-2023-06-07-16-52-utc',
 'letterboxd-td1138-2023-06-08-11-27-utc',
 'letterboxd-td1138-2023-06-13-10-08-utc',
 'letterboxd-td1138-2023-06-13-16-42-utc',
 'letterboxd-td1138-2023-06-14-10-39-utc',
 'letterboxd-td1138-2023-06-15-11-49-utc',
 'letterboxd-td1138-2023-06-19-21-58-utc',
 'letterboxd-td1138-2023-06-20-16-10-utc',
 'letterboxd-td1138-2023-06-22-16-15-utc',
 'letterboxd-td1138-2023-06-23-17-23-utc',
 'letterboxd-td1138-2023-06-26-23-06-utc',
 'letterboxd-td1138-2023-06-27-20-21-utc',
 'letterboxd-td1138-2023-06-27-23-22-utc',
 'letterboxd-td1138-2023-06-28-00-21-utc',
 'letterboxd-td1138-2023-06-28-14-23-utc',
 'letterboxd-td1138-2023-06-30-14-36-utc',
 'letterboxd-td1138-2023-07-03-10-34-utc',
 'letterboxd-td1138-2023-07-10-13-06-utc',
 'letterboxd-td1138-2023-07-13-09-08-utc',
 'letterbox

In [24]:
# Tabulate the sorted exports with a 0/1 flag marking the positions listed in
# ind_to_keep.
tmp = pd.DataFrame(sorted(letterboxd_export_folders), columns=['export'])
# One vectorised membership test on the index replaces a per-row np.where
# call on a scalar condition and yields a plain integer column.
tmp['keep'] = tmp.index.isin(ind_to_keep).astype(int)
tmp

Unnamed: 0,export,keep
0,letterboxd-td1138-2023-04-05-09-18-utc,1
1,letterboxd-td1138-2023-06-02-14-42-utc,0
2,letterboxd-td1138-2023-06-05-05-38-utc,0
3,letterboxd-td1138-2023-06-05-07-52-utc,0
4,letterboxd-td1138-2023-06-06-11-18-utc,0
5,letterboxd-td1138-2023-06-07-16-52-utc,0
6,letterboxd-td1138-2023-06-08-11-27-utc,0
7,letterboxd-td1138-2023-06-13-10-08-utc,0
8,letterboxd-td1138-2023-06-13-15-54-utc,1
9,letterboxd-td1138-2023-06-13-16-42-utc,0


In [None]:
# TODO: unfinished cell — the handling for more than five exports has not been
# written yet.
if len(letterboxd_export_folders) > 5:
    pass  # placeholder body so the cell parses instead of raising a SyntaxError

In [None]:


# Prune a five-export directory down to three by deleting the 2nd and 4th
# exports in sorted order.
# DESTRUCTIVE: shutil.rmtree removes each folder and its contents permanently.
if len(letterboxd_export_folders) == 5:
    # Index into a sorted copy: os.listdir order is arbitrary, so positions
    # in the raw listing do not reliably mean "second" and "fourth".
    ordered_export_folders = sorted(letterboxd_export_folders)
    folders_to_delete = [ordered_export_folders[1], ordered_export_folders[3]]
    for folder in folders_to_delete:
        # Build the path from the directory that was actually listed.
        shutil.rmtree(os.path.join(letterboxd_exports_folder_dir, folder))



# NOTE(review): `files` is not defined in any cell visible here — confirm it
# exists on a fresh kernel or remove this line.
files

In [3]:
# Spot-check: dump every table's entry for a single film ID.
print_film_db_entries('f_023Uu')


Entry for Southland Tales (f_023Uu) in the FILM_YEAR table is:
{'FILM_ID': 'f_023Uu', 'FILM_YEAR': 2006, 'FILM_DECADE': '2000s', 'CREATED_AT': '2023-05-25 17:16:44.314366'}

Entry for Southland Tales (f_023Uu) in the FILM_LETTERBOXD_STATS table is:
{'FILM_ID': 'f_023Uu', 'FILM_WATCH_COUNT': 38186, 'FILM_FAN_COUNT': 339, 'FILM_LIKES_COUNT': 7520, 'FILM_REVIEW_COUNT': 8600, 'FILM_LIST_COUNT': 12394, 'FILM_TOP_250': None, 'FILM_RATING': 2.9, 'FILM_RATING_COUNT': 23177, 'CREATED_AT': '2023-05-23 14:47:55.357009'}

Entry for Southland Tales (f_023Uu) in the FILM_STREAMING_SERVICES table is:
{'FILM_ID': 'f_023Uu', 'STREAMING_SERVICE_ABBR': 'ast', 'STREAMING_SERVICE_FULL': 'Lionsgate+', 'CREATED_AT': '2023-05-30 10:03:21.151106', 'PRICE': 0, 'VALID': 1}

Entry for Southland Tales (f_023Uu) in the FILM_GENRE table is:
{'FILM_ID': 'f_023Uu', 'FILM_GENRE': 'comedy', 'ALL_FILM_GENRES': 'comedy/science-fiction/thriller', 'CREATED_AT': '2023-05-25 17:16:44.326227'}

Entry for Southland Tales (f_02

In [4]:
# Spot-check: dump every table's entry for a single film ID.
print_film_db_entries('f_01U58')


Entry for Salem's Lot (f_01U58) in the FILM_YEAR table is:
{'FILM_ID': 'f_01U58', 'FILM_YEAR': 1979, 'FILM_DECADE': '1970s', 'CREATED_AT': '2023-05-25 19:14:32.607036'}

Entry for Salem's Lot (f_01U58) in the FILM_LETTERBOXD_STATS table is:
{'FILM_ID': 'f_01U58', 'FILM_WATCH_COUNT': 32150, 'FILM_FAN_COUNT': 51, 'FILM_LIKES_COUNT': 6924, 'FILM_REVIEW_COUNT': 3825, 'FILM_LIST_COUNT': 15246, 'FILM_TOP_250': None, 'FILM_RATING': 3.3, 'FILM_RATING_COUNT': 19872, 'CREATED_AT': '2023-05-15 17:27:05.078580'}

Entry for Salem's Lot (f_01U58) in the FILM_STREAMING_SERVICES table is:
{'FILM_ID': 'f_01U58', 'STREAMING_SERVICE_ABBR': 'rent', 'STREAMING_SERVICE_FULL': 'Rental', 'CREATED_AT': '2023-05-24 12:04:12.084305', 'PRICE': 3.49, 'VALID': 1}

Entry for Salem's Lot (f_01U58) in the FILM_GENRE table is:
{'FILM_ID': 'f_01U58', 'FILM_GENRE': 'drama', 'ALL_FILM_GENRES': 'drama/mystery', 'CREATED_AT': '2023-05-25 19:14:32.620051'}

Entry for Salem's Lot (f_01U58) in the INGESTED table is:
{'FILM_ID

In [5]:
# Spot-check: dump every table's entry for a single film ID.
print_film_db_entries('f_06BLY')


Entry for The Last Showing (f_06BLY) in the FILM_YEAR table is:
{'FILM_ID': 'f_06BLY', 'FILM_YEAR': 2014, 'FILM_DECADE': '2010s', 'CREATED_AT': '2023-05-25 20:08:29.268661'}

Entry for The Last Showing (f_06BLY) in the FILM_LETTERBOXD_STATS table is:
{'FILM_ID': 'f_06BLY', 'FILM_WATCH_COUNT': 1226, 'FILM_FAN_COUNT': 0, 'FILM_LIKES_COUNT': 137, 'FILM_REVIEW_COUNT': 183, 'FILM_LIST_COUNT': 438, 'FILM_TOP_250': None, 'FILM_RATING': 2.45, 'FILM_RATING_COUNT': 900, 'CREATED_AT': '2023-05-15 23:42:38.363709'}

Entry for The Last Showing (f_06BLY) in the FILM_STREAMING_SERVICES table is:
{'FILM_ID': 'f_06BLY', 'STREAMING_SERVICE_ABBR': 'rent', 'STREAMING_SERVICE_FULL': 'Rental', 'CREATED_AT': '2023-05-25 21:41:31.421478', 'PRICE': 3.49, 'VALID': 1}

Entry for The Last Showing (f_06BLY) in the FILM_GENRE table is:
{'FILM_ID': 'f_06BLY', 'FILM_GENRE': 'thriller', 'ALL_FILM_GENRES': 'thriller', 'CREATED_AT': '2023-05-25 20:08:29.280417'}

Entry for The Last Showing (f_06BLY) in the INGESTED tab

In [6]:
# Print the two films' entries next to each other, table by table.
compare_film_db_entries(['f_01U58', 'f_0lJBs'])


Entry for Salem's Lot (f_01U58) in the FILM_YEAR table is:
{'FILM_ID': 'f_01U58', 'FILM_YEAR': 1979, 'FILM_DECADE': '1970s', 'CREATED_AT': '2023-05-25 19:14:32.607036'}

Entry for Tube Tales (f_0lJBs) in the FILM_YEAR table is:
{'FILM_ID': 'f_0lJBs', 'FILM_YEAR': 1975, 'FILM_DECADE': '1970s', 'CREATED_AT': '2023-05-25 18:38:30.581678'}

Entry for Salem's Lot (f_01U58) in the FILM_LETTERBOXD_STATS table is:
{'FILM_ID': 'f_01U58', 'FILM_WATCH_COUNT': 32150, 'FILM_FAN_COUNT': 51, 'FILM_LIKES_COUNT': 6924, 'FILM_REVIEW_COUNT': 3825, 'FILM_LIST_COUNT': 15246, 'FILM_TOP_250': None, 'FILM_RATING': 3.3, 'FILM_RATING_COUNT': 19872, 'CREATED_AT': '2023-05-15 17:27:05.078580'}

Entry for Tube Tales (f_0lJBs) in the FILM_LETTERBOXD_STATS table is:
{'FILM_ID': 'f_0lJBs', 'FILM_WATCH_COUNT': 550, 'FILM_FAN_COUNT': 0, 'FILM_LIKES_COUNT': 147, 'FILM_REVIEW_COUNT': 99, 'FILM_LIST_COUNT': 189, 'FILM_TOP_250': None, 'FILM_RATING': 3.34, 'FILM_RATING_COUNT': 271, 'CREATED_AT': '2023-05-16 00:25:33.865053