In [5]:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import concurrent.futures
import time
import re
from collections import OrderedDict
import pandas as pd
import numpy as np
from datetime import datetime
import sys
import urllib.parse
from IPython.display import HTML


# VARIABLES
all_genres = ['action','adventure','animation','comedy','crime','documentary','drama','family','fantasy','history','horror','music','mystery','romance','science fiction','thriller','tv movie','war','western']
top_genres = {
    'action': 0,
    'adventure': 0,
    'animation': 0,
    'comedy': 0,
    'crime': 0,
    'documentary': 0,
    'drama': 0,
    'family': 0,
    'fantasy': 0,
    'history': 0,
    'horror': 0,
    'music': 0,
    'mystery': 0,
    'romance': 0,
    'science fiction': 0,
    'thriller': 0,
    'tv movie': 0,
    'war': 0,
    'western': 0
}
list_urls = []
titles = []
ratings_list = []
release_years = []
data = {}

# GETTING THE URLS
## First Page
base_url = 'https://letterboxd.com'
user_name = input('Enter user name: ')
r = requests.get(f'{base_url}/{user_name}/films/diary/')
soup = BeautifulSoup(r.content, "html.parser")
list_urls = [base_url + a.get('href') for a in soup.select('h3>a[href]')]
if list_urls == []:
    print('No films found')
    sys.exit(1)
## Subsequent Pages
links = soup.find_all('li', class_="paginate-page")
try:
    start_character = str(links[-1]).find('/page/') + 6
    end_character = str(links[-1]).find('/">')
    end_page = int(str(links[-1])[start_character:end_character])
except IndexError:
    end_page = 1
if end_page >1:
    for page in range(2, end_page + 1):
        r = requests.get(f'{base_url}/{user_name}/films/diary/page/{page}')
        soup = BeautifulSoup(r.content, "html.parser")
        new_urls = [base_url + a.get('href') for a in soup.select('h3>a[href]')]
        list_urls.extend(new_urls)

# GETTING THE TITLES, RELEASE YEAR, RATINGS, AND GENRES + DATA FORMATTING
for url in list_urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    ## Getting the Titles and Release Year
    span = soup.find_all('span', class_="film-title-wrapper")
    span_soup = BeautifulSoup('\n'.join(map(str, span)))
    a_list = list(span_soup.find_all('a'))
    count = 0
    for a in a_list:
        a_str =str(a)
        start_chr = a_str.find('">')+2
        end_chr = a_str.find("</")
        count += 1
        if count == 1:
            titles.append(a_str[start_chr:end_chr])
        if count == 2:
            release_years.append(a_str[start_chr:end_chr])
        if count == 3:
            print('uh oh, Error at titles/year')
            break
    ## Getting the Ratings
    script_str = str(soup.find_all('script', type="application/ld+json"))
    start_chr = script_str.find('"ratingValue":')+14
    end_chr = script_str.find(',"worstRating"')
    if end_chr == -1:
        ratings_list.append('null')
        continue
    ratings_list.append(script_str[start_chr:end_chr])
    ## Getting the Genres
    all_script_tags = soup.find_all("script")
    script_list = []
    movie_genres = []
    for i in all_script_tags:
        script_list.append(str(i))
    raw_script_tag = script_list[-1]
    for i in all_genres:
        if i in raw_script_tag:
            movie_genres.append(i)
            top_genres[i] += 1
    ## Formatting the Data Dictionary
    if titles[-1] not in data:
        data[titles[-1]] = [release_years[-1],ratings_list[-1], movie_genres]

# SORTING GENRE VALUES
sorted_top_genres = sorted(top_genres.items(), key=lambda item: item[1], reverse=True)

# =====================================================================================================

top_3 = [genre for genre, count in sorted_top_genres[:3]]

soup = BeautifulSoup('', 'html.parser')

# use webdriver to open letterboxd based on rating and genre
driver = webdriver.Chrome()

# get soup of all genres combined
driver.get(f"https://letterboxd.com/films/genre/{'+'.join(top_3)}/")
time.sleep(1)
html_doc = driver.page_source
soup1 = BeautifulSoup(html_doc, 'html.parser')
soup.extend(soup1.contents)
time.sleep(0.5)
# get soup of each genre separately
for i in [0,1,2]:
    for j in ['NA', 'page/2/']: # page2 for more data
        soup2 = j
        if j == 'NA':
            driver.get(f"https://letterboxd.com/films/genre/{top_3[i]}/")
        else:
            driver.get(f"https://letterboxd.com/films/genre/{top_3[i]}/{j}")
        time.sleep(1)
        html_doc = driver.page_source
        soup2 = BeautifulSoup(html_doc, 'html.parser')
        soup.extend(soup2.contents)
        time.sleep(0.5)
# get soup of old movies
for i in [0,1,2]:
    for j in ['9','8','7']: # old movies
        soup2 = j
        driver.get(f"https://letterboxd.com/films/popular/decade/19{j}0s/genre/{top_3[i]}/")
        time.sleep(1)
        html_doc = driver.page_source
        soup2 = BeautifulSoup(html_doc, 'html.parser')
        soup.extend(soup2.contents)
        time.sleep(0.5)

driver.quit()

# parse out the tags that have the film info
initial_poster_lists = soup.find_all('li', class_='listitem poster-container')

# removes duplicate entries
poster_lists = list(OrderedDict.fromkeys(initial_poster_lists))

# get the url
url_list = []
for poster in poster_lists:
    link_tag = poster.find('a')
    link_url = link_tag['href']
    url_list.append(f'https://letterboxd.com{link_url}')

# get the title, date, and rating string
title_date_ratingS = []
for poster in poster_lists:
    info_tag = poster.find('a')
    title_date_rating = info_tag['data-original-title']
    title_date_ratingS.append(title_date_rating)

# get the title and rating, then put in a dictionary
title_rating_dict = {}
pattern = r"([^\(]+) \((\d{4})\) ([\d.]+)"
for info in title_date_ratingS:
    match = re.search(pattern, info)
    title = match.group(1)
    release = match.group(2)
    rating = match.group(3)
    if title in title_rating_dict.keys():
        title += '1'
        title_rating_dict[title] = [release, rating]
    else:
        title_rating_dict[title] = [release, rating]

# function to get genre and theme
def scrape_url(url):
    response = requests.get(url).content
    soup = BeautifulSoup(response, 'html.parser')
    # Extract the desired data from the soup object
    genre_theme_tags = soup.find_all('a', {'href': re.compile(r'/films/genre/(.+)')}) # if want themes: (genre|theme)
    listt = []
    for tag in genre_theme_tags:
        genre_theme = tag.text
        listt.append(genre_theme)
    return listt

# Use concurrent.futures to make all the requests in parallel
with concurrent.futures.ThreadPoolExecutor() as executor:
    genre_theme_lists = list(executor.map(scrape_url, url_list))

for i in range(len(title_rating_dict)):
    title_rating_dict[list(title_rating_dict.keys())[i]] += [genre_theme_lists[i]]

# convert title_rating_dict to dataframe
search_results_df = pd.DataFrame(title_rating_dict).T
search_results_df = search_results_df.rename(columns={0: 'Release Year', 1: 'Rating', 2: 'Genres'})
search_results_df['movie_url'] = url_list
sorted_search_df = search_results_df.sort_values(by=['Rating'], ascending=False)
sorted_search_df.reset_index(inplace=True)
sorted_search_df = sorted_search_df.rename(columns={'index':'movie title'})

sorted_search_df['Rating'] = sorted_search_df['Rating'].astype(float)
sorted_search_df['Release Year'] = sorted_search_df['Release Year'].astype(int)

#  ====================================================

# remove watched
search_results_df1=sorted_search_df.copy()
diary=list(data.keys())
for entry in diary:
    if entry in list(search_results_df1["movie title"]):
        search_results_df1=search_results_df1.drop(search_results_df1[search_results_df1["movie title"]==entry].index)

def get_rotten_tomatoes_url(movie_name, year):
    year=str(year)
    query = urllib.parse.quote(movie_name)
    url = f"https://www.rottentomatoes.com/search?search={query}"
    response = requests.get(url)
    movie=response.text
    soup = BeautifulSoup(movie, 'html.parser')
    URLS=soup.find_all('a',class_='unset')
    a=[]
    links=[]
    dates=[]
    titles=[]
    x=0
    y=0
    parse_title=["/n"," "]
    for tag in URLS:
        if "thumbnail" in str(tag):
            a.append(tag)
    for tag in a:
        links.append(str(tag)[str(tag).find("https"):(str(tag).find(" slot")-1)])
    datetag=soup.find_all('search-page-media-row')
    for tag in datetag:
        dates.append(str(tag)[str(tag).find("releaseyear")+13:str(tag).find("releaseyear")+17])
    while x!=len(datetag):
        lee=datetag[x].text.replace("\n","")
        lee=lee.replace("  ","")
        titles.append(lee)
        x+=1
    while y!=len(datetag):
        if dates[y]==year and titles[y]==movie_name:
            return links[y]
        y+=1
    return "No Reviews on Rotten Tomatoes"
def get_movie_reviews(movie_title,year):
    url=f""
    if "getrottentomatoesurl" in movie_title:
        movie_title=movie_title.replace("getrottentomatoesurl","")
        url=get_rotten_tomatoes_url(movie_title, year)
        url=url+"/reviews"
        if url:
            url=url
        else:
            return "No reviews found on Rotten Tomatoes"
    else:
        unnecessary_punctuation_second = r"$@[].,'#()-\"!?’_;:/…–"
        y=0
        clean_movie=movie_title
        while y!=len(unnecessary_punctuation_second):
            clean_movie=clean_movie.replace(unnecessary_punctuation_second[y],"")
            y+=1
        movie_string=clean_movie.split()
        x=0
        Movie=""
        while x!=len(movie_string):
            Movie+=movie_string[x]
            if x==len(movie_string)-1:
                ""
            else:
                Movie+="_"
            x+=1
        url = f"https://www.rottentomatoes.com/m/{Movie}/reviews"
    z=0
    response = requests.get(url)
    reviews=response.text
    soup = BeautifulSoup(reviews, 'html.parser')
    presented_review=soup.find_all('p',class_='review-text')[z].text
    review_critic=soup.find_all('a',class_='display-name')[z].text
    score=soup.find_all('score-icon-critics')
    tag=0
    number=len(score)
    score1=score.copy()
    while tag!=number:
        if "4" not in str(score1[tag]):
            score.remove(score1[tag])
        else:
            break
        tag+=1
    score
    score2=score[z]
    senti=str(score2)
    parse_critics = ["\n","  "]
    punct=0
    space=0
    while "NEGATIVE" in senti:
        z+=1
        score2=score[z]
        senti=str(score2)
        presented_review=soup.find_all('p',class_='review-text')[z].text
        review_critic=soup.find_all('a',class_='display-name')[z].text
        
    while punct!=len(parse_critics):
            review_critic=review_critic.replace(parse_critics[punct],"")
            punct+=1
    while space!=len(parse_critics):
            presented_review=presented_review.replace(parse_critics[space]," ")
            space+=1
    return presented_review+" ("+review_critic+", Rotten Tomatoes)"
    


def movie_reception(function, movie_title, year): #MAIN FUNCTION
    
    #--------------------------------------------
    year=str(year)
    try:
        movie_title1=movie_title 
        unnecessary_punctuation_first = r"$@[].,'#()-\"!?’_;:/…–"
        y=0
        while y!=len(unnecessary_punctuation_first):
            movie_title1=movie_title1.replace(unnecessary_punctuation_first[y]," ")
            y+=1   
        movie_title1=movie_title1.replace("&",'and')
        movie_copy=movie_title1
        movie_title1=movie_title1+" "+year
        try:
            return function(movie_title1,year)
        except Exception:      
            return function(movie_copy,year)
    except Exception:
        movie_title2=movie_title
        movie_title2=movie_title2.replace("&",'and')
        movie_copy=movie_title2
        movie_title2=movie_title2+" "+year
        try:
            return function(movie_title2,year)
        except Exception:
            try:
                return function(movie_copy,year)
            except Exception:
                try:
                    movie_title="getrottentomatoesurl"+movie_title
                    return function(movie_title,year)

                except Exception:
                    if function==get_movie_reviews:
                        return "No reviews found on Rotten Tomatoes"
                    else:
                        return "not a valid function"

# convert diary dictionary to dataframe
diary_df = pd.DataFrame(data).T
diary_df = diary_df.rename(columns={0: 'Release Year', 1: 'Rating', 2: 'Genres'})
diary_df['Release Year'] = diary_df['Release Year'].astype(int)
current_year = datetime.now().year
edited_year = pd.cut(diary_df['Release Year'], 
              bins=[1899, current_year - 21, current_year - 6, current_year],
              labels=['old', 'notsorecent', 'recent'])
diary_df['Release Year'] = edited_year

# Function to recommend 15 random movies
def recommend_movies(df):
    # Filter the DataFrame by Year
    recent_movies = df[df['Release Year'] >= current_year - 5]
    notsorecent_movies = df[(df['Release Year'] < current_year - 5) & (df['Release Year'] >= current_year - 20)]
    old_movies = df[df['Release Year'] < current_year - 20]
    
    # Filter the DataFrame to only include movies with a rating of 3.5 or higher
    high_rated_movies1 = recent_movies[recent_movies['Rating'] >= 3.5]
    high_rated_movies2 = notsorecent_movies[notsorecent_movies['Rating'] >= 3.5]
    high_rated_movies3 = old_movies[old_movies['Rating'] >= 3.5]
    
    grouped = diary_df.groupby('Release Year')

    # calculate percentage of movies from the 3 'periods'
    recent_num = round((int(len(grouped.get_group('recent'))) / len(diary_df)) * 15)
    notsorecent_num = round((int(len(grouped.get_group('notsorecent'))) / len(diary_df)) * 15)
    old_num = round((int(len(grouped.get_group('old'))) / len(diary_df)) * 15)

    # Choose 10 random movies from the filtered DataFrame
    recommended_movies = pd.concat([high_rated_movies1.sample(n=recent_num), high_rated_movies2.sample(n=notsorecent_num), high_rated_movies3.sample(n=old_num)], ignore_index=True)
    
    return recommended_movies

# Call the recommendation function
recommended_movies = recommend_movies(search_results_df1)
recommended_movies["movie review"]=recommended_movies.apply(lambda row: movie_reception(get_movie_reviews,row['movie title'],row['Release Year']),axis=1)
recommended_movies["rotten tomatoes link"] = recommended_movies.apply(
    lambda row: f'<a href="{get_rotten_tomatoes_url(row["movie title"], row["Release Year"])}" target="_blank">Rotten Tomatoes Link</a>',
    axis=1
)
# add people_watched and languages
def people_watched(url):
    r = f"{url}members/"
    r = requests.get(r)
    soup = BeautifulSoup(r.content, "html.parser")
    soup_str = str(soup.find('a', class_="tooltip"))
    start_chr = soup_str.find('title="') + 7
    end_chr = soup_str.find('people">')-1
    people_watched = soup_str[start_chr:end_chr]
    return people_watched

recommended_movies['people_watched'] = recommended_movies['movie_url'].apply(lambda x:people_watched(x))

def find_language(url1):
    languages=[]
    url1=url1+"/details/"
    response = requests.get(url1)
    movie_page=response.text
    soup = BeautifulSoup(movie_page, 'html.parser')
    TAGS=soup.find_all('a')
    for tag in TAGS:
        if "language" in str(tag):
            tag=tag.text
            languages.append(tag)
    return set(languages)

recommended_movies['Language'] = recommended_movies['movie_url'].apply(find_language)
recommended_movies = recommended_movies.drop('movie_url', axis=1)
html = recommended_movies.to_html(escape=False)
display(HTML(html))

Enter user name:  omamori


  grouped = diary_df.groupby('Release Year')


Unnamed: 0,movie title,Release Year,Rating,Genres,movie review,rotten tomatoes link,people_watched,Language
0,Avengers: Endgame,2019,3.88,"[Science Fiction, Action, Adventure]","The series may never reach this fever pitch again, but we’re glad we got to watch the plan come together in such a memorable way. (Cory Woodroof, Rotten Tomatoes)",Rotten Tomatoes Link,3128680,"{English, Japanese, Xhosa}"
1,The Banshees of Inisherin,2022,4.11,"[Comedy, Drama]","The comedy is a veil for deep, complex themes that give “The Banshees of Inisherin” humour and pathos. Among these are measured explorations on toxic masculinity, loneliness, and purpose. (Calum Cooper, Rotten Tomatoes)",Rotten Tomatoes Link,1072783,{English}
2,Gen V - Prime Premiere,2023,3.87,"[Science Fiction, Drama, Comedy, Adventure, Action]",No reviews found on Rotten Tomatoes,Rotten Tomatoes Link,32515,{English}
3,Society of the Snow,2023,4.13,"[History, Drama]","Bayona uses his many talents as a director to lead his team into a frightening and hopeful place in this survival tale of man against nature. (Nelson Acosta, Rotten Tomatoes)",Rotten Tomatoes Link,840604,{Spanish}
4,Asteroid City,2023,3.52,"[Comedy, Drama]","Anderson’s latest is near impossible to watch without a grin and is the outcome of utterly enchanting filmmaking from start to finish. (Yasmine Kandil, Rotten Tomatoes)",Rotten Tomatoes Link,908429,{English}
5,Django Unchained,2012,4.28,"[Drama, Western]","By tackling a massive western, the writer and director continues to prove that he has never been short on ambition and gives us his grandest spectacle yet. (Don Shanahan, Rotten Tomatoes)",Rotten Tomatoes Link,2644344,"{English, German, French}"
6,Avatar,2009,3.59,"[Fantasy, Science Fiction, Adventure, Action]","“Avatar” remains a transporting experience – an entertaining blend of old-fashioned adventure and technological wonder. (Keith Garlington, Rotten Tomatoes)",Rotten Tomatoes Link,2606490,"{English, Spanish}"
7,The Dark Knight,2008,4.47,"[Action, Drama, Thriller, Crime]","With a memorable performance from Heath Ledger, masterful direction from Nolan, and bold storytelling, this film stands out as one of the best comic book adaptations in film history. [Full Review in Spanish] (Juan Pablo Russo, Rotten Tomatoes)",Rotten Tomatoes Link,3590980,"{English, Chinese}"
8,Your Name.,2016,4.22,"[Romance, Animation, Drama]","As much as it is very well done, I don’t think it’s worth putting it up on such a high pedestal on par with the works of Hayao Miyazaki. Your Name is good, but for me, it's not worthy of breaking Spirited Away’s record good. (Mat Brunet, Rotten Tomatoes)",Rotten Tomatoes Link,1135541,{Japanese}
9,The Nice Guys,2016,3.83,"[Action, Comedy, Crime]","Black does some of his best visual work ever in “The Nice Guys.” It may be his masterpiece. He creates a pitch perfect period piece set in 1977 Los Angeles. The composition of every scene offers a reward to eagle-eyed viewers. (Sarah Vincent, Rotten Tomatoes)",Rotten Tomatoes Link,1098785,{English}
