In [None]:
# HTML parsing, HTTP requests, progress bars and data handling.
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
# Mount Google Drive at ./drive; the CSV paths used later in this notebook
# ('drive/My Drive/...') are relative to this mount point.
from google.colab import drive
drive.mount('drive', force_remount=True)
from pathlib import Path
# Not referenced by name in the visible cells; BeautifulSoup is asked for the
# 'lxml' parser by name, so this import presumably just verifies it is installed.
import lxml
# Site root. Note the trailing slash when concatenating hrefs that start with '/'.
_domain = 'https://letterboxd.com/'

Mounted at drive


In [None]:
def transform_stars(starstring):
    """
    Convert a star-rating string (e.g. "★★★½") into its numeric value.

    Parameters
    ----------
    starstring : str
        Rating written with "★" for a full star and "½" for a half star.

    Returns
    -------
    int or float
        The rating (0.5 to 5), or ``np.nan`` when ``starstring`` is not one of
        the ten known rating strings (this includes ``None`` and unhashable
        values such as lists).
    """
    stars = {
        "★": 1,
        "★★": 2,
        "★★★": 3,
        "★★★★": 4,
        "★★★★★": 5,
        "½": 0.5,
        "★½": 1.5,
        "★★½": 2.5,
        "★★★½": 3.5,
        "★★★★½": 4.5
    }
    try:
        return stars.get(starstring, np.nan)
    except TypeError:
        # Unhashable input cannot be a dict key; treat it as an unknown rating.
        return np.nan

In [None]:
def get_user_list(num_users_page):
  """
  Collect usernames from the all-time popular members listing.

  Starts at the first listing page and follows the "next" link until
  ``num_users_page`` pages have been read, there is no further page, or a
  page cannot be fetched.

  Parameters
  ----------
  num_users_page : int
      Maximum number of listing pages to scrape.

  Returns
  -------
  list of str
      Unique usernames, in the order they were first encountered.
  """
  url = 'https://letterboxd.com/members/popular/this/all-time/'
  # A dict is used as an ordered set: the listing repeats some members, and
  # dict keys drop the repeats while keeping first-seen order.
  users = {}
  for i in range(num_users_page):
    # One request per page; the same response is used for the status check and the parse.
    response = requests.get(url)
    if response.status_code == 200:
      print('Scraping page '+str(i+1)+' of '+str(num_users_page))
    elif response.status_code == 404:
      print("not found.")
      break
    else:
      print('Unexpected status code ' + str(response.status_code) + ' for ' + url)
      break

    user_soup = BeautifulSoup(response.content, 'lxml')

    # The profile link's href looks like "/<username>/"; strip the slashes.
    for a in user_soup.find_all('a', class_='name', href=True):
      users[a['href'].replace("/", "")] = None

    # Read the next-page link before the tree is destroyed.
    next_button = user_soup.find('a', class_='next')
    next_href = next_button.get('href') if next_button is not None else None
    user_soup.decompose()

    if next_href is None:
      # Last page reached; nothing more to fetch.
      break
    # Join without producing a double slash between domain and path.
    url = _domain.rstrip('/') + '/' + next_href.lstrip('/')

  return list(users)

In [None]:
def get_page_num(user):
  """
  Determine how many pages of film ratings a user has.

  Fetches the first page of the user's ratings and reads the label of the
  last pagination entry.

  Parameters
  ----------
  user : str
      Username as it appears in the profile URL.

  Returns
  -------
  int
      Number of rating pages; 1 when the page has no usable pagination
      entries.
  """
  link = 'https://letterboxd.com/'+user+'/films/ratings/'

  response = requests.get(link)
  if response.status_code == 404:
    print("not found.")

  soup = BeautifulSoup(response.content, 'lxml')

  # Default when there is no pagination or its last entry cannot be read.
  num_page = 1
  page_links = soup.find_all("li", attrs={"class": "paginate-page"})
  if page_links:
    last_anchor = page_links[-1].find("a")
    if last_anchor is not None:
      try:
        # Labels may contain thousands separators, e.g. "1,024".
        num_page = int(last_anchor.text.replace(',', ''))
      except ValueError:
        # Non-numeric label: keep the single-page default.
        num_page = 1

  print('Number of pages for user ' + "'" +user +"'" + ' is ' +str(num_page))

  return num_page

In [None]:
def _meta_content(soup, attrs):
  """Return the ``content`` of the first <meta> tag matching ``attrs``, or None if there is none."""
  tag = soup.find('meta', attrs=attrs)
  return tag.get('content') if tag is not None else None


def _tab_link_texts(soup, tab_id):
  """Return the leading text of each link inside <div id=tab_id> (minus 'Show All...'), or None if the div is absent."""
  tab = soup.find('div', attrs={'id': tab_id})
  if tab is None:
    return None
  # str() detaches the text from the parse tree so the tree can be freed.
  texts = [str(a.contents[0]) for a in tab.find_all('a') if a.contents]
  return [text for text in texts if text != 'Show All...']


def get_data(username):
  '''
  Scrape every rated film of one user, including details from each film page.

  Walks all of the user's rating pages; for every film on a page it reads the
  title and the user's star rating from the grid, then downloads the film's
  own page for release year, director, cast, average rating, genres and themes.
  Values that cannot be found are stored as ``np.nan``.

  Parameters
  ----------
  username : str
      Username as it appears in the profile URL.

  Returns
  -------
  (movie_DF, user_DF) : tuple of pandas.DataFrame
      movie_DF has columns Title, Average_Rating, Release_Year, Director,
      Cast, Genres, Themes. user_DF has columns Title, User_Rating,
      lb_username. Both have one row per scraped film.
  '''
  film_names = []
  ratings = []
  release_years = []
  directors = []
  casts = []
  genres = []
  themes = []
  average_ratings = []
  lb_users = []

  num_pages = get_page_num(username)

  for i in range(num_pages):
    link = 'https://letterboxd.com/'+username+'/films/ratings/page/'+str(i+1)
    print(link)

    response = requests.get(link)
    if response.status_code == 200:
      print('Scraping page '+str(i+1)+' of '+str(num_pages))
    elif response.status_code == 404:
      print("not found.")
    soup = BeautifulSoup(response.content, 'lxml')
    table = soup.find('ul', class_='poster-list')
    if table is None:
      # No film grid on this page (e.g. error page or no ratings): nothing to scrape.
      continue
    films = table.find_all('li')

    for film in tqdm(films):
      # Each grid entry carries the poster image (title in its alt text) and
      # the relative link to the film's own page.
      card = film.find('div')
      poster = card.find('img') if card is not None else None
      film_card = card.get('data-target-link') if card is not None else None
      if poster is None or film_card is None:
        # Not a regular film entry; there is no film page to look up.
        continue
      film_name = poster['alt']

      # The user's star rating, converted to a number (nan if the entry shows none).
      rating_tag = film.find('span', class_='rating')
      if rating_tag is not None:
        rating = transform_stars(rating_tag.get_text().strip())
      else:
        rating = np.nan

      # Pull the remaining data from the individual film page.
      film_page = _domain.rstrip('/') + '/' + film_card.lstrip('/')
      filmget = requests.get(film_page)
      film_soup = BeautifulSoup(filmget.content, 'lxml')

      # The code assumes og:title ends in "(YYYY)"; the slice picks the four characters before the last one.
      og_title = _meta_content(film_soup, {'property': 'og:title'})
      release_year = og_title[-5:-1] if og_title else np.nan

      director = _meta_content(film_soup, {'name': 'twitter:data1'})
      if director is None:
        director = np.nan

      # Keep only the first five cast entries.
      cast = _tab_link_texts(film_soup, 'tab-cast')
      cast = cast[:5] if cast is not None else np.nan

      try:
        # The first four characters of twitter:data2 are parsed as the average rating.
        average_rating = float(_meta_content(film_soup, {'name': 'twitter:data2'})[:4])
      except (TypeError, ValueError):
        average_rating = np.nan

      # The genres tab lists genres and "themes" together. Heuristic kept from
      # the original: the first three links are genres, the rest are themes.
      # TODO: separate them properly while scraping.
      genre_links = _tab_link_texts(film_soup, 'tab-genres')
      if genre_links is None:
        genre = np.nan
        theme = np.nan
      else:
        genre = genre_links[:3]
        theme = genre_links[3:] if len(genre_links) > 3 else np.nan

      # Free the film page's parse tree before moving on to the next film.
      film_soup.decompose()

      film_names.append(film_name)
      release_years.append(release_year)
      directors.append(director)
      casts.append(cast)
      average_ratings.append(average_rating)
      lb_users.append(username)
      ratings.append(rating)
      genres.append(genre)
      themes.append(theme)

  # Build the frames once, after all pages have been collected.
  movie_DF = pd.DataFrame({'Title': film_names, 'Average_Rating': average_ratings, 'Release_Year': release_years, 'Director': directors, 'Cast': casts, 'Genres': genres, 'Themes': themes})
  user_DF = pd.DataFrame({'Title': film_names, 'User_Rating': ratings, 'lb_username': lb_users})

  return movie_DF, user_DF

In [None]:
# Locations of the two CSV "databases" on the mounted Drive: one row per film
# in the first, one row per (film, user rating) in the second.
path_to_movie_db, path_to_user_db = (
    'drive/My Drive/Colab Notebooks/data/movie_db.csv',
    'drive/My Drive/Colab Notebooks/data/user_ratings.csv',
)
# Path objects for the same files, used for existence checks.
path1, path2 = map(Path, (path_to_movie_db, path_to_user_db))

In [None]:
# Scrape one page of the popular-members listing.
users = get_user_list(1)
# Optional: add hand-picked accounts used as the test set when building the ML model.
# (list.append accepts a single item, so several names need extend with a list.)
#users.extend(['swaiborr','behaind', 'csb_de', 'davidehrlich','awesometacular','ignmovies','chrisstuckmann9','robert_hofmann_'])
#users=list(set(users)) # removes any duplicates introduced by the line above

# To Do 
Check whether a user already exists in the CSV file. If they do, remove them from `users` so that we don't gather data for a user we already have data on.

In [None]:
# Drop every user whose ratings are already stored in the user-ratings CSV so
# they are not scraped twice. This is deliberately crude: it does not check
# whether a user has added new ratings since the data was last pulled.
if path2.is_file():
  old_user_df = pd.read_csv(path_to_user_db)
  old_users = list(old_user_df['lb_username'])
else:
  # First run: the CSV does not exist yet, so nobody has been scraped.
  old_users = []

# The CSV has one row per rating, so usernames repeat; a set gives fast lookups.
_already_scraped = set(old_users)
users = [user for user in users if user not in _already_scraped]

In [None]:
# Display the usernames that the scraping loop will process.
users

In [None]:
def _append_to_csv(frame, csv_path):
  """Append ``frame`` to ``csv_path``; the header row is written only when the file is first created."""
  file_exists = csv_path.is_file()
  frame.to_csv(csv_path, mode='a' if file_exists else 'w', index=False, header=not file_exists)


# Empty frames kept for the optional in-memory accumulation at the end of the
# loop (currently disabled).
movie_df= pd.DataFrame()
user_df= pd.DataFrame()

for user in users:
  print(user)
  temp_movie_df, temp_user_df = get_data(user)

  # Persist after every user so an interrupted run keeps what was already
  # scraped. path1 / path2 are the movie and user-rating CSVs defined above.
  _append_to_csv(temp_movie_df, path1)
  _append_to_csv(temp_user_df, path2)

  #movie_df= pd.concat([movie_df, temp_movie_df],ignore_index=True)
  #user_df= pd.concat([user_df, temp_user_df],ignore_index=True)

    