In [26]:
# --- Setup cell: imports, Google Drive mount and the site base URL ---
# HTML fetching / parsing and progress bars used by the scraping functions.
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
# Colab-only: mount Google Drive at the relative path 'drive' so the CSV files
# under 'drive/My Drive/...' can be read and written.
from google.colab import drive
drive.mount('drive', force_remount=True)
from pathlib import Path
# lxml is the parser backend requested via BeautifulSoup(..., 'lxml').
import lxml
# Base URL that is prefixed to the relative film links found on list pages.
_domain = 'https://letterboxd.com'
import math

Mounted at drive


In [27]:
def transform_stars(starstring):
    """
    Transforms star rating into a numeric value.

    Parameters
    ----------
    starstring : str
        The rating glyphs exactly as scraped, e.g. "★★★½" (already stripped
        of surrounding whitespace).

    Returns
    -------
    int or float
        The rating between 0.5 and 5, or ``np.nan`` when ``starstring`` is not
        one of the ten known glyph combinations (including non-string or
        unhashable input).
    """
    stars = {
        "★": 1,
        "★★": 2,
        "★★★": 3,
        "★★★★": 4,
        "★★★★★": 5,
        "½": 0.5,
        "★½": 1.5,
        "★★½": 2.5,
        "★★★½": 3.5,
        "★★★★½": 4.5
    }
    try:
        return stars[starstring]
    except (KeyError, TypeError):
        # KeyError: unknown rating text; TypeError: unhashable input such as a list.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return np.nan

In [28]:
def get_page_num(user):
  """
  Return how many pages the rated-films list of a Letterboxd user has.

  Fetches ``https://letterboxd.com/<user>/films/ratings/`` and reads the number
  shown in the last pagination entry. When the page has no pagination entries
  (a single-page list, or an error page) the result is 1.

  Parameters
  ----------
  user : str
      Letterboxd username as it appears in the profile URL.

  Returns
  -------
  int
      Number of list pages (at least 1).

  Side effects: performs one HTTP GET and prints the page count (and
  "not found." on an HTTP 404).
  """
  link = 'https://letterboxd.com/'+user+'/films/ratings/'

  response = requests.get(link)
  if response.status_code == 404:
    # Best effort: keep going; the error page has no pagination, so 1 is returned.
    print("not found.")

  soup = BeautifulSoup(response.content, 'lxml')

  # attrs must be a dict; the previous set literal {"class", "paginate-page"}
  # only matched because bs4 treats a non-dict attrs value as a class filter.
  page_items = soup.find_all("li", attrs={"class": "paginate-page"})

  try:
    # The last pagination entry carries the highest page number; large numbers
    # may contain thousands separators.
    num_page = int(page_items[-1].find("a").text.replace(',', ''))
  except IndexError:
    # No pagination entries at all -> everything fits on one page.
    num_page = 1

  print('Number of pages for user ' + "'" +user +"'" + ' is ' +str(num_page))

  return num_page

In [29]:
def _meta_content(soup, attrs):
  """Return the ``content`` attribute of the first <meta> tag matching ``attrs``, or None if there is none."""
  tag = soup.find('meta', attrs=attrs)
  if tag is None:
    return None
  return tag.attrs.get('content')


def get_data(username, page_diff, max_pages=800):
  '''
  Scrape the first ``page_diff`` pages of a user's rated-films list.

  For every film on those pages the film's own page is fetched as well, to
  read release year, director, top-billed cast and the site-wide average rating.

  Parameters
  ----------
  username : str
      Letterboxd username as it appears in the profile URL.
  page_diff : int
      Number of list pages to scrape, starting at page 1. Values <= 0 scrape
      nothing and yield two empty DataFrames.
  max_pages : int or None, default 800
      Upper bound on the number of pages scraped (users with very many entries
      caused problems -- still a TO DO). ``None`` disables the cap.

  Returns
  -------
  (movie_DF, user_DF) : tuple of pandas.DataFrame
      movie_DF columns: Title, Average_Rating, Release_Year, Director, Cast
      user_DF columns:  Title, User_Rating, lb_username
      Fields that cannot be read from a page are stored as ``np.nan``.

  Side effects: one HTTP GET per list page plus one per film; prints progress.
  '''
  film_names = []
  ratings = []
  release_years = []
  directors = []
  casts = []
  average_ratings = []
  lb_users = []

  num_pages = page_diff
  if max_pages is not None and num_pages > max_pages:
    num_pages = max_pages

  for page in range(1, num_pages + 1):
    link = 'https://letterboxd.com/'+username+'/films/ratings/page/'+str(page)

    response = requests.get(link)
    if response.status_code == 200:
      print('Scraping page '+str(page)+' of '+str(num_pages))
    elif response.status_code == 404:
      print("not found.")

    soup = BeautifulSoup(response.content, 'lxml')
    table = soup.find('ul', class_='poster-list')
    if table is None:
      # No film grid on this page (e.g. error page): skip it instead of
      # failing with AttributeError on None.
      continue
    films = table.find_all('li')

    for film in tqdm(films):
      # The poster <img> alt text holds the film name.
      poster = film.find('div')
      film_name = poster.find('img')['alt']

      # The user's own rating is optional; unrated entries become NaN.
      rating_tag = film.find('span', class_='rating')
      if rating_tag is None:
        rating = np.nan
      else:
        rating = transform_stars(rating_tag.get_text().strip())

      # Each film "card" links to the film's own page, where the remaining
      # details are read from.
      film_card = poster.get('data-target-link')
      film_page = _domain + film_card
      filmget = requests.get(film_page)
      film_soup = BeautifulSoup(filmget.content, 'lxml')

      # The year is taken from the four characters before the closing
      # parenthesis at the end of the og:title content.
      og_title = _meta_content(film_soup, {'property': 'og:title'})
      release_year = og_title[-5:-1] if og_title is not None else np.nan

      director = _meta_content(film_soup, {'name': 'twitter:data1'})
      if director is None:
        director = np.nan

      # Cast: link texts of the cast tab, without the 'Show All...' toggle,
      # limited to the first five names.
      try:
        cast_links = film_soup.find('div', attrs={'id': 'tab-cast'}).find_all('a')
        cast = [anchor.contents[0] for anchor in cast_links]
        cast = [name for name in cast if name != 'Show All...'][:5]
      except (AttributeError, IndexError):
        # No cast tab, or a link without any content.
        cast = np.nan

      # Average rating: the leading characters of twitter:data2 parsed as float.
      try:
        average_rating = float(_meta_content(film_soup, {'name': 'twitter:data2'})[:4])
      except (TypeError, ValueError):
        # Tag missing (None is not sliceable) or content not numeric.
        average_rating = np.nan

      # TODO: genres/themes are not scraped yet. The genre tab also lists
      # "themes", which bloat the feature set when treated as genres; separating
      # the two at scrape time is still open.

      # Release the parsed tree of the film page (the call was previously
      # missing its parentheses and therefore did nothing).
      film_soup.decompose()

      film_names.append(film_name)
      release_years.append(release_year)
      directors.append(director)
      casts.append(cast)
      average_ratings.append(average_rating)
      lb_users.append(username)
      ratings.append(rating)

  # Build the frames once, after all pages are processed, so they also exist
  # (empty) when no page was scraped.
  movie_DF = pd.DataFrame({'Title': film_names, 'Average_Rating': average_ratings, 'Release_Year': release_years, 'Director': directors, 'Cast': casts})
  user_DF = pd.DataFrame({'Title': film_names, 'User_Rating': ratings, 'lb_username': lb_users})

  return movie_DF, user_DF

In [30]:
# Locations of the persisted movie table and user-rating table on the mounted
# drive, both as plain strings and as Path objects.
path_to_movie_db, path_to_user_db = (
    'drive/My Drive/Colab Notebooks/data/movie_db.csv',
    'drive/My Drive/Colab Notebooks/data/user_ratings.csv',
)
path1, path2 = (Path(location) for location in (path_to_movie_db, path_to_user_db))

In [34]:
# Letterboxd usernames to process; the update cell iterates over this list and
# compares each name with the 'lb_username' column of the stored ratings file.
users = ['behaind']

In [35]:
# Incremental update: for every requested user that already has rows in the
# stored ratings file, scrape as many list pages as were added since the last
# pull and append the result to both CSV files.

# Number of films per list page used to convert the stored row count into a
# page count -- value taken over from the original code; TODO confirm against the site.
FILMS_PER_PAGE = 18

if Path(path_to_user_db).is_file():
  old_user_df = pd.read_csv(path_to_user_db)
  old_users = set(old_user_df['lb_username'])  # set: duplicates collapse, fast membership test
  for user in users:
    if user not in old_users:
      # Users without previously stored ratings are not handled by this cell.
      continue

    num_movies = int((old_user_df['lb_username'] == user).sum())  # rows stored by the last pull
    old_num_pages = math.ceil(num_movies / FILMS_PER_PAGE)         # pages covered by the last pull
    new_num_pages = get_page_num(user)
    page_diff = new_num_pages - old_num_pages

    if page_diff > 0:
      # NOTE(review): get_data(user, n) scrapes pages 1..n, so this assumes newly
      # rated films appear on the first pages of the list -- confirm the sort order.
      temp_movie_df, temp_user_df = get_data(user, page_diff)  # main function
      # Append without a header row; presumably both files already contain one.
      temp_movie_df.to_csv(path_to_movie_db, index=False, header=False, mode='a')
      print('movie file saved')
      temp_user_df.to_csv(path_to_user_db, index=False, header=False, mode='a')
      print('user file saved')
    else:
      # Fewer than one full page of new entries is deliberately ignored.
      print('no new data')

      #old_user_df.drop(old_user_df[old_user_df['lb_username']==user].index, inplace=True) #if the user is already in the old user dataset drop them

Number of pages for user 'behaind' is 142
no new data
