# Imports

In [9]:
import json
import time
from io import StringIO
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.core.frame import DataFrame

# Extraction

## Classes

In [10]:
class BoxOfficeMojo():
    """Scrape the yearly wide-release tables of Box Office Mojo.

    One page is requested per year, starting at ``since_year`` and stopping
    before ``to_year`` (the upper bound is exclusive).
    """

    url_base = "https://www.boxofficemojo.com"
    url = ""
    frequency = "year"
    since_year = 0
    to_year = 0
    actual_year = 0
    query_order = "?grossesOption=totalGrosses&releaseScale=wide"

    # Child positions inside a table row that are kept, in output order:
    # title, total gross, max. theaters, opening weekend gross,
    # release date, distributor.
    _KEPT_CELLS = (1, 5, 6, 7, 10, 12)

    # Month abbreviation used by the site -> two-digit month number.
    _MONTH_NUMBERS = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
    }

    def __init__(self, since_year, to_year):
        """Store the year range; scraping starts at ``since_year``."""
        self.since_year = since_year
        self.to_year = to_year
        self.actual_year = since_year

    def built_url(self):
        """Rebuild ``self.url`` so it points at the page of ``actual_year``."""
        self.url = (
            self.url_base + "/" + self.frequency + "/"
            + str(self.actual_year) + "/" + self.query_order
        )

    def get_next_url(self):
        """Return the URL of the next year to scrape and advance the cursor.

        Returns:
            The page URL, or ``""`` once ``actual_year`` has reached ``to_year``.
        """
        if self.actual_year < self.to_year:
            self.built_url()
            self.actual_year = self.actual_year + 1
        else:
            self.url = ""

        return self.url

    def get_page_content(self):
        """Download the next yearly page and return its usable movie rows.

        Returns:
            A list of rows ``[title, gross, max_theaters, opening_gross,
            release_date, distributor]``. Rows distributed by Fathom Events
            or without an opening weekend gross are left out. An empty list
            is returned when the year range is exhausted.
        """
        url = self.get_next_url()
        if not url:
            # Nothing left to fetch; requesting an empty URL would only raise.
            return []

        page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        content = BeautifulSoup(page.content)
        # The first <tr> carries the column labels. Slicing (instead of pop)
        # also copes with a page that has no rows at all.
        rows = content.find_all("tr")[1:]

        movies_list = []
        for tr in rows:
            movie = self._parse_row(tr)
            if movie is not None:
                movies_list.append(movie)

        return movies_list

    def _parse_row(self, tr):
        """Turn one table row into a movie record, or None if it is unusable."""
        movie = []
        for index, td in enumerate(tr):
            if index not in self._KEPT_CELLS:
                continue
            text = td.get_text().strip()
            if text == "-":
                # The site shows "-" for missing figures.
                text = "0"
            else:
                # Applied to every kept cell (title and distributor included),
                # so numeric cells become plain digit strings.
                text = text.replace(',', '').replace('$', '')
            movie.append(text)

        if len(movie) != len(self._KEPT_CELLS):
            print(f'Box Mojo Office. Skipping malformed row: {movie}')
            return None

        try:
            # Transform release date to full release date complete
            movie[4] = self.transform_release_date(movie[4])
            opening_weekend_gross = int(movie[3])
        except (IndexError, ValueError):
            # Unparsable date or gross: drop this row, keep the rest of the page.
            print(f'Box Mojo Office. Skipping malformed row: {movie}')
            return None

        # Don't keep movies that have distributor Fathom Events
        # or movies that we don't have their opening weekend gross
        if movie[5] == "Fathom Events" or opening_weekend_gross <= 0:
            return None
        return movie

    def transform_release_date(self, release_date):
        """Convert ``"Mon D"`` into ``"YYYY/MM/D"`` for the page being parsed.

        Args:
            release_date: Text such as ``"Jan 5"``.

        Returns:
            The date string. An unknown month abbreviation is passed through
            unchanged and the day is not zero-padded.

        Raises:
            IndexError: If ``release_date`` has no space-separated day part.
        """
        parts = release_date.split(" ")
        month = self._MONTH_NUMBERS.get(parts[0], parts[0])
        day = parts[1]
        # get_next_url() has already moved actual_year past the page being
        # parsed, hence the "- 1".
        return str(self.actual_year - 1) + "/" + str(month) + "/" + str(day)

    def extract_data(self):
        """Scrape every year in the configured range and return all rows."""
        print('Box Mojo Office. Initializing process')
        movies_list = []
        for _ in range(self.to_year - self.since_year):
            print(f'Box Mojo Office. Extracting data... Year:{self.actual_year}')
            movies_list.extend(self.get_page_content())

        print('Box Mojo Office. Data extracted!')
        return movies_list

In [11]:
class Filmaffinity():
    """Look movies up on Filmaffinity and read score, running time and genres."""

    url = "https://www.filmaffinity.com/en/search.php?stext="
    # Searches issued since the last pause.
    cont: int = 0
    # Throttling: after this many searches, sleep for pause_seconds so the
    # site does not answer with a "too many requests" error.
    max_requests_before_pause: int = 50
    pause_seconds: int = 300

    def get_movie_url(self, movie_title):
        """Return the Filmaffinity page URL that best matches ``movie_title``.

        Args:
            movie_title: Title to search for.

        Returns:
            The URL of the result whose title matches exactly (ignoring case
            and surrounding whitespace), otherwise the first result. When
            the search lands directly on a movie page, that page's
            ``og:url``. ``""`` when the page cannot be interpreted.
        """
        movie_url = ""
        # Encode the title so characters such as "&" or "#" stay inside the
        # stext parameter instead of cutting the query short.
        url = self.url + quote_plus(movie_title)
        page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        content = BeautifulSoup(page.content)
        try:
            # Collect every search result shown on the page.
            movies_div_list = content.find_all("div", {"class": "mc-title"})
            if movies_div_list:
                movies_url_list = [div.find("a") for div in movies_div_list]
                # Look for a result whose title is exactly the requested one.
                wanted = movie_title.strip().lower()
                for movie in movies_url_list:
                    if movie["title"].strip().lower() == wanted:
                        movie_url = movie["href"]
                # No exact match: fall back to the first result.
                if movie_url == "":
                    movie_url = movies_url_list[0]["href"]
            else:
                # With a single result the site opens the movie page directly,
                # so its URL is read from the og:url meta tag.
                movie_url = content.find_all("meta", property="og:url")[0]["content"]
        except (AttributeError, IndexError, KeyError, TypeError):
            # Expected markup is missing (seen when too many requests are made).
            print("Too many request")

        self.cont = self.cont + 1
        # Pause after a batch of requests to stay under the site's limit.
        if self.cont >= self.max_requests_before_pause:
            time.sleep(self.pause_seconds)
            self.cont = 0
        return movie_url

    def extract_movie_score(self, content):
        """Return the average rating found in ``content``, or ``""``."""
        try:
            return content.find("div", {"id": "movie-rat-avg"})["content"]
        except (AttributeError, KeyError, TypeError):
            return ""

    def extract_movie_running_time(self, content):
        """Return the first token of the duration text (minutes), or ``""``."""
        try:
            duration = content.find("dd", {"itemprop": "duration"}).get_text()
        except AttributeError:
            return ""
        return duration.split(" ")[0]

    def extract_movie_genres(self, content):
        """Return the list of genre names found in ``content`` (may be empty).

        Only the text before the first ``|`` on the second line of the
        genres card is used; entries are separated by two spaces and have
        their dots removed.
        """
        try:
            lines = content.find("dd", {"class": "card-genres"}).get_text().split("\n")
            raw_genres = lines[1].split("|")[0].split("  ")
        except (AttributeError, IndexError):
            return []
        return [
            genre.strip().replace(".", "")
            for genre in raw_genres
            if genre != ''
        ]

    def extract_data_film(self, movie_url, movie_title):
        """Download a movie page and return ``[score, running_time, genres]``.

        ``movie_title`` is accepted for interface compatibility and is not used.
        """
        page = requests.get(movie_url, headers={'User-Agent': 'Mozilla/5.0'})
        content = BeautifulSoup(page.content)
        return [
            self.extract_movie_score(content),
            self.extract_movie_running_time(content),
            self.extract_movie_genres(content),
        ]

    def extract(self, movies_list):
        """Append score, running time and genres to every movie row.

        Args:
            movies_list: Rows whose first element is the movie title. Each
                row is extended in place.

        Returns:
            A list with every input row. Rows whose lookup failed are
            included without the extra fields.
        """
        movies_details = []
        error = 0
        for movie in movies_list:
            movie_title = movie[0]
            print(f"Filmaffinity. Trying to extract...{len(movies_details)}/{len(movies_list)}")
            try:
                movie_url = self.get_movie_url(movie_title)
                movie.extend(self.extract_data_film(movie_url, movie_title))
            except Exception:
                # Best effort per movie. Exception (not a bare except) so that
                # Ctrl-C still stops the run, even during the throttle pause.
                print(f"Filmaffinity. movie not found - {movie_title}")
                error = error + 1
            movies_details.append(movie)
        print(f"Filmaffinity. Total movies extracted: {len(movies_list)}")
        print(f"Filmaffinity. Movies extracted sucessfully: {len(movies_list)-error}")
        print(f"Filmaffinity. Movies that have not been extracted : {error}")
        return movies_details

In [12]:
class TheNumbers():
    """Read production budgets from the The Numbers budget listing."""

    url = "https://www.the-numbers.com/movie/budgets/all"
    # The listing is paginated by starting row: 1, 101, 201, ...
    page_size: int = 100
    # Exclusive upper bound for the starting row of the last page requested.
    last_row: int = 6301

    def get_dataframe(self):
        """Download every listing page and return them as one DataFrame.

        Returns:
            The first table of each page, concatenated in page order. Row
            labels are kept as parsed, so they repeat from page to page.
        """
        print("Extracting data...")
        frames = []
        for first_row in range(1, self.last_row, self.page_size):
            # The first page has no row suffix in its URL.
            url = self.url if first_row == 1 else self.url + "/" + str(first_row)
            page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
            page.encoding = "UTF-8"
            # read_html expects a file-like object; literal HTML strings are
            # deprecated in recent pandas releases.
            frames.append(pd.read_html(StringIO(page.text))[0])
        # DataFrame.append was removed in pandas 2.0; concat is its replacement.
        return pd.concat(frames)

    def extract(self, movies_list):
        """Append the production budget to every movie row.

        Args:
            movies_list: Rows whose first element is the movie title. Each
                row is extended in place with the budget as a digit string,
                or with ``""`` when the title is not in the listing.

        Returns:
            Only the rows for which a budget was found.
        """
        movies_details = []
        df = self.get_dataframe()
        # Lower-case the title column once instead of once per movie.
        titles = df["Movie"].str.lower()
        error = 0
        for movie in movies_list:
            movie_title = movie[0]
            movie_data = df[titles == movie_title.lower()]
            if len(movie_data) == 0:
                print("The-numbers. Movie not found - ", movie_title)
                error = error + 1
                # Always an empty budget here, never the value left over
                # from a previously matched movie.
                movie.append("")
            else:
                budget = movie_data.iloc[0]["ProductionBudget"]
                movie.append(budget.replace("$", "").replace(",", ""))
                movies_details.append(movie)
        print(f"The-Numbers. Total movies extracted: {len(movies_list)}")
        print(f"The-Numbers. Movies extracted sucessfully: {len(movies_list)-error}")
        print(f"The-Numbers. Movies that have not been extracted : {error}")
        return movies_details

In [27]:
class DataExtractor():
    """Run the three scrapers in order and export the combined result."""

    # Properties. The scraper annotations are strings so the class body does
    # not need the scraper classes to be defined at class-creation time.
    bom: "BoxOfficeMojo | None" = None
    ff: "Filmaffinity | None" = None
    tn: "TheNumbers | None" = None
    bom_data_list: list = []
    ff_data_list: list = []
    tn_data_list: list = []
    df: "DataFrame | None" = None
    labels: list = ['Movie title', 'Gross total', 'Max. theaters', 'Opening weekend gross', 'Release date', 'Distributor', 'Score', 'Running time[min]', 'Genres', 'Production Budget']
    path: str = './../services/node-red/files/'

    def __init__(self, since_year, to_year):
        """Create the three extractor objects for the given year range."""
        self.bom = BoxOfficeMojo(since_year=since_year, to_year=to_year)
        self.ff = Filmaffinity()
        self.tn = TheNumbers()
        # Per-instance result lists, so instances never share the class-level
        # default list objects.
        self.bom_data_list = []
        self.ff_data_list = []
        self.tn_data_list = []

    def extract_bom_data(self) -> list:
        """Extract and store the movie rows from Box Office Mojo.

        Returns:
            The list of movie rows.
        """
        self.bom_data_list = self.bom.extract_data()
        return self.bom_data_list

    def extract_ff_data(self, movies_list) -> list:
        """Extract and store the Filmaffinity fields for ``movies_list``.

        Args:
            movies_list: Movie rows coming from Box Office Mojo.

        Returns:
            The rows with the Filmaffinity columns added.
        """
        self.ff_data_list = self.ff.extract(movies_list)
        return self.ff_data_list

    def extract_tn_data(self, movies_list) -> list:
        """Extract and store the budgets from The Numbers for ``movies_list``.

        Args:
            movies_list: Movie rows coming from Box Office Mojo or Filmaffinity.

        Returns:
            The rows for which The Numbers returned a budget.
        """
        self.tn_data_list = self.tn.extract(movies_list)
        return self.tn_data_list

    def orchestrated_extaction(self):
        """Call the extractors in order and return the combined DataFrame.

        The DataFrame is built from ``ff_data_list``: the budget step extends
        those same row objects in place, so rows without a budget are kept.
        """
        self.extract_bom_data()
        self.extract_ff_data(self.bom_data_list)
        self.extract_tn_data(self.ff_data_list)
        return self.to_df(self.ff_data_list, self.labels)

    # Correctly spelled alias; the original name stays for existing callers.
    orchestrated_extraction = orchestrated_extaction

    def to_df(self, movies_list, labels) -> DataFrame:
        """Store and return ``movies_list`` as a DataFrame with ``labels`` as columns."""
        self.df = pd.DataFrame(movies_list, columns=labels)
        return self.df

    def export_to_csv_file(self, df):
        """Write ``df`` to ``<path>movies_list_<since_year>-<to_year>.csv``.

        Args:
            df: DataFrame to export; all of its columns are written, without
                the index.
        """
        print('Exporting data... ')
        file_name = self.path + 'movies_list_' + str(self.bom.since_year) + '-' + str(self.bom.to_year) + '.csv'
        # file_name already starts with self.path; prefixing the path a second
        # time pointed at a location that does not exist.
        df.to_csv(file_name, index=False, columns=df.columns.values.tolist())
        print(f'Data exported at {file_name} \nProcess finalized!')

## Extraction

In [33]:
# Extractor for the years 2012 up to, but not including, 2020.
ex = DataExtractor(since_year=2012, to_year=2020)

In [None]:
# Run the Box Office Mojo, Filmaffinity and The Numbers extractions in
# sequence and collect the rows into a DataFrame.
df = ex.orchestrated_extaction()

Box Mojo Office. Initializing process
Box Mojo Office. Extracting data... Year:2019
Box Mojo Office. Data extracted!
Filmaffinity. Trying to extract...0/152
Filmaffinity. Trying to extract...1/152
Filmaffinity. Trying to extract...2/152
Filmaffinity. Trying to extract...3/152
Filmaffinity. Trying to extract...4/152
Filmaffinity. Trying to extract...5/152
Filmaffinity. Trying to extract...6/152
Filmaffinity. Trying to extract...7/152
Filmaffinity. Trying to extract...8/152
Filmaffinity. Trying to extract...9/152
Filmaffinity. Trying to extract...10/152
Filmaffinity. Trying to extract...11/152
Filmaffinity. Trying to extract...12/152
Filmaffinity. Trying to extract...13/152
Filmaffinity. Trying to extract...14/152
Filmaffinity. Trying to extract...15/152
Filmaffinity. Trying to extract...16/152
Filmaffinity. Trying to extract...17/152
Filmaffinity. Trying to extract...18/152
Filmaffinity. Trying to extract...19/152
Filmaffinity. Trying to extract...20/152
Filmaffinity. Trying to extract.

# Cleaning

In [None]:
# Drop every row that has at least one missing value.
df = df.dropna()

In [None]:
# Print the column dtypes and non-null counts of the cleaned frame.
df.info()

## Cleaning unused cols

In [None]:
# Count how many rows each distributor has.
df["Distributor"].value_counts()

In [None]:
# Summary statistics of the Distributor column.
df["Distributor"].describe()

In [None]:
# The Distributor column is not used any further, so remove it.
df = df.drop(columns=["Distributor"])

## Getting dummies

In [None]:
# Normalise the genre text: remove single quotes, then lower-case it.
df["Genres"] = df["Genres"].str.replace("'", "").str.lower()

In [None]:
# Use this line IF the values in the Genres column are strings instead of lists:
# it strips the surrounding brackets and splits on commas.
df['Genres'] = df['Genres'].str.strip('[]').str.split(r'\s*,\s*')

# One indicator column per genre, one row per original row label.
# Kept in its own variable so the movie columns in df are not overwritten.
genre_dummies = (
    df['Genres'].explode()
    .str.get_dummies().groupby(level=0).sum().add_prefix('genre_')
)

# Replace the Genres column with its indicator columns.
df = df.drop(columns=["Genres"]).join(genre_dummies)

## Setting types

In [None]:
# Cast the running time column to integers.
# NOTE(review): astype(int) raises on values that are not integer literals
# (for example an empty string) — confirm such rows are removed beforehand.
df["Running time[min]"] = df["Running time[min]"].astype(int)

# Exporting to csv

In [None]:
# Display the final DataFrame as the cell output.
df

In [None]:
# Write the cleaned DataFrame to a CSV file named after the extractor's
# year range, under the extractor's configured path.
ex.export_to_csv_file(df)