In [1]:
import os
import re
from urllib.parse import quote_plus

%run Request_utils.ipynb

import collections

import pandas as pd

In [2]:
# ---------------------------------------------------------------------------
# Site URLs and query fragments
# ---------------------------------------------------------------------------
MAIN_PAGE_LINK = "https://www.boxofficemojo.com/"
BASIC_PATH = "https://www.boxofficemojo.com"
SEARCH_QUERY = "search/?q="

# ---------------------------------------------------------------------------
# CSS class names / element ids used to locate sections of a page
# ---------------------------------------------------------------------------
DIV_SEARCH_CLASS_NAME = "a-fixed-left-grid-col a-col-right"
DIV_DESCRIPTION_CLASS = "a-fixed-left-grid-col a-col-right"
SPAN_DESCRIPTAION_CLASS = "a-size-medium"
DIV_TABLE_DETAILS_CLASS = "a-section a-spacing-none mojo-summary-values mojo-hidden-from-mobile"
DIV_MIDDLE_BAR_TITLE = "mojo-link-bar-internal mojo-flex mojo-flex-h"
# An earlier value for the next selector was "a-section mojo-h-scroll".
DIV_PREFORMANCE_CLASS = "a-section a-spacing-none mojo-h-scroll releases-by-region-section"
DIV_ORIGINAL_RELEASE_DROP_CLASS = "a-section a-spacing-none mojo-dropdown-clear"
CREW_TABLE_ID = "principalCrew"
CAST_TABLE_ID = "principalCast"

# ---------------------------------------------------------------------------
# Row labels looked up in the movie details table
# ---------------------------------------------------------------------------
TABLE_ROW_TITLE_EARLIESt_RELEASE_DATE = "Earliest Release Date"
TABLE_ROW_TITLE_MPAA = "MPAA"
TABLE_ROW_TITLE_RUNNING_TIME = "Running Time"
TABLE_ROW_TITLE_GENRES = "Genres"
TABLE_ROW_TITLE_BUDGET = "Budget"

# ---------------------------------------------------------------------------
# href fragments matched against the links of the middle navigation bar
# ---------------------------------------------------------------------------
MIDDLE_BAR_PERFORMANCE_VALUE = "?ref_=bo_tt_tab"
MIDDLE_BAR_CAST_AND_CREW_VALUE = "credits/?ref_=bo_tt_tab"

# ---------------------------------------------------------------------------
# Column order of the output table; rows built by the scraper follow it.
# ---------------------------------------------------------------------------
COLUMNS = [
    # general movie details
    'movie_link', 'movie_name', 'year', 'description', 'genres', 'running_time',
    'earliest_release_date', 'mpaa', 'budget', 'preformence', 'num_of_countries', 'cast',
    # crew: a list of names and a head count per role
    'director_names', 'director_sum',
    'writer_names', 'writer_sum',
    'producer_names', 'producer_sum',
    'composer_names', 'composer_sum',
    'cinematographer_names', 'cinematographer_sum',
    'editor_names', 'editor_sum',
    'production_designer_names', 'production_designer_sum',
    # income summary
    'domestic_income', 'domestic_percent',
    'international_income', 'international_percent',
    'worldwide_income',
]

scraping_row_number = 0

In [7]:
def search_movie(name, year):
    """Search the site for a movie and return the link of the first matching result.

    The search page is requested for ``name``; every result block is then
    accepted if the text of its first ``<span>`` contains ``year`` or the year
    before it.

    Args:
        name: Movie title to search for.
        year: Release year; anything ``int()`` accepts.

    Returns:
        ``BASIC_PATH`` joined with the ``href`` of the first matching result,
        or ``None`` when no result matches.
    """
    # quote_plus turns spaces into "+" and also escapes characters such as
    # "&", "#" or "?" that would otherwise cut the query short.
    url = MAIN_PAGE_LINK + SEARCH_QUERY + quote_plus(name)
    soup = load_soup_object(get_url_page(url))

    for result in soup.find_all("div", {"class": DIV_SEARCH_CLASS_NAME}):
        span = result.find("span")
        link = result.find("a")
        # Result blocks lacking the year text or a link cannot be matched.
        if span is None or link is None:
            continue
        href = link.get("href")
        if href is None:
            continue
        # Normalise through int() so 2010, "2010" and 2010.0 all compare as "2010".
        wanted_year = int(year)
        if str(wanted_year) in span.text or str(wanted_year - 1) in span.text:
            return BASIC_PATH + href
    return None
    

In [8]:
def get_movie_description(soup_object):
    """Return the movie description text found on a movie page.

    Args:
        soup_object: Parsed movie page supporting ``find``.

    Returns:
        The text of the ``SPAN_DESCRIPTAION_CLASS`` span inside the first
        ``DIV_DESCRIPTION_CLASS`` div, or ``None`` when either element is missing.
    """
    div = soup_object.find("div", {"class": DIV_DESCRIPTION_CLASS})
    if div is None:
        return None
    span = div.find("span", {"class": SPAN_DESCRIPTAION_CLASS})
    # A page without a description yields None, like the other getters.
    return None if span is None else span.text

def get_relese_date(soup_object):
    """Return the movie's earliest release date.

    The raw text of the ``TABLE_ROW_TITLE_EARLIESt_RELEASE_DATE`` row is read
    with ``get_value_from_movie_table`` and passed to ``get_date_from_string``.
    """
    return get_date_from_string(
        get_value_from_movie_table(soup_object, TABLE_ROW_TITLE_EARLIESt_RELEASE_DATE)
    )

def get_mpaa(soup_object):
    """Return the value of the ``TABLE_ROW_TITLE_MPAA`` row of the details table.

    The result is whatever ``get_value_from_movie_table`` returns for that row.
    """
    mpaa_rating = get_value_from_movie_table(soup_object, TABLE_ROW_TITLE_MPAA)
    return mpaa_rating

def get_budget(soup_object):
    """Return the value of the ``TABLE_ROW_TITLE_BUDGET`` row of the details table.

    The result is whatever ``get_value_from_movie_table`` returns for that row.
    """
    budget_text = get_value_from_movie_table(soup_object, TABLE_ROW_TITLE_BUDGET)
    return budget_text

def get_running_time(soup_object):
    """Return the movie's running time as converted by ``convert_string_to_minutes``.

    The raw text comes from the ``TABLE_ROW_TITLE_RUNNING_TIME`` row of the
    details table.
    """
    return convert_string_to_minutes(
        get_value_from_movie_table(soup_object, TABLE_ROW_TITLE_RUNNING_TIME)
    )

def get_genres(soup_object):
    """Return the movie's genres as a list of whitespace-separated tokens.

    Returns:
        The ``TABLE_ROW_TITLE_GENRES`` row text split on whitespace, or ``None``
        when ``get_value_from_movie_table`` returns ``None`` for that row.
    """
    raw_genres = get_value_from_movie_table(soup_object, TABLE_ROW_TITLE_GENRES)
    return raw_genres.split() if raw_genres is not None else None


def get_value_from_movie_table(soup_object, row_name):
    """Look up one row of the movie details table by its label.

    Every ``<div>`` nested in the ``DIV_TABLE_DETAILS_CLASS`` container is
    treated as a candidate row whose first ``<span>`` is the label and whose
    second ``<span>`` is the value.

    Args:
        soup_object: Parsed movie page supporting ``find``.
        row_name: Exact label text to look for.

    Returns:
        The text of the value span of the first matching row, or ``None`` when
        the container or the row is missing.
    """
    div = soup_object.find("div", {"class": DIV_TABLE_DETAILS_CLASS})
    if div is None:
        return None
    for row in div.find_all("div"):
        spans = row.find_all("span")
        # find_all("div") also returns nested/decorative divs; only divs that
        # hold both a label span and a value span can be table rows.
        if len(spans) >= 2 and spans[0].text == row_name:
            return spans[1].text
    return None


    


In [9]:
def get_preformance_table(soup_object):
    """Return the cell texts of the per-region performance tables of the original release.

    The "Original Release" page linked from ``soup_object`` is fetched and every
    ``<tr>`` inside the ``DIV_PREFORMANCE_CLASS`` div that contains ``<td>``
    cells is collected.

    Args:
        soup_object: Parsed movie page.

    Returns:
        A list with one list of cell texts per data row, or ``None`` when the
        movie page has no original-release link or the release page has no
        performance section.
    """
    link = get_original_release_page_link(soup_object)
    if link is None:
        return None
    release_soup = load_soup_object(get_url_page(link))
    # attrs is a dict (attribute name -> value), like every other lookup here.
    div = release_soup.find("div", {"class": DIV_PREFORMANCE_CLASS})
    if div is None:
        return None
    data = []
    for tr in div.find_all("tr"):
        cells = tr.find_all("td")
        # Rows without <td> cells (e.g. header rows) are not data rows.
        if cells:
            data.append([td.text for td in cells])
    return data

def get_original_release_page_link(soup_object):
    """Return the link of the "Original Release" entry of the release drop-down.

    Returns:
        ``BASIC_PATH`` joined with the ``value`` of the first ``<option>`` whose
        text is exactly "Original Release", or ``None`` when the
        ``DIV_ORIGINAL_RELEASE_DROP_CLASS`` div or such an option is missing.
    """
    dropdown = soup_object.find('div', {"class": DIV_ORIGINAL_RELEASE_DROP_CLASS})
    if dropdown is not None:
        for entry in dropdown.find_all('option'):
            if entry.text == "Original Release":
                return BASIC_PATH + entry.get('value')
    return None

In [10]:
def _get_credits_rows(soup_object, table_id):
    """Fetch the cast-and-crew page and return (name, second cell) pairs of one table.

    Args:
        soup_object: Parsed movie page; its middle bar provides the credits link.
        table_id: ``id`` attribute of the ``<table>`` to read.

    Returns:
        A list of ``(name, cell)`` tuples, one per usable row, or ``None`` when
        the credits link or the table is missing.
    """
    link = get_middle_bar_links(soup_object, MIDDLE_BAR_CAST_AND_CREW_VALUE)
    if link is None:
        return None
    credits_soup = load_soup_object(get_url_page(link))
    table = credits_soup.find("table", {"id": table_id})
    if table is None:
        return None

    rows = []
    # The first <tr> is always skipped (presumably the header row).
    for tr in table.find_all("tr")[1:]:
        cells = tr.find_all("td")
        if len(cells) < 2:
            # Not a name/role row; nothing to extract.
            continue
        anchor = cells[0].find("a")
        # Names are normally wrapped in a link; fall back to the cell's own
        # text so an unlinked person is still counted.
        name_source = anchor if anchor is not None else cells[0]
        name_lines = remove_empty_lines(name_source.text)
        if not name_lines:
            continue
        rows.append((name_lines[0], cells[1]))
    return rows


def get_crew_table(soup_object):
    """Return the principal crew of a movie as ``[name, role]`` pairs.

    The role is the raw text of the second cell of each row of the
    ``CREW_TABLE_ID`` table.

    Returns:
        A list of two-item lists, or ``None`` when the credits page or the crew
        table is missing.
    """
    rows = _get_credits_rows(soup_object, CREW_TABLE_ID)
    if rows is None:
        return None
    return [[name, role_cell.text] for name, role_cell in rows]


def get_cast_table(soup_object):
    """Return the principal cast of a movie as ``[name, role]`` pairs.

    The role is read from the expander div of the second cell of each row of
    the ``CAST_TABLE_ID`` table and cleaned with ``get_text_value_helper``.

    Returns:
        A list of two-item lists, or ``None`` when the credits page or the cast
        table is missing.
    """
    rows = _get_credits_rows(soup_object, CAST_TABLE_ID)
    if rows is None:
        return None
    cast_list = []
    for name, role_cell in rows:
        role_div = role_cell.find(
            "div", {"class": "a-expander-content a-expander-partial-collapse-content"}
        )
        cast_list.append([name, get_text_value_helper(role_div)])
    return cast_list

def get_middle_bar_links(soup_object, match_value):
    """Return the first middle-bar link whose ``href`` contains ``match_value``.

    Args:
        soup_object: Parsed movie page supporting ``find``.
        match_value: Substring to look for in each link's ``href``.

    Returns:
        ``BASIC_PATH`` joined with the matching ``href``, or ``None`` when the
        ``DIV_MIDDLE_BAR_TITLE`` div is missing or no link matches.
    """
    div = soup_object.find("div", {"class": DIV_MIDDLE_BAR_TITLE})
    if div is None:
        return None
    for anchor in div.find_all('a'):
        href = anchor.get("href")
        # Anchors without an href cannot match and must not break the scan.
        if href is not None and match_value in href:
            return BASIC_PATH + href
    return None

def get_crew(soup_object):
    """Group the crew of a movie by role.

    Returns:
        A ``collections.defaultdict(list)`` mapping each role to the names
        listed for it by ``get_crew_table``, or ``None`` when that function
        returns ``None``.
    """
    crew_rows = get_crew_table(soup_object)
    if crew_rows is None:
        return None
    names_by_role = collections.defaultdict(list)
    for name, role in crew_rows:
        names_by_role[role].append(name)
    return names_by_role

def get_cast_names(soup_object):
    """Return the names of the movie's cast.

    Returns:
        The first item of every entry produced by ``get_cast_table``, in order,
        or ``None`` when that function returns ``None``.
    """
    cast_rows = get_cast_table(soup_object)
    if cast_rows is None:
        return None
    return [entry[0] for entry in cast_rows]

In [11]:

def get_performance_summary_table(soup_obj):
    """Return the income summary of the movie's original release.

    The "Original Release" page linked from ``soup_obj`` is fetched and each
    entry of its performance summary section is turned into two dictionary
    items: ``<label>`` (text of the "money" span) and ``<label>_percent``
    (text of the "percent" span). Labels are reduced to their alphabetic
    characters.

    Args:
        soup_obj: Parsed movie page.

    Returns:
        The dictionary described above, or ``None`` when there is no
        original-release link or the release page has no summary section.
        Values are ``None`` for spans that are missing.
    """
    link = get_original_release_page_link(soup_obj)
    if link is None:
        return None
    release_soup = load_soup_object(get_url_page(link))
    summary_div = release_soup.find(
        "div", {"class": "a-section a-spacing-none mojo-performance-summary-table"}
    )
    if summary_div is None:
        return None

    table_data = {}
    for item in summary_div.find_all("div", {"class": "a-section a-spacing-none"}):
        raw_label = get_text_value_helper(item.find("span", {"class": "a-size-small"}))
        if raw_label is None:
            # An entry without a label cannot be keyed; skip it.
            continue
        label = remove_non_string_characters(raw_label)
        table_data[label] = get_text_value_helper(item.find("span", {"class": "money"}))
        table_data[label + "_percent"] = get_text_value_helper(
            item.find("span", {"class": "percent"})
        )
    return table_data

In [12]:
def scrape_movie_details(movie_name, year, movie_link):
    """Scrape one movie page and return its details as a row in ``COLUMNS`` order.

    Args:
        movie_name: Title of the movie, copied to the output row.
        year: Release year, copied to the output row.
        movie_link: URL of the movie page to scrape.

    Returns:
        A list of 31 values: link, name, year, description, genres, running
        time, earliest release date, MPAA, budget, performance table, number
        of performance rows, cast names, then a names list and a count for
        each of seven crew roles, then five income summary values. Items that
        could not be read are ``None``.
    """
    page_soup = load_soup_object(get_url_page(movie_link))

    description = get_movie_description(page_soup)
    genres = get_genres(page_soup)
    running_time = get_running_time(page_soup)
    earliest_release_date = get_relese_date(page_soup)
    mpaa = get_mpaa(page_soup)
    budget = get_budget(page_soup)
    cast = get_cast_names(page_soup)

    # Each role contributes a (names, count) pair, in the order used by COLUMNS.
    crew_roles = ('Director', 'Writer', 'Producer', 'Composer',
                  'Cinematographer', 'Editor', 'Production Designer')
    crew = get_crew(page_soup)
    crew_values = []
    for role in crew_roles:
        if crew is None:
            crew_values.extend([None, None])
        else:
            names = crew.get(role, [])
            crew_values.extend([names, len(names)])

    preformence = get_preformance_table(page_soup)
    num_of_countries = len(preformence) if preformence is not None else None

    # .get() keeps a partially filled summary (e.g. no "International" entry)
    # from raising KeyError; missing entries become None.
    summary_table = get_performance_summary_table(page_soup) or {}
    income_values = [
        summary_table.get(key)
        for key in ("Domestic", "Domestic_percent",
                    "International", "International_percent", "Worldwide")
    ]

    return [movie_link, movie_name, year, description, genres, running_time,
            earliest_release_date, mpaa, budget, preformence, num_of_countries,
            cast, *crew_values, *income_values]

In [36]:
def scrape_2007_2021_movies(start_index=0):
    """Scrape every movie listed in ``data/wildaboutmovies_movies.csv``.

    For each CSV row the movie is searched by ``movie_name`` and ``year``; when
    a page is found its details are scraped, otherwise a placeholder row
    holding only the name and year is recorded.

    Args:
        start_index: Rows whose index is lower than this are skipped, which
            allows resuming an interrupted run. Defaults to 0 (all rows).

    Returns:
        A DataFrame built by ``convert_movie_list_to_df`` from the rows gathered
        so far. If any step raises, the exception is printed, scraping stops and
        the partial result is returned.
    """
    movies = pd.read_csv('data/wildaboutmovies_movies.csv')
    table = []
    try:
        for index, movie in movies.iterrows():
            if index < start_index:
                continue
            movie_name, year = movie["movie_name"], movie["year"]
            path = search_movie(movie_name, year)
            if path is not None:
                table.append(scrape_movie_details(movie_name, year, path))
            else:
                # Pad to the full column count so the table stays rectangular
                # even when no movie at all could be scraped.
                placeholder = [None] * len(COLUMNS)
                placeholder[:3] = [path, movie_name, year]
                table.append(placeholder)
    except Exception as e:
        # Best effort: report the failure and keep what was gathered so far.
        print(e)
    return convert_movie_list_to_df(table)
    

In [1]:
def main_function():
    """Run the full scrape and write the result to ``./data/All_movies_table.csv``."""
    movies_df = scrape_2007_2021_movies()
    movies_df.to_csv("./data/All_movies_table.csv")

In [6]:
#string function utils
from dateutil import parser
def get_date_from_string(test_str):
    """Parse a date out of free text, ignoring bracketed or parenthesised parts.

    Args:
        test_str: Text containing a date, or ``None``.

    Returns:
        Whatever ``parser.parse(..., fuzzy=True)`` returns for the cleaned text,
        or ``None`` when ``test_str`` is ``None`` or no date can be parsed.
    """
    if test_str is None:
        return None
    # Raw string: "\(" and "\[" are regex escapes, not Python string escapes.
    cleaned = re.sub(r"[\(\[].*?[\)\]]", "", test_str)
    try:
        return parser.parse(cleaned, fuzzy=True)
    except (ValueError, OverflowError):
        # Unparseable text is reported as a missing date instead of an error.
        return None
def convert_string_to_minutes(str_text):
    """Convert a running-time text such as "2 hr 15 min" to a number of minutes.

    Args:
        str_text: Running-time text, or ``None``.

    Returns:
        Total minutes as an ``int``. "2 hr 15 min" gives 135, "2 hr" gives 120
        and "45 min" gives 45. ``None`` is returned for ``None`` or blank input.
        Text without "hr"/"min" units is read positionally: the first token as
        hours and, when present, the third token as minutes.
    """
    if str_text is None:
        return None

    hours_match = re.search(r"(\d+)\s*hr", str_text)
    minutes_match = re.search(r"(\d+)\s*min", str_text)

    if hours_match is None and minutes_match is None:
        # No recognisable units: fall back to positional parsing.
        tokens = str_text.split()
        if not tokens:
            return None
        minutes = int(tokens[0]) * 60
        if len(tokens) >= 3:
            minutes += int(tokens[2])
        return minutes

    # Read each unit on its own so a minutes-only value is not taken for hours.
    minutes = 0
    if hours_match is not None:
        minutes += int(hours_match.group(1)) * 60
    if minutes_match is not None:
        minutes += int(minutes_match.group(1))
    return minutes

def remove_empty_lines(string):
    """Split text on newlines and drop the lines that are empty or only whitespace.

    Returns:
        The remaining lines, unmodified and in order, or ``None`` when
        ``string`` is ``None``.
    """
    if string is None:
        return None
    kept_lines = []
    for line in string.split("\n"):
        if line.strip():
            kept_lines.append(line)
    return kept_lines
def get_text_value_helper(soup_obj):
    """Return an element's text with every newline and space removed.

    Returns:
        The cleaned ``soup_obj.text``, or ``None`` when ``soup_obj`` is ``None``.
    """
    if soup_obj is not None:
        raw_text = soup_obj.text
        without_newlines = raw_text.replace("\n", "")
        return without_newlines.replace(" ", "")
    return None

def remove_non_string_characters(string):
    """Return ``string`` reduced to its alphabetic characters, in order."""
    letters = []
    for character in string:
        if character.isalpha():
            letters.append(character)
    return ''.join(letters)
def convert_movie_list_to_df(data):
    """Build a DataFrame from scraped rows, labelling the columns with ``COLUMNS``."""
    movies_df = pd.DataFrame(data, columns=COLUMNS)
    return movies_df
