## Final Project

In [197]:
# all imports
import pandas as pd                     # data frames
import numpy as np                      # ndarrays
import requests as rq                   # http requests
from bs4 import BeautifulSoup as soup   # beautiful soup for parsing
import regex as re                      # regex operations
from plotnine import *                  # descriptive plotting
from datetime import datetime as dt     # manipulating datetime types

## Get Data

In [198]:
# this requests the data on Movie Production Companies sorted by Worldwide Box Office, high->low
num_url = 'https://www.the-numbers.com/movies/production-companies/#production_companies_overview=od3'
num_rq = rq.get(num_url)
num_soup = soup(num_rq.text, 'lxml') # use 'lxml' instead of 'html.parser' -> problems avoided

ls = []                           # for storing entries and will be converted to data frame
body = num_soup.find('tbody')     # find beginning of useful data
trs = body.findAll('tr')          # extract all html <tbody> row entries 
for entry in trs:                 # each row 
    a = entry.find('a')['href']   # find link of each production company -> used to get all movies later
    a = a[26:]                    # remove excessive URL information -> hard code is ugly, but more concise
    x = entry.text.split('\n')    # get all entry data -> company name, movies made, US Boxoffice, Worldwide
    x = x[1:len(x)-1]             # trim data of two empty entries
    r = [a]                       # it has to be like this... even the separate rows
    r.extend(x)
    ls.append(r)                  # try to clearn this before final submission

numbers = pd.DataFrame(ls, columns=['Link','Company','No.of Movies','Domestic','Worldwide']) # create data frame

# Clean Data :
# - use regex to convert Number's format to int
# - compute International Box Office by subtracting Worldwide from Domestic 
numbers[numbers.columns[3:]] = numbers[numbers.columns[3:]].replace(r'[$,*]', '', regex=True).astype(int)
numbers['International'] = numbers['Worldwide'] - numbers['Domestic'] # compute US-International Box Office

## Minor EDA

- note: our data isn't too accurate yet, and does not really answer the main question
- world wide vs. domestic

## Questions for expansion
what are a few graphs I could explore with a group to
- generate interest
- indicate that more search is needed and possible


how accurate is this data? 

- when does $0 Worldwide become common? 
- near the bottom, we see that most companies have no profit. 
- inspect individual companies
- lots of missing data, but consistant sales reported
- why? not many production companies make their budgets public
- lets use the links from earlier to build a more accurate data frame

A note on film budgets
- the buget information that we really want is spare data, and not easily found
- I mean data on how much the props department was paid, how much the director was paid, the marketing staff, electricians, extra, each main actor, the producers
- this data could be much more insightful, allowing a breakdown of directors' success rates

## Retrieve more accurate data

After determining how to limit our requests (~14k is too much and irrelevant),
for every company considered, we will request the data from their individual file.
That individual file contains information on every film for each production company.

This is the data we want. There is budget information, date information (for normalizing) and we can drop
incomplete rows from consideration. Overall, this is an improvement in accuracy (perhaps measured later?)
- measure calculated sum which drops incomplete entries against the given sums on the main page


## Questions for expansion
what are a few graphs I could explore with a group to
- generate interest
- indicate that more search is needed and possible

how accurate is this data? 
- when does $0 Worldwide become common? 
- near the bottom, we see that most companies have no profit. 
- inspect individual companies
- lots of missing data, but consistant sales reported
- why? not many production companies make their budgets public
- lets use the links from earlier to build a more accurate data frame

A note on film budgets
- the buget information that we really want is spare data, and not easily found
- I mean data on how much the props department was paid, how much the director was paid, the marketing staff, electricians, extra, each main actor, the producers
- this data could be much more insightful, allowing a breakdown of directors' success rates

In [200]:
# this base URL is a route for each company
base_url = "https://www.the-numbers.com/movies/production-company"
all_entries = []
top1000 = numbers.head(1000)

# incoming data needs to be properly treated with regex -> returns list (empty or ready for entry)
# - separate Date and Film Title
# - separete budget, opening weekend, domestic, worldwide
# - regect all entries of inappropriate size

def treat_data(entry):
    ret = []
    t = entry.text
    x = re.split(r'(\w+\s\d+,\s\d{4})',t)[1:]
    if x :                                         # IF date is found
        date = dt.strptime(x[0],'%b %d, %Y').date()   # extract date with regex
        info = re.split(r'\n',x[1:][0])               
        title = info[0]                               # extract title
        money = re.split(r'\$',info[1])[1:]           # extract all monetary information
        if len(money) == 4:                           # IF all monetary information is accounted
            money = [re.sub(r',','',m) for m in money]
            ret.append(date)                            # add date
            ret.append(title)                           # add title
            for m in money: ret.append(m)               # add all money
    return ret                                     # return list, empty or accurate

t1 = dt.now()
for link,comp in zip(top1000.Link, top1000.Company): # for each entry in data frame
    full_url = base_url + link        # consruct full link using base
    r = rq.get(full_url)        # request full link
    if r.status_code == 200:    # IF status return is good
        SOUP = soup(r.text, 'lxml') # make soup
        body = SOUP.find('tbody')         # find beginning of useful data
        trs = body.findAll('tr')          # extract all <tr> html tags 
        for entry in trs:                 # each each row entry 
            x = treat_data(entry)            # treat data, returning [] if data is insufficient
            if x != []:                      # if data is good
                all_entries.append([comp]+x) # add company specific entry to total entries
    else: bad_status += 1
                
df = pd.DataFrame(all_entries, columns=['Company','Release Date','Film Title',     # film info
                                        'Budget','Opening','Domestic','Worldwide'] # monetary info
                 )
df[df.columns[3:]] = df[df.columns[3:]].astype(int)     # type cast monetary info as int
df['International'] = df['Worldwide'] - df['Domestic']  # compute International box office

TypeError: can only concatenate str (not "int") to str

In [186]:
# GET DATA SOURCE

# omdb    - online movie database on individual films
# numbers - monetary production data on production companies and films

# end with two dataframes
# - omdb .... hmmm
# - numbers main page sorted by highest worldwide

# EXPAND ON CURRENT DATA
# omdb
# - for each production company on the main page
#      + save the company name
#      + http request the associated link
#      + create a data frame for all the movies with their budgets
#      + append this data to the master dataframe where all movies with their budgets and production companies are listed

# - rename to Domestic to US-Domestic
# - create new column for US-International = (WorldWide - US)
# - create column for Profit Margin = 