In [1]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import os.path
import re
from datetime import datetime
import time

In [2]:
from multiprocessing import Pool

In [3]:
if os.path.exists('./filtered_movies.pkl'):
    movies = pd.read_pickle("./filtered_movies.pkl")
else:
    title_basics = pd.read_csv("data/title.basics.tsv", sep='\t')
    movies = title_basics[title_basics.titleType == 'movie']
    movies.to_pickle("./filtered_movies.pkl")
movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
145,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,20,"Documentary,News,Sport"
332,tt0000335,movie,Soldiers of the Cross,Soldiers of the Cross,0,1900,\N,\N,"Biography,Drama"
499,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Biography,Crime,Drama"


In [4]:
len(movies)

534354

In [5]:

base_url = 'https://www.imdb.com/title/'
urls=[]
for index, row in movies[:50].iterrows():
    urls.append(base_url + row['tconst'])

In [6]:
def get_data(url):
    scraped_data = {
    "tconst": [],
    "stars": [],
    "oscarWins": [],
    "nominations": [],
    "wins": [],
    "releaseDate": [],
    "releaseCountry": [],
    "plotKeywords": [],
    "budget": [],
    "worldwideGross": [],
    "metascore": [],
    "musicProducer": []
    }
    
    info=[]
    r = get(url)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')
    
    #tconst
    
    tconst=url.rsplit('/', 1)[-1]
    scraped_data['tconst'].append(tconst)
    # Stars
    stars = []
    stars_h4 = soup.find('h4', string='Stars:')
    if stars_h4 is not None:
        star_atags_parent = stars_h4.parent
        if star_atags_parent is not None:
            star_atags = star_atags_parent.find_all('a')
            if star_atags is not None:
                for atag in star_atags:
                    if atag['href'].startswith('/name/'):
                        stars.append(atag['href'].split('/')[2])
    scraped_data['stars'].append(stars)
    
    
    # Metascore
    metascore = None
    metascore_list = soup.select('.metacriticScore span:first-child')
    if len(metascore_list) > 0:
        metascore = metascore_list[0].string
        
    scraped_data['metascore'].append(metascore)
    
    
    #awards
    awrds_lines = soup.find_all(class_="awards-blurb")
    oscars = 0
    wins = 0
    nominations = 0
    for line in awrds_lines:
        
        if line.findChild() is not None:
            prepped_str = re.sub(' +', ' ', line.findChild().text.replace("\n", " ").strip())
            res = re.search('(W|w)on (\d+) (O|o)scars.?', prepped_str)
            if res is not None:
                oscars = int(res.group(2))
            
        else:
            prepped_str = re.sub(' +', ' ', line.text.replace("\n", "").strip())
            
            res = re.search('(\d+) wins', prepped_str)
            if res is not None:
                wins = int(res.group(1))
            
            
            res = re.search('(\d+) nominations', prepped_str)
            if res is not None:
                nominations = int(res.group(1))
    scraped_data['oscarWins'].append(oscars)
    scraped_data['wins'].append(wins)
    scraped_data['nominations'].append(nominations)
    
    
    # Release date
    release_date_h4 = soup.find('h4', string='Release Date:')
    release_date = None
    release_country = None
    if release_date_h4 is not None:
        release_date_raw_text = release_date_h4.parent.findAll(text=True, recursive=False)
        release_date_prepped = re.sub(' +', ' ', ''.join(release_date_raw_text).replace("\n", "").strip())
        date_str_match = re.search(r'\d{1,2} \w+ \d{4}', release_date_prepped)
        if date_str_match is not None:
            release_date = datetime.strptime(date_str_match.group(), '%d %B %Y').date()
        release_country_match = re.search(r'\(([a-zA-Z ]{2,})\)', release_date_prepped)
        if release_country_match is not None and len(release_country_match.groups()) > 0:
            release_country = release_country_match.group(1)
        
    scraped_data['releaseDate'].append(release_date)
    scraped_data['releaseCountry'].append(release_country)
    
    
    # Budget
    budget_h4 = soup.find('h4', string='Budget:')
    budget = None
    if budget_h4 is not None:
        budget_raw_text = budget_h4.parent.findAll(text=True, recursive=False)
        budget = re.sub(' +', ' ', ''.join(budget_raw_text).replace("\n", "").strip())
        
    scraped_data['budget'].append(budget)
    
    
    # worldwide gross
    gross_h4 = soup.find('h4', string='Cumulative Worldwide Gross:')
    gross = None
    if gross_h4 is not None:
        gross_h4_text = gross_h4.parent.findAll(text=True, recursive=False)
        gross = re.sub(' +', ' ', ''.join(gross_h4_text).replace("\n", "").strip())
    
    scraped_data['worldwideGross'].append(gross)
    
    
    # Plot keywords
    keywords_verification_threshold = 2 # Consider only words atleast 2 people considered relavent
    keywords_url = url + "/keywords"
    r = get(keywords_url)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')
    keywords = []
    plot_keywords_items = soup.find_all(class_="soda sodavote")
    if plot_keywords_items is not None:
        for plot_keywords_item in plot_keywords_items:
            validity_text = plot_keywords_item.find(class_='interesting-count-text').a.text.strip()
            validity_text_match = re.search(r'(\d+) of', validity_text)
            if validity_text_match is not None and len(validity_text_match.groups()) > 0:
                if int(validity_text_match.group(1)) >= keywords_verification_threshold:
                    keywords.append(plot_keywords_item.find(class_='sodatext').a.text.strip())
    
    scraped_data['plotKeywords'].append(keywords)
    
    
    # Music producer
    fullcredits_url = url + "/fullcredits"
    r = get(fullcredits_url)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')
    
    music_producer = None
    
    full_credits_container = soup.find(id='fullcredits_content', class_='header')
    if full_credits_container is not None:
        full_credits = full_credits_container.find_all(recursive=False)
        if full_credits is not None:
            for idx, item in enumerate(full_credits, start=0):
                if 'Music by' in item.text:
                    producer_atag = full_credits[idx + 1].find('a')
                    if producer_atag is not None:
                        producer_href = producer_atag['href']
                        if producer_href is not None:
                            music_producer = producer_href.split('/')[2]
                            break
    
    scraped_data['musicProducer'].append(music_producer)
    return scraped_data


In [None]:
p=Pool(10)
start = time.time()
with Pool(10) as p:
    data=p.map(get_data,urls)
    p.terminate()
    p.join()
end = time.time()
print('Time taken: %f seconds' % (end - start) )

In [8]:
movies_df = pd.DataFrame(data=data)
movies_df

Unnamed: 0,tconst,stars,oscarWins,nominations,wins,releaseDate,releaseCountry,plotKeywords,budget,worldwideGross,metascore,musicProducer
0,[tt0000009],"[[nm0063086, nm0183823, nm1309758]]",[0],[0],[0],[1894-10-09],[USA],[[]],[None],[None],[None],[None]
1,[tt0000147],"[[nm0179163, nm0280615, nm4082222]]",[0],[0],[0],[None],[USA],[[]],[None],[None],[None],[None]
2,[tt0000335],"[[nm1010955, nm1012612, nm1011210]]",[0],[0],[0],[1900-09-13],[Australia],[[]],[None],[None],[None],[None]
3,[tt0000502],"[[nm0215752, nm0252720]]",[0],[0],[0],[None],[None],[[]],[None],[None],[None],[None]
4,[tt0000574],"[[nm0846887, nm0846894, nm3002376]]",[0],[0],[0],[1906-12-26],[Australia],[[]],"[$2,250]",[None],[None],[nm2421834]
5,[tt0000615],"[[nm3071427, nm0581353, nm0888988]]",[0],[0],[0],[1907-11-02],[Australia],[[]],"[AUD1,000]",[None],[None],[None]
6,[tt0000630],[[]],[0],[0],[0],[None],[Italy],[[]],[None],[None],[None],[None]
7,[tt0000675],[[]],[0],[0],[0],[None],[None],[[]],[None],[None],[None],[None]
8,[tt0000676],"[[nm0097421, nm0140054]]",[0],[0],[0],[None],[None],[[]],[None],[None],[None],[None]
9,[tt0000679],"[[nm0000875, nm0122665, nm0933446]]",[0],[0],[0],[1908-09-24],[USA],[[]],[None],[None],[None],[nm0542903]
