In [4]:
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
import os.path
import re
from datetime import datetime
import time

In [5]:
# Build the movie-only subset from the raw tab-separated title file the first
# time through and cache it as a pickle; later runs load the pickle directly.
if not os.path.exists('./filtered_movies.pkl'):
    title_basics = pd.read_csv("data/title.basics.tsv", sep='\t')
    is_movie = title_basics['titleType'] == 'movie'
    movies = title_basics[is_movie]
    movies.to_pickle("./filtered_movies.pkl")
else:
    movies = pd.read_pickle("./filtered_movies.pkl")
movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
145,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,20,"Documentary,News,Sport"
332,tt0000335,movie,Soldiers of the Cross,Soldiers of the Cross,0,1900,\N,\N,"Biography,Drama"
499,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Biography,Crime,Drama"


In [6]:
# Row count of the filtered movie frame.
movies.shape[0]

534354

In [7]:
# Scrape per-title details from IMDb for the first 100 rows of `movies`.
# Each title needs three requests: the title page, its /keywords page and its
# /fullcredits page. Results are collected column-wise in `scraped_data`.
scraped_data = {
    "tconst": [],
    "stars": [],
    "oscarWins": [],
    "nominations": [],
    "wins": [],
    "releaseDate": [],
    "releaseCountry": [],
    "plotKeywords": [],
    "budget": [],
    "worldwideGross": [],
    "metascore": [],
    "musicProducer": []
}

base_url = 'https://www.imdb.com/title/'
# Seconds to wait for a response before giving up; without a timeout a single
# stalled connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30
# Consider only keywords that at least 2 people considered relevant.
keywords_verification_threshold = 2


def _fetch_soup(page_url):
    """Download `page_url` and return its body parsed with html.parser."""
    response = get(page_url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'html.parser')


def _own_text(heading):
    """Return the text nodes that are direct children of `heading`'s parent.

    The pieces are concatenated, newlines removed, the result stripped and
    runs of spaces collapsed to a single space.
    """
    pieces = heading.parent.find_all(string=True, recursive=False)
    return re.sub(' +', ' ', ''.join(pieces).replace("\n", "").strip())


# Label of the last row, used only in the progress message (loop-invariant).
last_index = movies.index[-1] if len(movies) else None

start = time.time()
for index, row in movies[:100].iterrows():
    tconst = row['tconst']
    print("Processing %d of %s. Id %s" % (index, last_index, tconst))
    url = base_url + tconst

    soup = _fetch_soup(url)

    # Stars: ids are taken from links of the form /name/<id>/...
    stars = []
    stars_h4 = soup.find('h4', string='Stars:')
    if stars_h4 is not None and stars_h4.parent is not None:
        for atag in stars_h4.parent.find_all('a'):
            # .get() tolerates anchors that carry no href attribute.
            href = atag.get('href', '')
            if href.startswith('/name/'):
                stars.append(href.split('/')[2])
    print('stars', type(stars))
    if len(stars) > 0:
        print('starsitem', type(stars[0]))

    # Metascore
    metascore = None
    metascore_list = soup.select('.metacriticScore span:first-child')
    if len(metascore_list) > 0:
        score_text = metascore_list[0].string
        if score_text is not None:
            # Convert to a plain str so the stored value does not keep the
            # whole parse tree alive through the NavigableString.
            metascore = str(score_text)
    print('metascore', type(metascore))

    # Awards
    oscars = 0
    wins = 0
    nominations = 0
    for line in soup.find_all(class_="awards-blurb"):
        highlighted = line.findChild()
        if highlighted is not None:
            # Blurbs with a child element are searched for the Oscar count only.
            prepped_str = re.sub(' +', ' ', highlighted.text.replace("\n", " ").strip())
            # Trailing "s" is optional so "Won 1 Oscar" is counted too.
            res = re.search(r'[Ww]on (\d+) [Oo]scars?', prepped_str)
            if res is not None:
                oscars = int(res.group(1))
        else:
            prepped_str = re.sub(' +', ' ', line.text.replace("\n", "").strip())

            # Singular forms ("1 win", "1 nomination") must match as well.
            res = re.search(r'(\d+) wins?\b', prepped_str)
            if res is not None:
                wins = int(res.group(1))

            res = re.search(r'(\d+) nominations?\b', prepped_str)
            if res is not None:
                nominations = int(res.group(1))
    print('oscarWins', type(oscars))
    print('wins', type(wins))
    print('nominations', type(nominations))

    # Release date
    release_date = None
    release_country = None
    release_date_h4 = soup.find('h4', string='Release Date:')
    if release_date_h4 is not None:
        release_date_prepped = _own_text(release_date_h4)
        date_str_match = re.search(r'\d{1,2} \w+ \d{4}', release_date_prepped)
        if date_str_match is not None:
            try:
                release_date = datetime.strptime(date_str_match.group(), '%d %B %Y').date()
            except ValueError:
                # The middle word was not a full month name; leave the date unset
                # instead of aborting the whole run.
                release_date = None
        release_country_match = re.search(r'\(([a-zA-Z ]{2,})\)', release_date_prepped)
        if release_country_match is not None:
            release_country = release_country_match.group(1)
    print('releaseDate', type(release_date))
    print('releaseCountry', type(release_country))

    # Budget
    budget = None
    budget_h4 = soup.find('h4', string='Budget:')
    if budget_h4 is not None:
        budget = _own_text(budget_h4)
    print('budget', type(budget))

    # Worldwide gross
    gross = None
    gross_h4 = soup.find('h4', string='Cumulative Worldwide Gross:')
    if gross_h4 is not None:
        gross = _own_text(gross_h4)
    print('worldwideGross', type(gross))

    # Plot keywords
    soup = _fetch_soup(url + "/keywords")
    keywords = []
    for plot_keywords_item in soup.find_all(class_="soda sodavote"):
        count_node = plot_keywords_item.find(class_='interesting-count-text')
        keyword_node = plot_keywords_item.find(class_='sodatext')
        # Skip entries missing either the vote link or the keyword link.
        if count_node is None or count_node.a is None:
            continue
        if keyword_node is None or keyword_node.a is None:
            continue
        validity_text_match = re.search(r'(\d+) of', count_node.a.text.strip())
        if validity_text_match is None:
            continue
        if int(validity_text_match.group(1)) >= keywords_verification_threshold:
            keywords.append(keyword_node.a.text.strip())
    print('plotKeywords', type(keywords))
    if len(keywords) > 0:
        print('plotKeywordsitem', type(keywords[0]))

    # Music producer: first link in the element that follows a "Music by" header
    soup = _fetch_soup(url + "/fullcredits")
    music_producer = None
    full_credits_container = soup.find(id='fullcredits_content', class_='header')
    if full_credits_container is not None:
        full_credits = full_credits_container.find_all(recursive=False)
        # Pairing each child with its successor avoids indexing past the end
        # when the "Music by" header happens to be the last child.
        for item, following in zip(full_credits, full_credits[1:]):
            if 'Music by' not in item.text:
                continue
            producer_atag = following.find('a')
            if producer_atag is None:
                continue
            href_parts = (producer_atag.get('href') or '').split('/')
            if len(href_parts) > 2:
                music_producer = href_parts[2]
                break
    print('musicProducer', type(music_producer))

    # Append the whole row only after every field was obtained, so an exception
    # part-way through a title cannot leave the columns with unequal lengths.
    scraped_data['tconst'].append(tconst)
    scraped_data['stars'].append(stars)
    scraped_data['metascore'].append(metascore)
    scraped_data['oscarWins'].append(oscars)
    scraped_data['wins'].append(wins)
    scraped_data['nominations'].append(nominations)
    scraped_data['releaseDate'].append(release_date)
    scraped_data['releaseCountry'].append(release_country)
    scraped_data['budget'].append(budget)
    scraped_data['worldwideGross'].append(gross)
    scraped_data['plotKeywords'].append(keywords)
    scraped_data['musicProducer'].append(music_producer)

end = time.time()

print('Time taken: %f seconds' % (end - start) )

Processing 8 of 6274818. Id tt0000009
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 145 of 6274818. Id tt0000147
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'NoneType'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 332 of 6274818. Id tt0000335
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneTy

plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 877 of 6274818. Id tt0000886
stars <class 'list'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 882 of 6274818. Id tt0000891
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 931 of 6274818. Id tt0000941
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'NoneType'>
releaseCoun

plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1104 of 6274818. Id tt0001115
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1105 of 6274818. Id tt0001116
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1111 of 6274818. Id tt0001122
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <cl

plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1347 of 6274818. Id tt0001358
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1348 of 6274818. Id tt0001359
stars <class 'list'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1353 of 6274818. Id tt0001364
stars <class 'list'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'NoneType'>
releaseCountry <class 'NoneType'

plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1574 of 6274818. Id tt0001587
stars <class 'list'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'NoneType'>
releaseCountry <class 'NoneType'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1579 of 6274818. Id tt0001592
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'datetime.date'>
releaseCountry <class 'str'>
budget <class 'NoneType'>
worldwideGross <class 'NoneType'>
plotKeywords <class 'list'>
musicProducer <class 'NoneType'>
Processing 1589 of 6274818. Id tt0001602
stars <class 'list'>
starsitem <class 'str'>
metascore <class 'NoneType'>
oscarWins <class 'int'>
wins <class 'int'>
nominations <class 'int'>
releaseDate <class 'NoneType'>
releaseC

In [186]:

# One column per key of scraped_data, one row per scraped title.
movies_df = pd.DataFrame(scraped_data)
movies_df

Unnamed: 0,tconst,stars,oscarWins,nominations,wins,releaseDate,releaseCountry,plotKeywords,budget,worldwideGross,metascore,musicProducer
0,tt0004525,"[nm0260391, nm0617787, nm0993508]",0,0,0,1914-02-27,France,[],,,,nm2126992
1,tt0004528,[],0,0,0,1914-12-01,USA,[],,,,
2,tt0004532,"[nm0534259, nm0235791, nm0124189]",0,0,0,1914-09-26,USA,[],,,,
3,tt0004535,"[nm0574421, nm0041404, nm0687031]",0,0,0,1914-05-20,USA,[],,,,
4,tt0004537,"[nm0420232, nm0163491, nm0086816]",0,0,0,1914-11-09,USA,[],,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
100,tt0004786,"[nm0159654, nm0498797, nm0231657]",0,0,0,1914-09-18,Netherlands,[],,,,
101,tt0004789,[],0,0,0,,USA,[],,,,
102,tt0004792,"[nm0276578, nm0731939, nm0213049]",0,0,0,1914-10-22,USA,[],"$12,234",,,
103,tt0004794,"[nm0855036, nm0526226, nm0741852]",0,0,0,1914-10-26,USA,[],,,,
