# Scraping actors' awards

In [None]:
import re
import ast
import sys
import time
import requests
import traceback
import collections
import pandas as pd
import numpy as np
import os.path
import urllib.request
import concurrent.futures
from os import listdir
from stem import Signal
from requests import get
from bs4 import BeautifulSoup
from datetime import datetime
from os.path import isfile, join
from stem.control import Controller

# Keep a handle on the real stdout so it can be restored later with
# ``sys.stdout = old_stdout``.
old_stdout = sys.stdout

# Every print() issued after this point goes to the log file.  The encoding is
# pinned to UTF-8 so that non-ASCII text in logged messages cannot raise
# UnicodeEncodeError on platforms whose default encoding is not UTF-8.
log_file = open("actor_scrape.log", "w", encoding="utf-8")

sys.stdout = log_file

print("Initial test log")

# Root directory for the pickles written and read by this notebook.
pickles_save_dir = "./pickles"

In [None]:
def get_tor_session():
    """Return a new ``requests`` session that sends HTTP and HTTPS through Tor.

    The session is configured to use the SOCKS proxy on 127.0.0.1:9050.

    Returns:
        requests.Session: a fresh session with ``proxies`` set.
    """
    session = requests.Session()
    # Tor uses the 9050 port as the default socks port.
    # ``socks5h`` (rather than ``socks5``) makes the proxy resolve host names,
    # so DNS lookups are not performed locally outside of Tor.
    tor_proxy = 'socks5h://127.0.0.1:9050'
    session.proxies = {'http': tor_proxy, 'https': tor_proxy}
    return session

def renew_connection(port=9051, password="password"):
    """Send the NEWNYM signal to the local Tor control port.

    Args:
        port: Tor control port to connect to (defaults to 9051).
        password: control-port password used to authenticate.

    The function does not check that the exit IP actually changed; the
    scraping cells in this file verify that themselves after calling it.
    """
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=password)
        controller.signal(Signal.NEWNYM)

In [None]:
# Load the movie table from disk; its 'stars' column is consumed by the next
# cell.  (Presumably this pickle was produced by the movie-scraping notebook --
# TODO confirm.)
movies = pd.read_pickle("./pickles/complete_movies.pkl")
# Row count, shown as the cell output when run in a notebook.
len(movies)

In [None]:
# Keep only the movies whose 'stars' entry is not an empty list.  Only the
# 'stars' column is converted to str for the comparison; stringifying the
# whole DataFrame just to test one column is wasted work.
stars_nonempty = movies[movies['stars'].astype(str) != '[]']

# One row per (movie, star) pair -> one row per distinct star id, exposed under
# the column name 'nconst'.
stars = (
    stars_nonempty.explode('stars')[['stars']]
    .drop_duplicates(subset='stars', keep='first')
    .reset_index(drop=True)
    .rename(columns={"stars": "nconst"})
)
display(stars)

In [None]:
# 'actors' is the name the scraping cell reads; it is the same DataFrame
# object as 'stars' (no copy is made).
actors = stars

In [None]:
def _extra_rowspan_rows(cell):
    """Return how many *following* rows a table cell also covers (rowspan - 1).

    A missing or malformed ``rowspan`` attribute counts as a span of one row,
    i.e. zero extra rows.
    """
    try:
        return max(int(cell.get('rowspan', 1)) - 1, 0)
    except (TypeError, ValueError):
        return 0


def get_data(session, url):
    """Scrape one awards page and return one record per ``<tr>`` found on it.

    Args:
        session: object with a ``requests``-style ``get(url)`` method.
        url: awards URL; the second-to-last path component is taken as the
            person's ``nconst``.

    Returns:
        list[dict]: one dict per table row with the keys ``nconst``, ``year``,
        ``category``, ``w_n``, ``description``, ``movie`` and ``tconst``.
        Values that could not be extracted are ``None``.

    Raises:
        Whatever ``session.get`` / ``raise_for_status`` raise on a failed
        request; callers use that to schedule a retry.
    """
    nconst = url.rsplit('/', 2)[-2]

    req = session.get(url)
    req.raise_for_status()
    soup = BeautifulSoup(req.text, 'html.parser')

    # Beautiful Soup applies a compiled pattern with search(), so this single
    # pattern both selects the title link and extracts the id from its href.
    tconst_re = re.compile(r'tt\d{7,8}')

    result = []
    # Values of cells with rowspan > 1 are queued here so that the following
    # rows, which do not repeat the cell, still receive them.
    year_buf = []
    w_n_buf = []
    category_buf = []

    for award in soup.find_all('tr'):
        award_record = {
            'nconst': nconst,
            'year': None,
            'category': None,
            'w_n': None,
            'description': None,
            'movie': None,
            'tconst': None
        }

        # --- year ---------------------------------------------------------
        award_year_td = award.find('td', class_='award_year')
        if award_year_td is not None:
            year_link = award_year_td.find('a')
            if year_link is not None:
                year_txt = year_link.text
                try:
                    year = int(year_txt.replace("\n", "").strip())
                except ValueError:
                    print("Failed to parse int year {0}".format(year_txt))
                else:
                    if year:
                        award_record['year'] = year
                    # .get()-based lookup: a cell without a rowspan attribute
                    # must not be reported as a year-parsing failure.
                    year_buf.extend([year] * _extra_rowspan_rows(award_year_td))
        elif year_buf:
            buffed_year = year_buf.pop()
            if buffed_year:
                award_record['year'] = buffed_year

        # --- outcome (won / nominated) and category -------------------------
        award_outcome_td = award.find('td', class_='award_outcome')
        if award_outcome_td is not None:
            # Reset per row so that a missing element never buffers a value
            # left over from an earlier row (or an unbound name).
            award_cat = None
            w_n = None

            category_span = award_outcome_td.find('span', class_='award_category')
            if category_span is not None:
                award_cat = category_span.text
                if award_cat:
                    award_record['category'] = award_cat

            outcome_b = award_outcome_td.find('b')
            if outcome_b is not None:
                w_n = outcome_b.text.replace("\n", "").strip()
                if w_n:
                    award_record['w_n'] = w_n

            # A cell without rowspan simply spans no extra rows; indexing the
            # attribute directly would raise KeyError and fail the whole page.
            extra_rows = _extra_rowspan_rows(award_outcome_td)
            category_buf.extend([award_cat] * extra_rows)
            w_n_buf.extend([w_n] * extra_rows)
        else:
            if w_n_buf:
                buffed_w_n = w_n_buf.pop()
                if buffed_w_n:
                    award_record['w_n'] = buffed_w_n
            if category_buf:
                buffed_category = category_buf.pop()
                if buffed_category:
                    award_record['category'] = buffed_category

        # --- description and linked title -----------------------------------
        description_td = award.find('td', class_='award_description')
        if description_td is not None:
            # First text node that is a direct child of the cell.
            award_txt = description_td.find(text=True, recursive=False)
            if award_txt is not None:
                desc_txt = award_txt.replace("\n", "").strip()
                if desc_txt:
                    award_record['description'] = desc_txt

            award_info = description_td.find('a', href=tconst_re)
            if award_info is not None:
                tconst_res = tconst_re.search(award_info.get('href'))
                if tconst_res is not None:
                    award_record['tconst'] = tconst_res.group(0)
                    award_info_txt = award_info.text
                    if award_info_txt:
                        award_record['movie'] = award_info_txt

        result.append(award_record)

    return result

In [None]:
# Set the start index and end index. The IP is renewed before each chunk of
# 'step_size' URLs. After every chunk the result is written to a pickle, so if
# you need to stop the execution in the middle, note the index range of the
# last successfully written pickle file (from the printed logs) and use the
# remaining range for the start and end index to resume from where you stopped.
postfix = 'stars'
s_idx = 0
e_idx = len(actors)
step_size = 4000

print("Starting scraping...")

base_url = 'https://www.imdb.com/name/{0}/awards'
# Rows s_idx..e_idx (positional) of the 'nconst' column, one URL each.
urls = [base_url.format(nconst) for nconst in actors['nconst'].iloc[s_idx:e_idx]]

used_ips = []
failed_urls = []    # failed once; re-submitted with the next chunk
dropped_urls = []   # failed twice; given up

retry_counts = collections.defaultdict(int)

actor_pickle_save_dir = './pickles/actor'
os.makedirs(actor_pickle_save_dir, exist_ok=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    chunk_indeces = np.arange(0, len(urls), step_size)

    for idx, start in enumerate(chunk_indeces, start=1):
        start_t = time.time()
        finals = []

        # Renew the Tor identity until the reported IP is one that has not
        # been used for an earlier chunk.
        while True:
            renew_connection()
            session = get_tor_session()
            # httpbin answers with JSON, so parse it as JSON.
            new_ip = session.get("http://httpbin.org/ip").json()["origin"].split(",")[0]
            if new_ip not in used_ips:
                break
            print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
            time.sleep(5)
        used_ips.append(new_ip)

        # URLs that failed in the previous chunk ride along with this one.
        url_chunk = urls[start:start + step_size] + failed_urls
        failed_urls = []

        print("Submitting {0} URLs to the executor".format(len(url_chunk)))

        future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
        # as_completed() yields every future exactly once, so all work of the
        # chunk is finished when this loop ends.
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                finals += data
            except Exception as exc:
                stripped = url.strip("'")
                # Retry only once
                if retry_counts[stripped] == 0:
                    failed_urls.append(stripped)
                    retry_counts[stripped] = 1
                    print('%s generated an exception: %s. Added to retry list.' % (url, exc))
                else:
                    print('%s generated an exception: %s. Dropping since retry failed.' % (url, exc))
                    dropped_urls.append(stripped)

        end_t = time.time()
        print("chunk {0} of {1} completed with IP {2}. Took {3} seconds."
              .format(idx, len(chunk_indeces), new_ip, end_t - start_t))
        actors_df = pd.DataFrame(data=finals)
        pickle_path = "{4}/{0}_{1}-{2}_{3}.pkl"\
                        .format(idx, start, start + step_size, postfix, actor_pickle_save_dir)
        actors_df.to_pickle(pickle_path)
        print("Wrote pickle {0} with {1} rows".format(pickle_path, len(actors_df)))

    if len(dropped_urls) > 0:
        # These failed twice and are in no pickle; without this message the
        # run could report success while data is missing.
        print('\nDropped after a failed retry (not present in any pickle):')
        print(dropped_urls)

    if len(failed_urls) > 0:
        print('\n\n\n\n!!!**********************************!!!')
        print('Failed to retrieve the following URLs.\n')
        print(failed_urls)
        print('\n!!!**********************************!!!')
        print('Note: Run the following cell to retry the failed urls')
    else:
        print("\n\n\n\n!!!**********************************!!!\n")
        print("Scraping succeeded!")
        print('\n!!!**********************************!!!')
        print('''Note: Next, run the script that combine all pickles''')

In [None]:
# Run this cell only if there were any failed URLs at the end of the previous
# cell. Run it several times if you still see failures, until there are none.
# If some URLs don't succeed even after multiple attempts, it is probably a
# problem on the IMDb side and they can be ignored.
urls = failed_urls
# Defined unconditionally so the cell never hits an undefined name when there
# is nothing to retry.
retry_failed_urls = []

if len(urls) > 0:
    retry_pickle_dir = './pickles/actor'
    os.makedirs(retry_pickle_dir, exist_ok=True)
    # Distinguishes the pickles of this pass from those of earlier passes, so
    # re-running the cell never overwrites results that were already saved.
    retry_tag = datetime.now().strftime('%Y%m%d%H%M%S')

    with concurrent.futures.ThreadPoolExecutor(max_workers=150) as executor:
        chunk_indeces = np.arange(0, len(urls), step_size)

        for idx, start in enumerate(chunk_indeces, start=1):
            start_t = time.time()
            finals = []

            # get_tor_session() takes no arguments; renew the identity until
            # the reported IP has not been used before.
            while True:
                renew_connection()
                session = get_tor_session()
                new_ip = session.get("http://httpbin.org/ip").json()["origin"].split(",")[0]
                if new_ip not in used_ips:
                    break
                print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
                time.sleep(5)
            used_ips.append(new_ip)

            # Failures of the previous chunk are retried with this one.
            url_chunk = urls[start:start + step_size] + retry_failed_urls
            retry_failed_urls = []

            future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                    finals += data
                except Exception as exc:
                    retry_failed_urls.append(url.strip("'"))
                    print('%s generated an exception: %s. Added to retry list.' % (url, exc))

            end_t = time.time()
            print("chunk {0} of {1} completed with IP {2}. Took {3} seconds.".format(idx, len(chunk_indeces), new_ip, end_t - start_t))
            actors_df = pd.DataFrame(data=finals)
            if len(actors_df) > 0:
                pickle_path = "{4}/{0}_{1}-{2}_{3}_retry_{5}.pkl".format(
                    idx, start, start + step_size, postfix, retry_pickle_dir, retry_tag)
                actors_df.to_pickle(pickle_path)
                print("Wrote pickle {0} with {1} rows".format(pickle_path, len(actors_df)))

    # Only the URLs that failed again are left for the next run of this cell;
    # everything else is already stored in a pickle.
    failed_urls = retry_failed_urls

    if len(retry_failed_urls) > 0:
        print('\n\n\n\n!!!**********************************!!!')
        print('Retry failed with the following URLs.\n')
        print(retry_failed_urls)
        print('\n!!!**********************************!!!')
        print('Note: Run this script several times to see if all succeed.',
        'If some dont, it probably means they wont ever. so just give them up.')
    else:
        print("\n\n\n\n!!!**********************************!!!\n")
        print("Retry succeeded!")
        print('\n!!!**********************************!!!')
        print('''Note: Run the following script to combine all pickles''')

In [None]:
# Combine the created pickles into one DataFrame

dir_path = "./pickles/actor/"
# Ignore any system files starting with . or folders if any
pickles = [f for f in listdir(dir_path) if isfile(join(dir_path, f)) and not f.startswith('.')]

# NOTE: unpickling executes whatever the file contains; this is only
# appropriate because the files are the ones written by this notebook.
dfs = [pd.read_pickle(join(dir_path, name)) for name in pickles]
combined_df = pd.concat(dfs, ignore_index=True)
# The output name is fixed, so the cell does not need e_idx (or any other
# variable from the scraping cells) and can be run on its own.
combined_df.to_pickle("./pickles/complete_actors.pkl")
len(combined_df)

# Movie cast scraping

In [None]:
import re
import ast
import sys
import time
import requests
import traceback
import pandas as pd
import numpy as np
import os.path
import urllib.request
import concurrent.futures
from os import listdir
from stem import Signal
from requests import get
from bs4 import BeautifulSoup
from datetime import datetime
from os.path import isfile, join
from stem.control import Controller

# Root directory under which the cells of this notebook build their pickle
# paths.
pickles_save_dir = "./pickles"

In [None]:
def get_tor_session():
    """Return a new ``requests`` session that sends HTTP and HTTPS through Tor.

    The session is configured to use the SOCKS proxy on 127.0.0.1:9050.

    Returns:
        requests.Session: a fresh session with ``proxies`` set.
    """
    session = requests.Session()
    # Tor uses the 9050 port as the default socks port.
    # ``socks5h`` (rather than ``socks5``) makes the proxy resolve host names,
    # so DNS lookups are not performed locally outside of Tor.
    tor_proxy = 'socks5h://127.0.0.1:9050'
    session.proxies = {'http': tor_proxy, 'https': tor_proxy}
    return session

def renew_connection(port=9051, password="password"):
    """Send the NEWNYM signal to the local Tor control port.

    Args:
        port: Tor control port to connect to (defaults to 9051).
        password: control-port password used to authenticate.

    The function does not check that the exit IP actually changed; the
    scraping cells in this file verify that themselves after calling it.
    """
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=password)
        controller.signal(Signal.NEWNYM)

In [None]:
# Use the cached movie subset when it is on disk; otherwise derive it from the
# title.basics dump (rows whose titleType is 'movie') and cache it.
if not os.path.exists('./pickles/filtered_movies.pkl'):
    title_basics = pd.read_csv("data/title.basics.tsv.gz", sep='\t')
    is_movie = title_basics.titleType == 'movie'
    movies = title_basics[is_movie]
    if not os.path.exists(pickles_save_dir):
        os.makedirs(pickles_save_dir)
    movies.to_pickle("{0}/filtered_movies.pkl".format(pickles_save_dir))
else:
    movies = pd.read_pickle("./pickles/filtered_movies.pkl")
movies.head()

In [None]:
def get_data(session, url):
    """Scrape the cast of one title from its full-credits page.

    Args:
        session: object with a ``requests``-style ``get(url)`` method.
        url: full-credits URL; the second-to-last path component is taken as
            the title's ``tconst``.

    Returns:
        dict: ``{"tconst": str, "cast": list[str], "count": int}`` where
        ``cast`` holds the distinct ``nm…`` ids linked from the
        ``cast_list`` table in order of first appearance (empty when the page
        has no such table) and ``count`` is its length.

    Raises:
        Whatever ``session.get`` / ``raise_for_status`` raise on a failed
        request; callers use that to schedule a retry.
    """
    r = session.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    tconst = url.rsplit('/', 2)[-2]

    cast = []
    table = soup.find('table', class_='cast_list')
    if table is not None:
        # href=True skips anchors that have no href attribute; passing their
        # missing href (None) to re.search() would raise TypeError.
        for atag in table.find_all('a', href=True):
            nconst_res = re.search(r'nm\d{7,8}', atag['href'])
            if nconst_res is not None:
                cast.append(nconst_res.group(0))

    # Drop duplicates while keeping the order of first appearance.
    cast = list(dict.fromkeys(cast))
    return {
        "tconst": tconst,
        "cast": cast,
        "count": len(cast),
    }

In [None]:
# Set the start index and end index. The IP is renewed before each chunk of
# 'step_size' URLs. After every chunk the result is written to a pickle, so if
# you need to stop the execution in the middle, note the index range of the
# last successfully written pickle file (from the printed logs) and use the
# remaining range for the start and end index to resume from where you stopped.
postfix = 'tor'
s_idx = 0
e_idx = len(movies)
step_size = 4000

base_url = 'https://www.imdb.com/title/{0}/fullcredits'
# Rows s_idx..e_idx (positional) of the 'tconst' column, one URL each.
urls = [base_url.format(tconst) for tconst in movies['tconst'].iloc[s_idx:e_idx]]

used_ips = []
failed_urls = []

cast_pickles_savedir = "{0}/castretries/nullrecheck".format(pickles_save_dir)
os.makedirs(cast_pickles_savedir, exist_ok=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    chunk_indeces = np.arange(0, len(urls), step_size)

    for idx, start in enumerate(chunk_indeces, start=1):
        start_t = time.time()
        finals = []

        # Renew the Tor identity until the reported IP is one that has not
        # been used for an earlier chunk.
        while True:
            renew_connection()
            session = get_tor_session()
            # httpbin answers with JSON, so parse it as JSON.
            new_ip = session.get("http://httpbin.org/ip").json()["origin"].split(",")[0]
            if new_ip not in used_ips:
                break
            print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
            time.sleep(5)
        used_ips.append(new_ip)

        # URLs that failed in the previous chunk ride along with this one.
        url_chunk = urls[start:start + step_size] + failed_urls
        failed_urls = []

        future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
        # as_completed() yields every future exactly once, so all work of the
        # chunk is finished when this loop ends.
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                finals.append(data)
            except Exception as exc:
                failed_urls.append(url.strip("'"))
                print('%s generated an exception: %s. Added to retry list.' % (url, exc))
                traceback.print_exc()

        end_t = time.time()
        print("chunk {0} of {1} completed with IP {2}. Took {3} seconds."
              .format(idx, len(chunk_indeces), new_ip, end_t - start_t))
        cast_df = pd.DataFrame(data=finals)
        pickle_path = "{4}/{0}_{1}-{2}_{3}.pkl".format(idx, start, start + step_size, postfix, cast_pickles_savedir)
        cast_df.to_pickle(pickle_path)
        print("Wrote pickle {0} with {1} rows".format(pickle_path, len(cast_df)))

if len(failed_urls) > 0:
    print('\n\n\n\n!!!**********************************!!!')
    print('Failed to retrieve the following URLs.\n')
    print(failed_urls)
    print('\n!!!**********************************!!!')
    print('Note: Run the following cell to retry the failed urls')
else:
    print("\n\n\n\n!!!**********************************!!!\n")
    print("Scraping succeeded!")
    print('\n!!!**********************************!!!')
    print('''Note: Next, run the script that combine all pickles''')

In [None]:
# Run this cell only if there were any failed URLs at the end of the previous
# cell. Run it several times if you still see failures, until there are none.
# If some URLs don't succeed even after multiple attempts, it is probably a
# problem on the IMDb side and they can be ignored.
urls = failed_urls

if len(urls) > 0:
    retry_failed_urls = []

    cast_pickles_savedir = "{0}/castretries/nullrecheck".format(pickles_save_dir)
    os.makedirs(cast_pickles_savedir, exist_ok=True)
    # Distinguishes the pickles of this pass from those of earlier passes, so
    # re-running the cell never overwrites results that were already saved.
    retry_tag = datetime.now().strftime('%Y%m%d%H%M%S')

    with concurrent.futures.ThreadPoolExecutor(max_workers=150) as executor:
        chunk_indeces = np.arange(0, len(urls), step_size)

        for idx, start in enumerate(chunk_indeces, start=1):
            start_t = time.time()
            finals = []

            # Renew the Tor identity until the reported IP has not been used
            # before.
            while True:
                renew_connection()
                session = get_tor_session()
                new_ip = session.get("http://httpbin.org/ip").json()["origin"].split(",")[0]
                if new_ip not in used_ips:
                    break
                print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
                time.sleep(5)
            used_ips.append(new_ip)

            # Failures of the previous chunk are retried with this one.
            url_chunk = urls[start:start + step_size] + retry_failed_urls
            retry_failed_urls = []

            future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                    finals.append(data)
                except Exception as exc:
                    retry_failed_urls.append(url.strip("'"))
                    print('%s generated an exception: %s. Added to retry list.' % (url, exc))

            end_t = time.time()
            print("chunk {0} of {1} completed with IP {2}. Took {3} seconds.".format(idx, len(chunk_indeces), new_ip, end_t - start_t))
            cast_df = pd.DataFrame(data=finals)
            if len(cast_df) > 0:
                pickle_path = "{4}/{0}_{1}-{2}_{3}_retry_{5}.pkl".format(
                    idx, start, start + step_size, postfix, cast_pickles_savedir, retry_tag)
                cast_df.to_pickle(pickle_path)
                print("Wrote pickle {0} with {1} rows".format(pickle_path, len(cast_df)))

    # Only the URLs that failed again are left for the next run of this cell;
    # everything else is already stored in a pickle.
    failed_urls = retry_failed_urls

    if len(retry_failed_urls) > 0:
        print('\n\n\n\n!!!**********************************!!!')
        print('Retry failed with the following URLs.\n')
        print(retry_failed_urls)
        print('\n!!!**********************************!!!')
        print('Note: Run this script several times to see if all succeed.',
        'If some dont, it probably means they wont ever. so just give them up.')
    else:
        print("\n\n\n\n!!!**********************************!!!\n")
        print("Retry succeeded!")
        print('\n!!!**********************************!!!')
        print('''Note: Run the following script to combine all pickles''')

In [None]:
# Merge every per-chunk pickle of the cast scrape into a single DataFrame and
# store it under a name that carries e_idx.

dir_path = "{0}/casttor".format(pickles_save_dir)
# NOTE(review): the scraping cells of this notebook write to
# "<pickles_save_dir>/castretries/nullrecheck", not "casttor" -- confirm this
# is the directory that should be combined.

# Regular files only; hidden/system entries (leading '.') and folders are
# skipped.
pickles = []
for entry in listdir(dir_path):
    if entry.startswith('.'):
        continue
    if isfile(join(dir_path, entry)):
        pickles.append(entry)

dfs = [pd.read_pickle("{0}/{1}".format(dir_path, name)) for name in pickles]
combined_df = pd.concat(dfs).reset_index(drop=True)
combined_df.to_pickle("./pickles/complete_cast{0}.pkl".format(e_idx))

# Raise the display limits so the combined table is rendered in full.
with pd.option_context('display.max_rows', 50000, 'display.max_columns', 20):
    display(combined_df)


# Movie scraping

In [None]:
import re
import ast
import time
import requests
import pandas as pd
import numpy as np
import os.path
import urllib.request
import concurrent.futures
from os import listdir
from stem import Signal
from requests import get
from bs4 import BeautifulSoup
from datetime import datetime
from os.path import isfile, join
from stem.control import Controller

In [None]:
def get_tor_session():
    """Return a new ``requests`` session that sends HTTP and HTTPS through Tor.

    The session is configured to use the SOCKS proxy on 127.0.0.1:9050.

    Returns:
        requests.Session: a fresh session with ``proxies`` set.
    """
    session = requests.Session()
    # Tor uses the 9050 port as the default socks port.
    # ``socks5h`` (rather than ``socks5``) makes the proxy resolve host names,
    # so DNS lookups are not performed locally outside of Tor.
    tor_proxy = 'socks5h://127.0.0.1:9050'
    session.proxies = {'http': tor_proxy, 'https': tor_proxy}
    return session

def renew_connection(port=9051, password="password"):
    """Send the NEWNYM signal to the local Tor control port.

    Args:
        port: Tor control port to connect to (defaults to 9051).
        password: control-port password used to authenticate.

    The function does not check that the exit IP actually changed; the
    scraping cells in this file verify that themselves after calling it.
    """
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=password)
        controller.signal(Signal.NEWNYM)

In [None]:
# Use the cached movie subset when it is on disk; otherwise derive it from the
# title.basics dump (rows whose titleType is 'movie') and cache it.
# The cache is written to the same path it is looked up at, so the expensive
# TSV parse really is skipped on the next run.
filtered_movies_path = "./pickles/filtered_movies.pkl"
if os.path.exists(filtered_movies_path):
    movies = pd.read_pickle(filtered_movies_path)
else:
    title_basics = pd.read_csv("data/title.basics.tsv", sep='\t')
    movies = title_basics[title_basics.titleType == 'movie']
    os.makedirs(os.path.dirname(filtered_movies_path), exist_ok=True)
    movies.to_pickle(filtered_movies_path)
movies.head()

In [None]:
def _h4_sibling_text(soup, heading):
    """Return the whitespace-collapsed text next to the <h4> titled *heading*.

    Only text nodes that are direct children of the heading's parent are
    used.  Returns None when the page has no such <h4>.
    """
    h4 = soup.find('h4', string=heading)
    if h4 is None:
        return None
    raw_text = h4.parent.findAll(text=True, recursive=False)
    return re.sub(' +', ' ', ''.join(raw_text).replace("\n", "").strip())


def get_data(session, url):
    """Scrape one title page plus its ``/keywords`` and ``/fullcredits`` pages.

    Args:
        session: object with a ``requests``-style ``get(url)`` method; it is
            used for all three requests.
        url: title URL; its last path component is taken as the ``tconst``.

    Returns:
        dict[str, list]: every key (``tconst``, ``stars``, ``oscarWins``,
        ``nominations``, ``wins``, ``releaseDate``, ``releaseCountry``,
        ``plotKeywords``, ``budget``, ``worldwideGross``, ``metascore``,
        ``musicProducer``) maps to a one-element list holding the value
        scraped for this title.

    An HTTP error status on the title page is printed and the response body is
    still parsed, so such a title yields a mostly empty record instead of an
    exception.  Other request errors propagate to the caller.
    """
    try:
        r = session.get(url)
        r.raise_for_status()
    except requests.exceptions.HTTPError as err:
        # raise_for_status() only fires after `r` is bound, so parsing below
        # still has a response to work with.
        print(err)
    soup = BeautifulSoup(r.text, 'html.parser')

    # tconst
    tconst = url.rsplit('/', 1)[-1]

    # Stars: the block is titled 'Stars:' or, failing that, 'Star:'.
    stars = []
    stars_h4 = soup.find('h4', string='Stars:')
    if stars_h4 is None:
        stars_h4 = soup.find('h4', string='Star:')
    if stars_h4 is not None and stars_h4.parent is not None:
        # href=True skips anchors without an href instead of raising KeyError.
        for atag in stars_h4.parent.find_all('a', href=True):
            if atag['href'].startswith('/name/'):
                stars.append(atag['href'].split('/')[2])

    # Metascore
    metascore = None
    metascore_list = soup.select('.metacriticScore span:first-child')
    if len(metascore_list) > 0:
        metascore = metascore_list[0].string
    if metascore is not None:
        metascore = str(metascore)

    # Awards.  The trailing 's' is optional in every pattern so that singular
    # wording ("1 Oscar", "1 win", "1 nomination") is counted as well.
    oscars = 0
    wins = 0
    nominations = 0
    for line in soup.find_all(class_="awards-blurb"):
        first_child = line.findChild()
        if first_child is not None:
            prepped_str = re.sub(' +', ' ', first_child.text.replace("\n", " ").strip())
            res = re.search(r'[Ww]on (\d+) [Oo]scars?', prepped_str)
            if res is not None:
                oscars = int(res.group(1))
        else:
            prepped_str = re.sub(' +', ' ', line.text.replace("\n", "").strip())

            res = re.search(r'(\d+) wins?', prepped_str)
            if res is not None:
                wins = int(res.group(1))

            res = re.search(r'(\d+) nominations?', prepped_str)
            if res is not None:
                nominations = int(res.group(1))

    # Release date and country
    release_date = None
    release_country = None
    release_text = _h4_sibling_text(soup, 'Release Date:')
    if release_text is not None:
        date_str_match = re.search(r'\d{1,2} \w+ \d{4}', release_text)
        if date_str_match is not None:
            try:
                release_date = datetime.strptime(date_str_match.group(), '%d %B %Y').date()
            except ValueError:
                # The middle word was not a full month name; leave the date
                # empty rather than failing the whole title.
                release_date = None
        release_country_match = re.search(r'\(([a-zA-Z ]{2,})\)', release_text)
        if release_country_match is not None:
            release_country = release_country_match.group(1)

    # Budget and worldwide gross (kept as the raw text found on the page)
    budget = _h4_sibling_text(soup, 'Budget:')
    gross = _h4_sibling_text(soup, 'Cumulative Worldwide Gross:')

    # Plot keywords
    keywords_verification_threshold = 2  # Consider only words at least 2 people considered relevant
    # Fetched through the same session as the title page, so this request uses
    # the session's proxy settings too.
    r = session.get(url + "/keywords")
    keywords_soup = BeautifulSoup(r.text, 'html.parser')
    keywords = []
    for plot_keywords_item in keywords_soup.find_all(class_="soda sodavote"):
        count_el = plot_keywords_item.find(class_='interesting-count-text')
        text_el = plot_keywords_item.find(class_='sodatext')
        # Skip items that lack the expected markup instead of failing on them.
        if count_el is None or count_el.a is None or text_el is None or text_el.a is None:
            continue
        validity_text_match = re.search(r'(\d+) of', count_el.a.text.strip())
        if validity_text_match is None:
            continue
        if int(validity_text_match.group(1)) >= keywords_verification_threshold:
            keywords.append(text_el.a.text.strip())

    # Music producer: first link of the element that follows a 'Music by'
    # element among the direct children of the credits container.
    r = session.get(url + "/fullcredits")
    credits_soup = BeautifulSoup(r.text, 'html.parser')
    music_producer = None
    full_credits_container = credits_soup.find(id='fullcredits_content', class_='header')
    if full_credits_container is not None:
        full_credits = full_credits_container.find_all(recursive=False)
        # Pairing each element with its successor cannot run past the end of
        # the list when 'Music by' happens to be the last element.
        for item, following in zip(full_credits, full_credits[1:]):
            if 'Music by' not in item.text:
                continue
            producer_atag = following.find('a')
            if producer_atag is None:
                continue
            producer_href = producer_atag.get('href')
            if producer_href is None:
                continue
            href_parts = producer_href.split('/')
            if len(href_parts) > 2:
                music_producer = href_parts[2]
                break

    return {
        "tconst": [tconst],
        "stars": [stars],
        "oscarWins": [oscars],
        "nominations": [nominations],
        "wins": [wins],
        "releaseDate": [release_date],
        "releaseCountry": [release_country],
        "plotKeywords": [keywords],
        "budget": [budget],
        "worldwideGross": [gross],
        "metascore": [metascore],
        "musicProducer": [music_producer],
    }

In [None]:
# Set the start index and end index. The IP is renewed before each chunk of
# 'step_size' URLs. After every chunk the result is written to a pickle, so if
# you need to stop the execution in the middle, note the index range of the
# last successfully written pickle file (from the printed logs) and use the
# remaining range for the start and end index to resume from where you stopped.
postfix = 'tor'
s_idx = 0
# NOTE(review): hard-coded, presumably the row count of `movies` at the time
# of writing; it also ends up in the name of the combined pickle -- confirm
# before changing.
e_idx = 178118
step_size = 4000

base_url = 'https://www.imdb.com/title/'
# Rows s_idx..e_idx (positional) of the 'tconst' column, one URL each.
urls = [base_url + tconst for tconst in movies['tconst'].iloc[s_idx:e_idx]]

used_ips = []
failed_urls = []

# Columns of the per-chunk DataFrame; get_data() returns one value per column.
result_columns = [
    "tconst",
    "stars",
    "oscarWins",
    "nominations",
    "wins",
    "releaseDate",
    "releaseCountry",
    "plotKeywords",
    "budget",
    "worldwideGross",
    "metascore",
    "musicProducer",
]

# to_pickle() does not create missing directories.
movie_pickle_save_dir = "./pickles/tor"
os.makedirs(movie_pickle_save_dir, exist_ok=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=150) as executor:
    chunk_indeces = np.arange(0, len(urls), step_size)

    for idx, start in enumerate(chunk_indeces, start=1):
        start_t = time.time()
        finals = {column: [] for column in result_columns}

        # Renew the Tor identity until the reported IP is one that has not
        # been used for an earlier chunk.
        while True:
            renew_connection()
            session = get_tor_session()
            # httpbin answers with JSON, so parse it as JSON.
            new_ip = session.get("http://httpbin.org/ip").json()["origin"].split(",")[0]
            if new_ip not in used_ips:
                break
            print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
            time.sleep(5)
        used_ips.append(new_ip)

        # URLs that failed in the previous chunk ride along with this one.
        url_chunk = urls[start:start + step_size] + failed_urls
        failed_urls = []

        future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
        # as_completed() yields every future exactly once, so all work of the
        # chunk is finished when this loop ends.
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                for k, v in data.items():
                    finals[k].append(v[0])
            except Exception as exc:
                failed_urls.append(url.strip("'"))
                print('%s generated an exception: %s. Added to retry list.' % (url, exc))

        end_t = time.time()
        print("chunk {0} of {1} completed with IP {2}. Took {3} seconds.".format(idx, len(chunk_indeces), new_ip, end_t - start_t))
        movies_df = pd.DataFrame(data=finals)
        pickle_path = "{4}/{0}_{1}-{2}_{3}.pkl".format(idx, start, start + step_size, postfix, movie_pickle_save_dir)
        movies_df.to_pickle(pickle_path)
        print("Wrote pickle {0} with {1} rows".format(pickle_path, len(movies_df)))

# Failures of the last chunk have no following chunk to be retried with;
# report them instead of dropping them silently.
if len(failed_urls) > 0:
    print('\n\n\n\n!!!**********************************!!!')
    print('Failed to retrieve the following URLs.\n')
    print(failed_urls)
    print('\n!!!**********************************!!!')

In [None]:
# Merge every per-chunk pickle of the movie scrape into a single DataFrame and
# store it under a name that carries e_idx.

dir_path = "./pickles/tor"

# Regular files only; hidden/system entries (leading '.') and folders are
# skipped.
pickles = []
for entry in listdir(dir_path):
    if entry.startswith('.'):
        continue
    if isfile(join(dir_path, entry)):
        pickles.append(entry)

dfs = [pd.read_pickle("{0}/{1}".format(dir_path, name)) for name in pickles]
combined_df = pd.concat(dfs).reset_index(drop=True)
combined_df.to_pickle("./pickles/complete_{0}.pkl".format(e_idx))
len(combined_df)