In [1]:
import re
import ast
import time
import requests
import pandas as pd
import numpy as np
import os.path
import urllib.request
import concurrent.futures
from os import listdir
from stem import Signal
from requests import get
from bs4 import BeautifulSoup
from datetime import datetime
from os.path import isfile, join
from stem.control import Controller


In [2]:
# Load the previously pickled movie table and preview its first rows.
complete_movies_pickle = "./pickles/complete_178118.pkl"
movie_subset = pd.read_pickle(complete_movies_pickle)
movie_subset.head()

Unnamed: 0,tconst,stars,oscarWins,nominations,wins,releaseDate,releaseCountry,plotKeywords,budget,worldwideGross,metascore,musicProducer
0,tt0014799,"[nm0265550, nm0370407, nm0550195]",0,0,0,1924-05-31,UK,[],,,,
1,tt0014843,"[nm0427659, nm0107574, nm0421138]",0,0,0,1924-08-24,USA,[],,,,
2,tt0014809,"[nm0267916, nm0119572, nm0055809]",0,0,0,1924-04-08,USA,[],,,,
3,tt0014751,"[nm0403710, nm0744408]",0,0,0,,,[],,,,
4,tt0014812,"[nm0556953, nm0531962, nm0645941]",0,0,0,1924-12-28,USA,[],,,,


In [3]:
# One row per (movie, star) pair: keep only the two columns of interest,
# then unpack each movie's list of stars into separate rows.
movie_star_lists = movie_subset[['tconst', 'stars']]
stars_exploded = movie_star_lists.explode('stars')
stars_exploded.head()

Unnamed: 0,tconst,stars
0,tt0014799,nm0265550
0,tt0014799,nm0370407
0,tt0014799,nm0550195
1,tt0014843,nm0427659
1,tt0014843,nm0107574


In [4]:
# Number of distinct star ids across the whole movie subset.
stars_exploded['stars'].nunique()

138245

In [5]:
# Count how many movies of the subset each star appears in, most prolific first.
# The result is indexed by the star id (renamed to 'nconst') with a single
# 'count' column.
stars_counts = (
    stars_exploded
    .groupby('stars')
    .count()
    .rename(columns={"tconst": "count"})
    .sort_values(by='count', ascending=False)
    .rename_axis('nconst')
)

In [6]:
# Show the bottom 87400 rows of the table (it is sorted by descending count),
# i.e. roughly where the stars with a single movie begin.
stars_counts.iloc[-87400:]

Unnamed: 0_level_0,count
nconst,Unnamed: 1_level_1
nm0323379,2
nm0098627,2
nm0073596,2
nm1054406,1
nm1030940,1
...,...
nm0670816,1
nm0163019,1
nm0670779,1
nm0670771,1


Close to 90,000 stars have starred in only one movie in the chosen movie subset. To reduce the number of actors that need to be scraped, for now we consider only the actors who have starred in more than one movie in the chosen movie subset.


In [7]:
# Sanity-check the counts against a few well-known actors.
spot_checks = [
    ("Jessica Alba: {0} movies", 'nm0004695'),
    ("Billy Boyd (Pippin from LOTR): {0} movies", 'nm0101710'),
    ("Will Smith: {0} movies", 'nm0000226'),
    ("Morgan Freeman: {0} movies", 'nm0000151'),
]
for message, star_id in spot_checks:
    movie_count = stars_counts.at[star_id, 'count']
    print(message.format(movie_count))


Jessica Alba: 4 movies
Billy Boyd (Pippin from LOTR): 1 movies
Will Smith: 12 movies
Morgan Freeman: 26 movies


In [50]:
# Keep only the stars that appear in more than one movie of the subset,
# then report how many remain.
appears_more_than_once = stars_counts['count'].gt(1)
stars_subset = stars_counts.loc[appears_more_than_once]
stars_subset.shape[0]

50848

### Scraping

In [39]:
def get_tor_session(proxy_url='socks5://127.0.0.1:9050'):
    """Create a ``requests`` session whose HTTP and HTTPS traffic goes through a SOCKS proxy.

    Parameters
    ----------
    proxy_url : str, optional
        Proxy URL used for both schemes. Defaults to a local SOCKS5 proxy on
        port 9050, which the original code identifies as Tor's default SOCKS
        port.

    Returns
    -------
    requests.Session
        A new session with ``proxies`` configured.
    """
    # NOTE(review): with the ``socks5://`` scheme requests resolves host names
    # locally; pass a ``socks5h://`` URL if DNS lookups should also go through
    # the proxy.
    session = requests.Session()
    session.proxies = {'http': proxy_url, 'https': proxy_url}
    return session

def renew_connection(port=9051, password=None):
    """Send the ``NEWNYM`` signal to the local Tor control port.

    Parameters
    ----------
    port : int, optional
        Control port to connect to (default 9051).
    password : str or None, optional
        Control-port password. When ``None`` it is read from the
        ``TOR_CONTROL_PASSWORD`` environment variable, falling back to the
        literal ``"password"`` that was previously hard-coded here.
    """
    if password is None:
        # Prefer the environment so the real credential does not have to live
        # in the notebook; the fallback keeps existing setups working.
        password = os.environ.get("TOR_CONTROL_PASSWORD", "password")

    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=password)
        controller.signal(Signal.NEWNYM)

# Matches a title id such as "tt0111161". At least 7 digits, with no upper
# bound, so that longer ids are captured whole instead of being truncated.
_TCONST_RE = re.compile(r'tt\d{7,}')


def _parse_rowspan(cell):
    """Return ``cell``'s ``rowspan`` attribute as an int, or 1 if it is absent or malformed."""
    try:
        return int(cell.get('rowspan', 1))
    except (TypeError, ValueError):
        return 1


def get_data(session, url):
    """Download one awards page and turn every ``<tr>`` into an award record.

    Parameters
    ----------
    session : requests.Session
        Session used to fetch ``url``.
    url : str
        Awards page URL; the second-to-last path segment is taken as the
        person id (``nconst``).

    Returns
    -------
    list of dict
        One dict per ``<tr>`` found on the page, with keys ``nconst``,
        ``year``, ``category``, ``w_n``, ``description``, ``movie`` and
        ``tconst``. Fields that could not be extracted stay ``None``. The list
        is empty when the page has no table rows.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` on a non-success response.
    """
    nconst = url.rsplit('/', 2)[-2]
    result = []

    with session.get(url) as req:
        req.raise_for_status()
        soup = BeautifulSoup(req.text, 'html.parser')

    # A cell with rowspan=N also applies to the following N-1 rows, which do
    # not repeat it. These buffers carry the value forward to those rows.
    year_buf = []
    w_n_buf = []
    category_buf = []

    for award in soup.find_all('tr'):
        award_record = {
            'nconst': nconst,
            'year': None,
            'category': None,
            'w_n': None,
            'description': None,
            'movie': None,
            'tconst': None
        }

        # --- year -----------------------------------------------------
        award_year_td = award.find('td', class_='award_year')
        if award_year_td is not None:
            year_link = award_year_td.find('a')
            if year_link is not None:
                year_txt = year_link.text
                try:
                    year = int(year_txt.replace("\n", "").strip())
                except ValueError:
                    print("Failed to parse int year {0}".format(year_txt))
                else:
                    award_record['year'] = year
                    # A missing rowspan counts as 1, so nothing is buffered.
                    year_buf.extend([year] * (_parse_rowspan(award_year_td) - 1))
        elif year_buf:
            award_record['year'] = year_buf.pop()

        # --- outcome (winner/nominee) and category ---------------------
        award_outcome_td = award.find('td', class_='award_outcome')
        if award_outcome_td is not None:
            # Start from None on every row so a missing span/b never reuses a
            # value left over from a previous row.
            award_cat = None
            w_n = None

            category_span = award_outcome_td.find('span', class_='award_category')
            if category_span is not None:
                award_cat = category_span.text
                award_record['category'] = award_cat

            outcome_b = award_outcome_td.find('b')
            if outcome_b is not None:
                w_n = outcome_b.text.replace("\n", "").strip()
                award_record['w_n'] = w_n

            repeats = _parse_rowspan(award_outcome_td) - 1
            category_buf.extend([award_cat] * repeats)
            w_n_buf.extend([w_n] * repeats)
        else:
            if w_n_buf:
                award_record['w_n'] = w_n_buf.pop()
            if category_buf:
                award_record['category'] = category_buf.pop()

        # --- description and linked title ------------------------------
        award_description_td = award.find('td', class_='award_description')
        if award_description_td is not None:
            # Only the cell's own first text node, not text of nested tags.
            award_txt = award_description_td.find(text=True, recursive=False)
            award_info = award_description_td.find('a', href=_TCONST_RE)
            if award_txt is not None:
                award_record['description'] = award_txt.replace("\n", "").strip()

            if award_info is not None:
                tconst_res = _TCONST_RE.search(award_info.get('href'))
                if tconst_res is not None:
                    award_record['tconst'] = tconst_res.group(0)
                    award_record['movie'] = award_info.text

        result.append(award_record)

    return result

In [40]:
# Set the start index and end index. The IP is renewed before each chunk of
# 'step_size' urls. After every chunk, the result is written to a pickle. So if
# you need to stop the execution in the middle, note the index range of the
# last successfully written pickle file (from the printed logs) and use the
# remaining range for start and end index to resume from where you stopped.
# Pickle names carry offsets into `stars_subset` (i.e. they include s_idx), so
# a resumed run does not overwrite the files of the earlier run.
# Note: urls that failed in a chunk are retried with the next chunk and are
# therefore stored in the next chunk's pickle.
postfix = 'actors_subset'
s_idx = 0
e_idx = len(stars_subset)
# e_idx = 2000
step_size = 2000
pickles_save_dir = './pickles/{0}'.format(postfix)
ip_check_url = "http://httpbin.org/ip"

base_url = 'https://www.imdb.com/name/{0}/awards'
urls = [base_url.format(nconst) for nconst in stars_subset.index[s_idx:e_idx]]

used_ips = []
failed_urls = []

os.makedirs(pickles_save_dir, exist_ok=True)


def _current_exit_ip(session):
    """Return the first address reported in the 'origin' field of the IP-check service."""
    return session.get(ip_check_url).json()["origin"].split(",")[0]


chunk_starts = range(0, len(urls), step_size)

for idx, start in enumerate(chunk_starts, start=1):
    start_t = time.time()
    finals = []

    # Ask for a new connection and keep asking until the reported IP is one
    # that has not been used by an earlier chunk.
    renew_connection()
    session = get_tor_session()
    new_ip = _current_exit_ip(session)
    while new_ip in used_ips:
        print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
        time.sleep(5)
        renew_connection()
        session = get_tor_session()
        new_ip = _current_exit_ip(session)
    used_ips.append(new_ip)

    # Urls that failed in the previous chunk get another attempt here.
    url_chunk = urls[start:start + step_size] + failed_urls
    failed_urls = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
        # as_completed() yields every future exactly once, so when this loop
        # ends all downloads of the chunk have finished.
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                finals += future.result()
            except Exception as exc:
                failed_urls.append(url)
                print('%s generated an exception: %s. Added to retry list.' % (url, exc))

    end_t = time.time()
    print("chunk {0} of {1} completed with IP {2}. Took {3} seconds."\
          .format(idx, len(chunk_starts), new_ip, end_t - start_t))

    actors_df = pd.DataFrame(data=finals)
    # Offsets are relative to stars_subset, not to the sliced `urls` list.
    abs_start = s_idx + start
    pickle_path = "{4}/{0}_{1}-{2}_{3}.pkl"\
                    .format(idx, abs_start, abs_start + step_size, postfix, pickles_save_dir)
    actors_df.to_pickle(pickle_path)
    print("Wrote pickle {0} with {1} rows".format(pickle_path, len(actors_df)))

if len(failed_urls) > 0:
    print('\n\n\n\n!!!**********************************!!!')
    print('Failed to retrieve the following URLs.\n')
    print(failed_urls)
    print('\n!!!**********************************!!!')
    print('Note: Run the following cell to retry the failed urls')
else:
    print("\n\n\n\n!!!**********************************!!!\n")
    print("Scraping succeeded!")
    print('\n!!!**********************************!!!')
    print('''Note: Next, run the script that combine all pickles''')

chunk 1 of 26 completed with IP 185.220.101.57. Took 144.4004831314087 seconds.
Wrote pickle ./pickles/actors_subset/1_0-2000_actors_subset.pkl with 18075 rows
chunk 2 of 26 completed with IP 212.21.66.6. Took 148.19668579101562 seconds.
Wrote pickle ./pickles/actors_subset/2_2000-4000_actors_subset.pkl with 16817 rows
chunk 3 of 26 completed with IP 192.42.116.18. Took 222.7056758403778 seconds.
Wrote pickle ./pickles/actors_subset/3_4000-6000_actors_subset.pkl with 12199 rows
chunk 4 of 26 completed with IP 212.47.226.52. Took 119.3439691066742 seconds.
Wrote pickle ./pickles/actors_subset/4_6000-8000_actors_subset.pkl with 11179 rows
https://www.imdb.com/name/nm0354913/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0354913/awards. Added to retry list.
https://www.imdb.com/name/nm0468130/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0468130/awards. Added to retry lis

https://www.imdb.com/name/nm0275928/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0275928/awards. Added to retry list.
https://www.imdb.com/name/nm0904962/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0904962/awards. Added to retry list.
https://www.imdb.com/name/nm0653941/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0653941/awards. Added to retry list.
https://www.imdb.com/name/nm0619053/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0619053/awards. Added to retry list.
https://www.imdb.com/name/nm0586129/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0586129/awards. Added to retry list.
https://www.imdb.com/name/nm0470514/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0865411/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0865411/awards. Added to retry list.
https://www.imdb.com/name/nm0064957/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0064957/awards. Added to retry list.
https://www.imdb.com/name/nm0190827/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0190827/awards. Added to retry list.
https://www.imdb.com/name/nm0865396/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0865396/awards. Added to retry list.
https://www.imdb.com/name/nm0417923/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0417923/awards. Added to retry list.
https://www.imdb.com/name/nm2186174/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0795025/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0795025/awards. Added to retry list.
https://www.imdb.com/name/nm0451699/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0451699/awards. Added to retry list.
https://www.imdb.com/name/nm0283932/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0283932/awards. Added to retry list.
https://www.imdb.com/name/nm0237985/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0237985/awards. Added to retry list.
https://www.imdb.com/name/nm0628647/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0628647/awards. Added to retry list.
https://www.imdb.com/name/nm0600270/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0287471/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0287471/awards. Added to retry list.
https://www.imdb.com/name/nm0649563/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0649563/awards. Added to retry list.
https://www.imdb.com/name/nm0068755/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0068755/awards. Added to retry list.
https://www.imdb.com/name/nm0252603/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0252603/awards. Added to retry list.
https://www.imdb.com/name/nm0781958/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0781958/awards. Added to retry list.
https://www.imdb.com/name/nm0276578/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0668734/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0668734/awards. Added to retry list.
https://www.imdb.com/name/nm0272194/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0272194/awards. Added to retry list.
https://www.imdb.com/name/nm1974002/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1974002/awards. Added to retry list.
https://www.imdb.com/name/nm0645516/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0645516/awards. Added to retry list.
https://www.imdb.com/name/nm0614026/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0614026/awards. Added to retry list.
https://www.imdb.com/name/nm0628002/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0438027/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0438027/awards. Added to retry list.
https://www.imdb.com/name/nm0438022/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0438022/awards. Added to retry list.
https://www.imdb.com/name/nm0061223/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0061223/awards. Added to retry list.
https://www.imdb.com/name/nm1286895/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1286895/awards. Added to retry list.
https://www.imdb.com/name/nm0439202/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0439202/awards. Added to retry list.
https://www.imdb.com/name/nm0437997/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0124583/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0124583/awards. Added to retry list.
https://www.imdb.com/name/nm0639613/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0639613/awards. Added to retry list.
https://www.imdb.com/name/nm0527099/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0527099/awards. Added to retry list.
https://www.imdb.com/name/nm0795025/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0795025/awards. Added to retry list.
https://www.imdb.com/name/nm1065461/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1065461/awards. Added to retry list.
https://www.imdb.com/name/nm0283932/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0300353/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0300353/awards. Added to retry list.
https://www.imdb.com/name/nm0457556/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0457556/awards. Added to retry list.
https://www.imdb.com/name/nm0903939/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0903939/awards. Added to retry list.
https://www.imdb.com/name/nm1011131/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1011131/awards. Added to retry list.
https://www.imdb.com/name/nm0620325/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0620325/awards. Added to retry list.
https://www.imdb.com/name/nm0457557/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0628770/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0628770/awards. Added to retry list.
https://www.imdb.com/name/nm0437140/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0437140/awards. Added to retry list.
https://www.imdb.com/name/nm0630294/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0630294/awards. Added to retry list.
https://www.imdb.com/name/nm0631798/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0631798/awards. Added to retry list.
https://www.imdb.com/name/nm0067637/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0067637/awards. Added to retry list.
https://www.imdb.com/name/nm0019627/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0620325/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0620325/awards. Added to retry list.
https://www.imdb.com/name/nm0613147/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0613147/awards. Added to retry list.
https://www.imdb.com/name/nm1011131/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1011131/awards. Added to retry list.
https://www.imdb.com/name/nm0275941/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0275941/awards. Added to retry list.
https://www.imdb.com/name/nm0537551/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0537551/awards. Added to retry list.
https://www.imdb.com/name/nm0131469/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0697515/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0697515/awards. Added to retry list.
https://www.imdb.com/name/nm0872164/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0872164/awards. Added to retry list.
https://www.imdb.com/name/nm0029754/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0029754/awards. Added to retry list.
https://www.imdb.com/name/nm1290978/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1290978/awards. Added to retry list.
https://www.imdb.com/name/nm0872695/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0872695/awards. Added to retry list.
https://www.imdb.com/name/nm0487110/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0782620/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0782620/awards. Added to retry list.
https://www.imdb.com/name/nm0882096/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0882096/awards. Added to retry list.
https://www.imdb.com/name/nm0883006/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0883006/awards. Added to retry list.
https://www.imdb.com/name/nm0060694/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0060694/awards. Added to retry list.
https://www.imdb.com/name/nm0906735/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0906735/awards. Added to retry list.
https://www.imdb.com/name/nm1090743/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0147352/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0147352/awards. Added to retry list.
https://www.imdb.com/name/nm0162244/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0162244/awards. Added to retry list.
chunk 12 of 26 completed with IP 185.220.100.255. Took 106.37432336807251 seconds.
Wrote pickle ./pickles/actors_subset/12_22000-24000_actors_subset.pkl with 4582 rows
chunk 13 of 26 completed with IP 89.234.157.254. Took 156.56120204925537 seconds.
Wrote pickle ./pickles/actors_subset/13_24000-26000_actors_subset.pkl with 6303 rows
chunk 14 of 26 completed with IP 23.129.64.151. Took 499.6644570827484 seconds.
Wrote pickle ./pickles/actors_subset/14_26000-28000_actors_subset.pkl with 2666 rows
https://www.imdb.com/name/nm0887695/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm

https://www.imdb.com/name/nm0725030/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0725030/awards. Added to retry list.
https://www.imdb.com/name/nm0879218/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0879218/awards. Added to retry list.
https://www.imdb.com/name/nm0098396/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0098396/awards. Added to retry list.
https://www.imdb.com/name/nm0825492/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0825492/awards. Added to retry list.
https://www.imdb.com/name/nm0658029/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0658029/awards. Added to retry list.
https://www.imdb.com/name/nm0658498/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0722826/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0722826/awards. Added to retry list.
https://www.imdb.com/name/nm0876947/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0876947/awards. Added to retry list.
https://www.imdb.com/name/nm0659521/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0659521/awards. Added to retry list.
https://www.imdb.com/name/nm0660657/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0660657/awards. Added to retry list.
https://www.imdb.com/name/nm0168312/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0168312/awards. Added to retry list.
https://www.imdb.com/name/nm0660415/awards generated an exception: 503 Server Error: Service Unavailable 

chunk 15 of 26 completed with IP 104.244.78.55. Took 87.69530391693115 seconds.
Wrote pickle ./pickles/actors_subset/15_28000-30000_actors_subset.pkl with 2576 rows
https://www.imdb.com/name/nm0830751/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0830751/awards. Added to retry list.
https://www.imdb.com/name/nm0141482/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0141482/awards. Added to retry list.
https://www.imdb.com/name/nm0143285/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0143285/awards. Added to retry list.
https://www.imdb.com/name/nm0710770/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0710770/awards. Added to retry list.
https://www.imdb.com/name/nm0837098/awards generated an exception: 503 Server Error: Service Unavailable for url: https

https://www.imdb.com/name/nm0677886/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0677886/awards. Added to retry list.
https://www.imdb.com/name/nm0093681/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0093681/awards. Added to retry list.
https://www.imdb.com/name/nm0082405/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0082405/awards. Added to retry list.
https://www.imdb.com/name/nm0864709/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0864709/awards. Added to retry list.
https://www.imdb.com/name/nm0864653/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0864653/awards. Added to retry list.
https://www.imdb.com/name/nm0082380/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0671310/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0671310/awards. Added to retry list.
https://www.imdb.com/name/nm0714941/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0714941/awards. Added to retry list.
https://www.imdb.com/name/nm0095260/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0095260/awards. Added to retry list.
https://www.imdb.com/name/nm0714879/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0714879/awards. Added to retry list.
https://www.imdb.com/name/nm0671326/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0671326/awards. Added to retry list.
https://www.imdb.com/name/nm0162569/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0889753/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0889753/awards. Added to retry list.
https://www.imdb.com/name/nm0178507/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0178507/awards. Added to retry list.
https://www.imdb.com/name/nm0134922/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0134922/awards. Added to retry list.
https://www.imdb.com/name/nm0889844/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0889844/awards. Added to retry list.
https://www.imdb.com/name/nm0736688/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0736688/awards. Added to retry list.
https://www.imdb.com/name/nm0646419/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0825287/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0825287/awards. Added to retry list.
https://www.imdb.com/name/nm0879239/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0879239/awards. Added to retry list.
https://www.imdb.com/name/nm0657190/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0657190/awards. Added to retry list.
https://www.imdb.com/name/nm0169186/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0169186/awards. Added to retry list.
https://www.imdb.com/name/nm0825361/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0825361/awards. Added to retry list.
https://www.imdb.com/name/nm0077086/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0168910/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0168910/awards. Added to retry list.
https://www.imdb.com/name/nm0659173/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0659173/awards. Added to retry list.
https://www.imdb.com/name/nm0826628/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0826628/awards. Added to retry list.
chunk 16 of 26 completed with IP 109.70.100.23. Took 92.6700668334961 seconds.
Wrote pickle ./pickles/actors_subset/16_30000-32000_actors_subset.pkl with 2533 rows
https://www.imdb.com/name/nm6327190/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm6327190/awards. Added to retry list.
https://www.imdb.com/name/nm0790689/awards generated an exception: 503 Server Error: Service Unavailable for url: https:

https://www.imdb.com/name/nm0834496/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0834496/awards. Added to retry list.
https://www.imdb.com/name/nm0082653/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0082653/awards. Added to retry list.
https://www.imdb.com/name/nm0677886/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0677886/awards. Added to retry list.
https://www.imdb.com/name/nm0671310/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0671310/awards. Added to retry list.
https://www.imdb.com/name/nm0671326/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0671326/awards. Added to retry list.
https://www.imdb.com/name/nm0714941/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0646419/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0646419/awards. Added to retry list.
https://www.imdb.com/name/nm0646045/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0646045/awards. Added to retry list.
https://www.imdb.com/name/nm0176073/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0176073/awards. Added to retry list.
https://www.imdb.com/name/nm0176226/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0176226/awards. Added to retry list.
https://www.imdb.com/name/nm0646583/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0646583/awards. Added to retry list.
https://www.imdb.com/name/nm0176161/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0076595/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0076595/awards. Added to retry list.
https://www.imdb.com/name/nm0657420/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0657420/awards. Added to retry list.
https://www.imdb.com/name/nm0076613/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0076613/awards. Added to retry list.
https://www.imdb.com/name/nm0076149/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0076149/awards. Added to retry list.
https://www.imdb.com/name/nm0879672/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0879672/awards. Added to retry list.
https://www.imdb.com/name/nm0657190/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0098118/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0098118/awards. Added to retry list.
https://www.imdb.com/name/nm0723854/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0723854/awards. Added to retry list.
https://www.imdb.com/name/nm0076828/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0076828/awards. Added to retry list.
https://www.imdb.com/name/nm0139949/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0139949/awards. Added to retry list.
https://www.imdb.com/name/nm0097967/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0097967/awards. Added to retry list.
https://www.imdb.com/name/nm0139554/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0396884/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0396884/awards. Added to retry list.
https://www.imdb.com/name/nm0396800/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0396800/awards. Added to retry list.
https://www.imdb.com/name/nm0813633/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0813633/awards. Added to retry list.
https://www.imdb.com/name/nm2563995/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm2563995/awards. Added to retry list.
https://www.imdb.com/name/nm0369873/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0369873/awards. Added to retry list.
https://www.imdb.com/name/nm0797714/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0739115/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0739115/awards. Added to retry list.
https://www.imdb.com/name/nm0797569/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0797569/awards. Added to retry list.
https://www.imdb.com/name/nm0369229/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0369229/awards. Added to retry list.
https://www.imdb.com/name/nm2533079/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm2533079/awards. Added to retry list.
https://www.imdb.com/name/nm0760641/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0760641/awards. Added to retry list.
https://www.imdb.com/name/nm0398543/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0709986/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0709986/awards. Added to retry list.
https://www.imdb.com/name/nm0837750/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0837750/awards. Added to retry list.
https://www.imdb.com/name/nm0671987/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0671987/awards. Added to retry list.
https://www.imdb.com/name/nm0868668/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0868668/awards. Added to retry list.
https://www.imdb.com/name/nm0868670/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0868670/awards. Added to retry list.
https://www.imdb.com/name/nm0713614/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0868123/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0868123/awards. Added to retry list.
https://www.imdb.com/name/nm0134922/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0134922/awards. Added to retry list.
https://www.imdb.com/name/nm0178507/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0178507/awards. Added to retry list.
https://www.imdb.com/name/nm0646419/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0646419/awards. Added to retry list.
https://www.imdb.com/name/nm0161946/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0161946/awards. Added to retry list.
https://www.imdb.com/name/nm0646045/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0825361/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0825361/awards. Added to retry list.
https://www.imdb.com/name/nm0659601/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0659601/awards. Added to retry list.
https://www.imdb.com/name/nm0660657/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0660657/awards. Added to retry list.
https://www.imdb.com/name/nm0660415/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0660415/awards. Added to retry list.
https://www.imdb.com/name/nm0077381/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0077381/awards. Added to retry list.
https://www.imdb.com/name/nm0168484/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0727850/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0727850/awards. Added to retry list.
https://www.imdb.com/name/nm0727941/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0727941/awards. Added to retry list.
https://www.imdb.com/name/nm0794140/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0794140/awards. Added to retry list.
https://www.imdb.com/name/nm1975010/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1975010/awards. Added to retry list.
https://www.imdb.com/name/nm0728610/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0728610/awards. Added to retry list.
https://www.imdb.com/name/nm0363545/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0795313/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0795313/awards. Added to retry list.
https://www.imdb.com/name/nm2103483/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm2103483/awards. Added to retry list.
https://www.imdb.com/name/nm0407548/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0407548/awards. Added to retry list.
https://www.imdb.com/name/nm2027619/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm2027619/awards. Added to retry list.
https://www.imdb.com/name/nm0763330/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0763330/awards. Added to retry list.
https://www.imdb.com/name/nm0821018/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0397219/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0397219/awards. Added to retry list.
https://www.imdb.com/name/nm0396884/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0396884/awards. Added to retry list.
https://www.imdb.com/name/nm0369760/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0369760/awards. Added to retry list.
https://www.imdb.com/name/nm2578861/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm2578861/awards. Added to retry list.
https://www.imdb.com/name/nm2569308/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm2569308/awards. Added to retry list.
https://www.imdb.com/name/nm0396216/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0144957/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0144957/awards. Added to retry list.
https://www.imdb.com/name/nm0711777/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0711777/awards. Added to retry list.
https://www.imdb.com/name/nm0675790/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0675790/awards. Added to retry list.
https://www.imdb.com/name/nm0162987/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0162987/awards. Added to retry list.
https://www.imdb.com/name/nm1578769/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1578769/awards. Added to retry list.
https://www.imdb.com/name/nm0082405/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0646419/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0646419/awards. Added to retry list.
https://www.imdb.com/name/nm0176073/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0176073/awards. Added to retry list.
https://www.imdb.com/name/nm0646045/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0646045/awards. Added to retry list.
https://www.imdb.com/name/nm0646583/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0646583/awards. Added to retry list.
https://www.imdb.com/name/nm0733500/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0733500/awards. Added to retry list.
https://www.imdb.com/name/nm0176226/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0077086/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0077086/awards. Added to retry list.
https://www.imdb.com/name/nm0678230/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0678230/awards. Added to retry list.
https://www.imdb.com/name/nm0825361/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0825361/awards. Added to retry list.
https://www.imdb.com/name/nm0659601/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0659601/awards. Added to retry list.
https://www.imdb.com/name/nm0660657/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0660657/awards. Added to retry list.
https://www.imdb.com/name/nm0077381/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm1175038/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1175038/awards. Added to retry list.
https://www.imdb.com/name/nm0476838/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0476838/awards. Added to retry list.
https://www.imdb.com/name/nm0677168/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0677168/awards. Added to retry list.
https://www.imdb.com/name/nm1174966/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm1174966/awards. Added to retry list.
https://www.imdb.com/name/nm0677320/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0677320/awards. Added to retry list.
https://www.imdb.com/name/nm0476776/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0322707/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0322707/awards. Added to retry list.
https://www.imdb.com/name/nm0073146/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0073146/awards. Added to retry list.
https://www.imdb.com/name/nm0248885/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0248885/awards. Added to retry list.
https://www.imdb.com/name/nm0215648/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0215648/awards. Added to retry list.
https://www.imdb.com/name/nm0155642/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0155642/awards. Added to retry list.
https://www.imdb.com/name/nm0270248/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0185946/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0185946/awards. Added to retry list.
https://www.imdb.com/name/nm0003155/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0003155/awards. Added to retry list.
https://www.imdb.com/name/nm0043234/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0043234/awards. Added to retry list.
https://www.imdb.com/name/nm0103854/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0103854/awards. Added to retry list.
https://www.imdb.com/name/nm0248698/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0248698/awards. Added to retry list.
https://www.imdb.com/name/nm0156001/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0315882/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0315882/awards. Added to retry list.
https://www.imdb.com/name/nm0219453/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0219453/awards. Added to retry list.
https://www.imdb.com/name/nm0044140/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0044140/awards. Added to retry list.
https://www.imdb.com/name/nm0036834/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0036834/awards. Added to retry list.
https://www.imdb.com/name/nm0315942/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0315942/awards. Added to retry list.
https://www.imdb.com/name/nm0044129/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0031468/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0031468/awards. Added to retry list.
https://www.imdb.com/name/nm0280615/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0280615/awards. Added to retry list.
https://www.imdb.com/name/nm0131965/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0131965/awards. Added to retry list.
https://www.imdb.com/name/nm0242090/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0242090/awards. Added to retry list.
https://www.imdb.com/name/nm0280584/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0280584/awards. Added to retry list.
https://www.imdb.com/name/nm0304023/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0299796/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0299796/awards. Added to retry list.
https://www.imdb.com/name/nm0119782/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0119782/awards. Added to retry list.
https://www.imdb.com/name/nm0299717/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0299717/awards. Added to retry list.
https://www.imdb.com/name/nm0130046/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0130046/awards. Added to retry list.
https://www.imdb.com/name/nm0057942/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0057942/awards. Added to retry list.
https://www.imdb.com/name/nm0229594/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0298619/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0298619/awards. Added to retry list.
https://www.imdb.com/name/nm0057500/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0057500/awards. Added to retry list.
https://www.imdb.com/name/nm0170969/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0170969/awards. Added to retry list.
https://www.imdb.com/name/nm0230884/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0230884/awards. Added to retry list.
https://www.imdb.com/name/nm0230863/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0230863/awards. Added to retry list.
https://www.imdb.com/name/nm0171041/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0186232/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0186232/awards. Added to retry list.
https://www.imdb.com/name/nm0216152/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0216152/awards. Added to retry list.
https://www.imdb.com/name/nm0321108/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0321108/awards. Added to retry list.
https://www.imdb.com/name/nm0103854/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0103854/awards. Added to retry list.
https://www.imdb.com/name/nm0043234/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0043234/awards. Added to retry list.
https://www.imdb.com/name/nm0003155/awards generated an exception: 503 Server Error: Service Unavailable 

https://www.imdb.com/name/nm0181963/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0181963/awards. Added to retry list.
https://www.imdb.com/name/nm0044206/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0044206/awards. Added to retry list.
https://www.imdb.com/name/nm0004450/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0004450/awards. Added to retry list.
https://www.imdb.com/name/nm0315882/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0315882/awards. Added to retry list.
https://www.imdb.com/name/nm0044140/awards generated an exception: 503 Server Error: Service Unavailable for url: https://www.imdb.com/name/nm0044140/awards. Added to retry list.
https://www.imdb.com/name/nm0219453/awards generated an exception: 503 Server Error: Service Unavailable 

In [None]:
'''
Run the following script only if there were any failed URLS at the end of the previous script execution.
Run this several times if you still see failures until there are none. But if some dont succeed even 
after multiple attempts, its probably a problem on imdb side and we can just ignore them.
'''
# The retry script below is commented out; uncomment it to run a retry pass.
# It uses names that are not defined in this cell (failed_urls, step_size, used_ips,
# postfix, get_tor_session, renew_connection, get_data) -- presumably defined by the
# earlier scraping cells; verify they are still in the kernel before running.
#
# Per chunk of `step_size` URLs it: renews the connection, loops until the IP reported
# by httpbin.org/ip is not already in `used_ips`, submits get_data() for every URL
# in the chunk (plus the failures carried over from the previous chunk) to the thread
# pool, and pickles whatever rows came back.
# urls = failed_urls

# if len(urls) > 0:
#     retry_failed_urls = []

#     with concurrent.futures.ThreadPoolExecutor(max_workers=150) as executor:
#         session = get_tor_session(None)
#         chunk_indeces = np.arange(0, len(urls), step_size)

#         for idx, start in enumerate(chunk_indeces, start=1):
#             start_t = time.time()
#             finals = []
#             renew_connection()
#             session = get_tor_session(session)
#             new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
#             while new_ip in used_ips:
#                 print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
#                 time.sleep(5)
#                 renew_connection()
#                 session = get_tor_session(session)
#                 new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
#             used_ips.append(new_ip)

#             # URLs that failed in the previous chunk are appended to the current chunk.
#             url_chunk = urls[start:start + step_size] + retry_failed_urls
#             retry_failed_urls = []

#             future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
#             for future in concurrent.futures.as_completed(future_to_url):
#                 url = future_to_url[future]
#                 try:
#                     data = future.result()
#                     finals.append(data)
#                 except Exception as exc:
#                     retry_failed_urls.append(url.strip("'"))
#                     print('%s generated an exception: %s. Added to retry list.' % (url, exc))

#             concurrent.futures.wait(list(future_to_url.keys()), timeout=None, return_when=concurrent.futures.ALL_COMPLETED)
#             end_t = time.time()
#             print("chunk {0} of {1} completed with IP {2}. Took {3} seconds.".format(idx, len(chunk_indeces), new_ip, end_t - start_t))   
#             actors_df = pd.DataFrame(data=finals)
#             if len(actors_df) > 0:
#                 # NOTE(review): this writes under ./pickles/actortor/, while the combine cell
#                 # reads from pickles_save_dir -- confirm both point at the same directory,
#                 # otherwise the retried rows never reach the combined DataFrame.
#                 pickle_path = "./pickles/actortor/{0}_{1}-{2}_{3}_retry.pkl".format(idx, start, start + step_size, postfix)
#                 actors_df.to_pickle(pickle_path)
#                 print("Wrote pickle {0} with {1} rows".format(pickle_path, len(actors_df)))

#         # Anything still in retry_failed_urls here failed in the last chunk and was not retried again.
#         if len(retry_failed_urls) > 0:
#             print('\n\n\n\n!!!**********************************!!!')
#             print('Retry failed with the following URLs.\n')
#             print(retry_failed_urls)
#             print('\n!!!**********************************!!!')
#             print('Note: Run this script several times to see if all succeed.',
#             'If some dont, it probably means they wont ever. so just give them up.')
#         else:
#             print("\n\n\n\n!!!**********************************!!!\n")
#             print("Retry succeeded!")
#             print('\n!!!**********************************!!!')
#             print('''Note: Run the following script to combine all pickles''')

In [41]:
# Combine the per-chunk pickles into a single DataFrame.
# Sub-directories and hidden system files (names starting with '.') are skipped.
# listdir() returns entries in arbitrary order, so the names are sorted to make
# the row order of combined_df reproducible from one run to the next.
pickles = sorted(
    f for f in listdir(pickles_save_dir)
    if isfile(join(pickles_save_dir, f)) and not f.startswith('.')
)
if not pickles:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError("No pickles found in {0}; run the scraping cell first.".format(pickles_save_dir))

# NOTE(review): unpickling can execute arbitrary code -- only load files written by this notebook.
dfs = [pd.read_pickle(join(pickles_save_dir, name)) for name in pickles]

# ignore_index=True gives the combined frame a fresh 0..n-1 index in one step.
combined_df = pd.concat(dfs, ignore_index=True)

os.makedirs('./pickles', exist_ok=True)
combined_df.to_pickle("./pickles/complete_{1}_{0}.pkl".format(e_idx, postfix))
len(combined_df)

139613

IMDb doesn't have awards information for many of the actors, so the number of actors in this final set is smaller than the originally chosen subset.

In [49]:
# Number of distinct actors that have at least one scraped award row
# (dropna=False so a missing nconst would still be counted, as unique() does).
combined_df['nconst'].nunique(dropna=False)

15567

In [70]:
# Actors taken from the index of stars_counts.
actors_original = {*stars_counts.index}
# Actors with at least one row in the combined awards frame.
actors_final = {*combined_df.groupby('nconst').count().index}

In [71]:
# Actors in the original set that have no rows in the combined awards frame.
actors_original - actors_final

{'nm2748018',
 'nm1155983',
 'nm0241743',
 'nm2507126',
 'nm1269330',
 'nm0222341',
 'nm0224047',
 'nm0308218',
 'nm0469843',
 'nm0829067',
 'nm0634962',
 'nm0642077',
 'nm0005422',
 'nm0867964',
 'nm9208548',
 'nm0616143',
 'nm0066159',
 'nm1128394',
 'nm3770646',
 'nm0904154',
 'nm0374532',
 'nm0684433',
 'nm0315905',
 'nm0336222',
 'nm0233180',
 'nm1207975',
 'nm0160271',
 'nm0174495',
 'nm10100220',
 'nm0875000',
 'nm1282377',
 'nm0582586',
 'nm0694879',
 'nm2275592',
 'nm1239461',
 'nm0244596',
 'nm1002871',
 'nm0149575',
 'nm1416073',
 'nm0049413',
 'nm0686875',
 'nm1717257',
 'nm0463575',
 'nm0724900',
 'nm6478127',
 'nm1914112',
 'nm0010433',
 'nm0193104',
 'nm1696866',
 'nm0900877',
 'nm0001146',
 'nm1298009',
 'nm0645811',
 'nm0147721',
 'nm1090785',
 'nm1616450',
 'nm0880325',
 'nm3153141',
 'nm0153508',
 'nm0952026',
 'nm0755716',
 'nm0240076',
 'nm0232621',
 'nm0328600',
 'nm0896424',
 'nm0917733',
 'nm0380395',
 'nm0241767',
 'nm0381858',
 'nm0698346',
 'nm0754431',
 'nm1

Awards of one random actor

In [58]:
# Show every award row for one sample actor (nm0026364).
combined_df[combined_df.nconst == 'nm0026364']

Unnamed: 0,nconst,year,category,w_n,description,movie,tconst
112000,nm0026364,2019,Image Award,Winner,Outstanding Actor in a Comedy Series,Black-ish,tt3487356
112001,nm0026364,2018,Image Award,Winner,Outstanding Actor in a Comedy Series,Black-ish,tt3487356
112002,nm0026364,2017,Image Award,Winner,Outstanding Actor in a Comedy Series,Black-ish,tt3487356
112003,nm0026364,2017,Image Award,Nominee,"Outstanding Host in a News, Talk, Reality, or ...",BET Awards 2016,tt5847012
112004,nm0026364,2016,Image Award,Winner,Outstanding Actor in a Comedy Series,Black-ish,tt3487356
112005,nm0026364,2015,Image Award,Winner,Outstanding Actor in a Comedy Series,Black-ish,tt3487356
112006,nm0026364,2013,Image Award,Nominee,Outstanding Actor in a Comedy Series,Guys with Kids,tt2281583
112007,nm0026364,2011,Image Award,Nominee,Outstanding Actor in a Drama Series,Law & Order,tt0098844
112008,nm0026364,2010,Image Award,Nominee,Outstanding Actor in a Drama Series,Law & Order,tt0098844
112009,nm0026364,2009,Image Award,Nominee,Outstanding Actor in a Drama Series,Law & Order,tt0098844
