In [1]:
import re
import ast
import time
import requests
import pandas as pd
import numpy as np
import os.path
import urllib.request
import concurrent.futures
from os import listdir
from stem import Signal
from requests import get
from bs4 import BeautifulSoup
from datetime import datetime
from os.path import isfile, join
from stem.control import Controller

In [2]:
def get_tor_session():
    session = requests.session()
    # Tor uses the 9050 port as the default socks port
    session.proxies = {'http':  'socks5://127.0.0.1:9050',
                       'https': 'socks5://127.0.0.1:9050'}
    return session

def renew_connection():
    with Controller.from_port(port = 9051) as controller:
        controller.authenticate(password="password")
        controller.signal(Signal.NEWNYM)

In [3]:
def split_profession(str):
    return str.split(',')

In [4]:
if os.path.exists('./pickles/filtered_actors.pkl'):
    print("Reading from pickle...")
    actors = pd.read_pickle("./pickles/filtered_actors.pkl")
else:
    people_df = pd.read_csv(
        './data/name.basics.tsv.gz',
        sep='\t',
        converters={'primaryProfession': split_profession}
    )
    people_prof_exploded = people_df.explode('primaryProfession')
    actors = people_prof_exploded[
        (people_prof_exploded.primaryProfession == 'actress') | (people_prof_exploded.primaryProfession == 'actor')]
    actors.to_pickle("./pickles/filtered_actors.pkl")
actors.head()

Reading from pickle...


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,actor,"tt0050419,tt0053137,tt0043044,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,actress,"tt0117057,tt0037382,tt0071877,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,actress,"tt0049189,tt0059956,tt0054452,tt0057345"
3,nm0000004,John Belushi,1949,1982,actor,"tt0078723,tt0080455,tt0072562,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,actor,"tt0050986,tt0083922,tt0069467,tt0050976"


In [5]:
print(len(actors))

3626035


In [6]:
def get_data(session, url):
    nconst=url.rsplit('/', 2)[-2]
    result = {
        'nconst': nconst,
        'year': None,
        'category': None,
        'w_n': None,
        'description': None,
        'movie': None,
        'tconst': None 
    }
    
    req = session.get(url)
    req.raise_for_status()    
    
    body=req.text
    
    soup=BeautifulSoup(body,'html.parser')
    awards=soup.find_all('tr')
    if awards is not None:
        for award in awards:
            if award.find('td', class_='award_year') is not None:
                if award.find('td', class_='award_year').find('a') is not None:
                    year = award.find('td', class_='award_year').find('a').text
                    if year is not None:
                        try:
                            year = int(year.replace("\n", "").strip())
                            result['year'] = year
                        except:
                            print("Failed to parse int year {0}".format(year))
                        
                    
    
            if award.find('span',class_='award_category') is not None:
                result['category'] = award.find('span',class_='award_category').text
        
            if award.find('td',class_="award_outcome") is not None:
                outcome = award.find('td',class_="award_outcome").find('b')
                if outcome is not None:
                    outcome_txt = outcome.text
                    if outcome_txt is not None:
                        result['w_n'] = outcome_txt.replace("\n", "").strip()
        
            if award.find('td', class_='award_description') is not None:
                award_txt = award.find('td',class_='award_description').find(text=True, recursive=False)
                award_info = award.find('td',class_='award_description').find('a')
                if award_txt is not None:
                    result['description'] = award_txt.replace("\n", "").strip()
                
                if award_info is not None:
                    result['movie'] = award_info.text
                    tconst_res = re.search('tt\d{7}', award_info.get('href'))
                    if tconst_res is not None:
                        result['tconst'] = tconst_res.group(0)
        
        return result
    else:
        return


In [7]:
# Set the start index and end index. Ip is renewed after each chunk of urls of 'step_size'. 
# After every chunk, result is written to a pickle. So if you need to stop the execution in the middle, note the
# index range of the last successfully written pickle file (from the printed logs) and use the remaining range
# for start and end index to resume from where you stopped.
postfix = 'actors_tor'
s_idx = 1211100
e_idx = 1212400
step_size = 600
# [0:3626035]

base_url = 'https://www.imdb.com/name/{0}/awards'
urls=[]
for index, row in actors[s_idx:e_idx].iterrows():
    urls.append(base_url.format(row['nconst']))

used_ips = []
failed_urls = []

with concurrent.futures.ThreadPoolExecutor(max_workers=150) as executor:
    session = get_tor_session()
    chunk_indeces = np.arange(0, len(urls), step_size)
    
    for idx, start in enumerate(chunk_indeces, start=1):
        start_t = time.time()
        finals = []
        renew_connection()
        session = get_tor_session()
        new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
        while new_ip in used_ips:
            print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
            time.sleep(5)
            renew_connection()
            session = get_tor_session()
            new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
        used_ips.append(new_ip)
        
        url_chunk = urls[start:start + step_size] + failed_urls
        failed_urls = []
        
        future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                finals.append(data)
            except Exception as exc:
                failed_urls.append(url.strip("'"))
                print('%s generated an exception: %s. Added to retry list.' % (url, exc))
        
        concurrent.futures.wait(
            list(future_to_url.keys()), 
            timeout=None, 
            return_when=concurrent.futures.ALL_COMPLETED)
        end_t = time.time()
        print("chunk {0} of {1} completed with IP {2}. Took {3} seconds."\
              .format(idx, len(chunk_indeces), new_ip, end_t - start_t))   
        actors_df = pd.DataFrame(data=finals)
        pickle_path = "./pickles/actortor/{0}_{1}-{2}_{3}.pkl"\
                        .format(idx, start, start + step_size, postfix)
        actors_df.to_pickle(pickle_path)
        print("Wrote pickle {0} with {1} rows".format(pickle_path, len(actors_df)))
        
    if len(failed_urls) > 0:
        print('\n\n\n\n!!!**********************************!!!')
        print('Failed to retrieve the following URLs.\n')
        print(failed_urls)
        print('\n!!!**********************************!!!')
        print('Note: Run the following cell to retry the failed urls')
    else:
        print("\n\n\n\n!!!**********************************!!!\n")
        print("Scraping succeeded!")
        print('\n!!!**********************************!!!')
        print('''Note: Next, run the script that combine all pickles''')

chunk 1 of 3 completed with IP 185.220.100.252. Took 35.22462797164917 seconds.
Wrote pickle ./pickles/actortor/1_0-600_actors_tor.pkl with 600 rows
chunk 2 of 3 completed with IP 5.199.135.107. Took 26.74442481994629 seconds.
Wrote pickle ./pickles/actortor/2_600-1200_actors_tor.pkl with 600 rows
chunk 3 of 3 completed with IP 195.176.3.19. Took 5.84783411026001 seconds.
Wrote pickle ./pickles/actortor/3_1200-1800_actors_tor.pkl with 100 rows




!!!**********************************!!!

Scraping succeeded!

!!!**********************************!!!
Note: Next, run the script that combine all pickles


In [118]:
'''
Run the following script only if there were any failed URLS at the end of the previous script execution.
Run this several times if you still see failures until there are none. But if some dont succeed even 
after multiple attempts, its probably a problem on imdb side and we can just ignore them.
'''
# urls = failed_urls

# if len(urls) > 0:
#     retry_failed_urls = []

#     with concurrent.futures.ThreadPoolExecutor(max_workers=150) as executor:
#         session = get_tor_session()
#         chunk_indeces = np.arange(0, len(urls), step_size)

#         for idx, start in enumerate(chunk_indeces, start=1):
#             start_t = time.time()
#             finals = []
#             renew_connection()
#             session = get_tor_session()
#             new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
#             while new_ip in used_ips:
#                 print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
#                 time.sleep(5)
#                 renew_connection()
#                 session = get_tor_session()
#                 new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
#             used_ips.append(new_ip)

#             url_chunk = urls[start:start + step_size] + retry_failed_urls
#             retry_failed_urls = []

#             future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
#             for future in concurrent.futures.as_completed(future_to_url):
#                 url = future_to_url[future]
#                 try:
#                     data = future.result()
#                     finals.append(data)
#                 except Exception as exc:
#                     retry_failed_urls.append(url.strip("'"))
#                     print('%s generated an exception: %s. Added to retry list.' % (url, exc))

#             concurrent.futures.wait(list(future_to_url.keys()), timeout=None, return_when=concurrent.futures.ALL_COMPLETED)
#             end_t = time.time()
#             print("chunk {0} of {1} completed with IP {2}. Took {3} seconds.".format(idx, len(chunk_indeces), new_ip, end_t - start_t))   
#             actors_df = pd.DataFrame(data=finals)
#             if len(actors_df) > 0:
#                 pickle_path = "./pickles/actortor/{0}_{1}-{2}_{3}_retry.pkl".format(idx, start, start + step_size, postfix)
#                 actors_df.to_pickle(pickle_path)
#                 print("Wrote pickle {0} with {1} rows".format(pickle_path, len(actors_df)))

#         if len(retry_failed_urls) > 0:
#             print('\n\n\n\n!!!**********************************!!!')
#             print('Retry failed with the following URLs.\n')
#             print(retry_failed_urls)
#             print('\n!!!**********************************!!!')
#             print('Note: Run this script several times to see if all succeed.',
#             'If some dont, it probably means they wont ever. so just give them up.')
#         else:
#             print("\n\n\n\n!!!**********************************!!!\n")
#             print("Retry succeeded!")
#             print('\n!!!**********************************!!!')
#             print('''Note: Run the following script to combine all pickles''')

chunk 1 of 1 completed with IP 104.244.78.102. Took 1.212857961654663 seconds.
Wrote pickle ./pickles/actortor/1_0-10_actors_tor_retry.pkl with 1 rows




!!!**********************************!!!

Retry succeeded!

!!!**********************************!!!
Note: Run the following script to combine all pickles


In [123]:
# Combine the created pickles into one Dataframe

dir_path = "./pickles/actortor/"
# Ignore any sytem files starting with . or folders if any
pickles = [f for f in listdir(dir_path) if isfile(join(dir_path, f)) and not f.startswith('.')]

dfs = list(map(lambda x: pd.read_pickle("{0}/{1}".format(dir_path,x)), pickles))
combined_df = pd.concat(dfs)
combined_df.reset_index(drop=True, inplace=True)
combined_df.to_pickle("./pickles/complete_actor_{0}.pkl".format(e_idx))
len(combined_df)

51