In [1]:
import re
import ast
import time
import requests
import pandas as pd
import numpy as np
import os.path
import urllib.request
import concurrent.futures
from os import listdir
from stem import Signal
from requests import get
from bs4 import BeautifulSoup
from datetime import datetime
from os.path import isfile, join
from stem.control import Controller


In [2]:
movie_subset = pd.read_pickle("./pickles/complete_178118.pkl")
movie_subset.head()

Unnamed: 0,tconst,stars,oscarWins,nominations,wins,releaseDate,releaseCountry,plotKeywords,budget,worldwideGross,metascore,musicProducer
0,tt0014799,"[nm0265550, nm0370407, nm0550195]",0,0,0,1924-05-31,UK,[],,,,
1,tt0014843,"[nm0427659, nm0107574, nm0421138]",0,0,0,1924-08-24,USA,[],,,,
2,tt0014809,"[nm0267916, nm0119572, nm0055809]",0,0,0,1924-04-08,USA,[],,,,
3,tt0014751,"[nm0403710, nm0744408]",0,0,0,,,[],,,,
4,tt0014812,"[nm0556953, nm0531962, nm0645941]",0,0,0,1924-12-28,USA,[],,,,


In [3]:
stars_exploded = movie_subset.explode('stars')[['tconst', 'stars']]
stars_exploded.head()

Unnamed: 0,tconst,stars
0,tt0014799,nm0265550
0,tt0014799,nm0370407
0,tt0014799,nm0550195
1,tt0014843,nm0427659
1,tt0014843,nm0107574


In [4]:
stars_exploded.stars.nunique()

138245

In [5]:
stars_counts = stars_exploded\
                    .groupby('stars')\
                    .count()\
                    .rename(columns={"tconst": "count"})\
                    .sort_values(['count'], ascending=False)
stars_counts.index.name = 'nconst'

In [6]:
stars_counts.tail(87400) 

Unnamed: 0_level_0,count
nconst,Unnamed: 1_level_1
nm0323379,2
nm0098627,2
nm0073596,2
nm1054406,1
nm1030940,1
...,...
nm0670816,1
nm0163019,1
nm0670779,1
nm0670771,1


Close to 90000 stars have starred in only 1 movie in the chosen movie subset. For the purpose of bringing down the actors count for scraping,  for the moment, let's consider only the actors that have starred in more than 1 movie in the chosen movie subset. 


In [7]:
print("Jessica Alba: {0} movies".format(stars_counts.loc['nm0004695']['count']))
print("Billy Boyd (Pippin from LOTR): {0} movies".format(stars_counts.loc['nm0101710']['count']))
print("Will Smith: {0} movies".format(stars_counts.loc['nm0000226']['count']))
print("Morgan Freeman: {0} movies".format(stars_counts.loc['nm0000151']['count']))


Jessica Alba: 4 movies
Billy Boyd (Pippin from LOTR): 1 movies
Will Smith: 12 movies
Morgan Freeman: 26 movies


In [8]:
stars_subset = stars_counts[stars_counts['count'] > 1]
len(stars_subset)

50848

### Scraping

In [9]:
def get_tor_session(prev_session):
    session = requests.session()
    # Tor uses the 9050 port as the default socks port
    session.proxies = {'http':  'socks5://127.0.0.1:9050',
                       'https': 'socks5://127.0.0.1:9050'}
    return session

def renew_connection():
    with Controller.from_port(port = 9051) as controller:
        controller.authenticate(password="password")

def get_data(session, url):
    nconst=url.rsplit('/', 2)[-2]
    result = {
        'nconst': nconst,
        'year': None,
        'category': None,
        'w_n': None,
        'description': None,
        'movie': None,
        'tconst': None 
    }
    
    with session.get(url) as req:
        req.raise_for_status()    
    
        body=req.text

        soup=BeautifulSoup(body,'html.parser')
        awards=soup.find_all('tr')
        if awards is not None:
            for award in awards:
                if award.find('td', class_='award_year') is not None:
                    if award.find('td', class_='award_year').find('a') is not None:
                        year = award.find('td', class_='award_year').find('a').text
                        if year is not None:
                            try:
                                year = int(year.replace("\n", "").strip())
                                result['year'] = year
                            except:
                                print("Failed to parse int year {0}".format(year))



                if award.find('span',class_='award_category') is not None:
                    result['category'] = award.find('span',class_='award_category').text

                if award.find('td',class_="award_outcome") is not None:
                    outcome = award.find('td',class_="award_outcome").find('b')
                    if outcome is not None:
                        outcome_txt = outcome.text
                        if outcome_txt is not None:
                            result['w_n'] = outcome_txt.replace("\n", "").strip()

                if award.find('td', class_='award_description') is not None:
                    award_txt = award.find('td',class_='award_description').find(text=True, recursive=False)
                    award_info = award.find('td',class_='award_description').find('a')
                    if award_txt is not None:
                        result['description'] = award_txt.replace("\n", "").strip()

                    if award_info is not None:
                        result['movie'] = award_info.text
                        tconst_res = re.search('tt\d{7}', award_info.get('href'))
                        if tconst_res is not None:
                            result['tconst'] = tconst_res.group(0)

            return result

In [10]:
# Set the start index and end index. Ip is renewed after each chunk of urls of 'step_size'. 
# After every chunk, result is written to a pickle. So if you need to stop the execution in the middle, note the
# index range of the last successfully written pickle file (from the printed logs) and use the remaining range
# for start and end index to resume from where you stopped.
postfix = 'actors_subset'
s_idx = 0
e_idx = len(stars_subset)
step_size = 5000
pickles_save_dir = './pickles/{0}'.format(postfix)

base_url = 'https://www.imdb.com/name/{0}/awards'
urls=[]
for nconst in stars_subset.index[s_idx:e_idx]:
    urls.append(base_url.format(nconst))

used_ips = []
failed_urls = []

with concurrent.futures.ThreadPoolExecutor(max_workers=150) as executor:
#     session = get_tor_session(None)
    chunk_indeces = np.arange(0, len(urls), step_size)
    
    for idx, start in enumerate(chunk_indeces, start=1):
        start_t = time.time()
        finals = []
        renew_connection()
        
        new_ip = ''
        with requests.session() as session:
        # Tor uses the 9050 port as the default socks port
            session.proxies = {'http':  'socks5://127.0.0.1:9050',
                               'https': 'socks5://127.0.0.1:9050'}
#             session = get_tor_session(session)
            new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
        while new_ip in used_ips:
            print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
            time.sleep(5)
            renew_connection()
            with requests.session() as session:
                session.proxies = {'http':  'socks5://127.0.0.1:9050',
                                   'https': 'socks5://127.0.0.1:9050'}
#             session = get_tor_session(session)
                new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
        used_ips.append(new_ip)

        url_chunk = urls[start:start + step_size] + failed_urls
        failed_urls = []

        with requests.session() as session:
            session.proxies = {'http':  'socks5://127.0.0.1:9050',
                                   'https': 'socks5://127.0.0.1:9050'}
            future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                    finals.append(data)
                except Exception as exc:
                    failed_urls.append(url.strip("'"))
                    print('%s generated an exception: %s. Added to retry list.' % (url, exc))

            concurrent.futures.wait(
                list(future_to_url.keys()), 
                timeout=None, 
                return_when=concurrent.futures.ALL_COMPLETED)
            end_t = time.time()
            print("chunk {0} of {1} completed with IP {2}. Took {3} seconds."\
                  .format(idx, len(chunk_indeces), new_ip, end_t - start_t))   
            actors_df = pd.DataFrame(data=finals)
            if not os.path.exists(pickles_save_dir):
                os.makedirs(pickles_save_dir)
            pickle_path = "{4}/{0}_{1}-{2}_{3}.pkl"\
                            .format(idx, start, start + step_size, postfix, pickles_save_dir)
            actors_df.to_pickle(pickle_path)
            print("Wrote pickle {0} with {1} rows".format(pickle_path, len(actors_df)))
        
    if len(failed_urls) > 0:
        print('\n\n\n\n!!!**********************************!!!')
        print('Failed to retrieve the following URLs.\n')
        print(failed_urls)
        print('\n!!!**********************************!!!')
        print('Note: Run the following cell to retry the failed urls')
    else:
        print("\n\n\n\n!!!**********************************!!!\n")
        print("Scraping succeeded!")
        print('\n!!!**********************************!!!')
        print('''Note: Next, run the script that combine all pickles''')

https://www.imdb.com/name/nm0249982/awards generated an exception: [('system library', 'fopen', 'Too many open files'), ('BIO routines', 'BIO_new_file', 'system lib'), ('x509 certificate routines', 'X509_load_cert_crl_file', 'system lib')]. Added to retry list.
https://www.imdb.com/name/nm0823633/awards generated an exception: [('system library', 'fopen', 'Too many open files'), ('BIO routines', 'BIO_new_file', 'system lib'), ('x509 certificate routines', 'X509_load_cert_crl_file', 'system lib')]. Added to retry list.
https://www.imdb.com/name/nm0453520/awards generated an exception: [('system library', 'fopen', 'Too many open files'), ('BIO routines', 'BIO_new_file', 'system lib'), ('x509 certificate routines', 'X509_load_cert_crl_file', 'system lib')]. Added to retry list.
https://www.imdb.com/name/nm0004437/awards generated an exception: [('system library', 'fopen', 'Too many open files'), ('BIO routines', 'BIO_new_file', 'system lib'), ('x509 certificate routines', 'X509_load_cert_

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "<ipython-input-10-c91b4a8ee5a8>", line 53, in <module>
    for future in concurrent.futures.as_completed(future_to_url):
  File "/Users/ruchiranga/anaconda3/lib/python3.6/concurrent/futures/_base.py", line 240, in as_completed
    waiter.event.wait(wait_timeout)
  File "/Users/ruchiranga/anaconda3/lib/python3.6/threading.py", line 551, in wait
    signaled = self._cond.wait(timeout)
  File "/Users/ruchiranga/anaconda3/lib/python3.6/threading.py", line 295, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ruchiranga/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-c91b4a8ee5a8>", line 87, in <module>
    print('''Note: Next, run the script that combine all pickles''')
  File "/Users/ruchiranga/anaconda3/

TypeError: must be str, not list

Traceback (most recent call last):
  File "/Users/ruchiranga/anaconda3/lib/python3.6/site-packages/urllib3/connectionpool.py", line 265, in _put_conn
    self.pool.put(conn, block=False)
  File "/Users/ruchiranga/anaconda3/lib/python3.6/queue.py", line 130, in put
    raise Full
queue.Full

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ruchiranga/anaconda3/lib/python3.6/site-packages/ipykernel/iostream.py", line 97, in _event_pipe
    event_pipe = self._local.event_pipe
AttributeError: '_thread._local' object has no attribute 'event_pipe'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ruchiranga/anaconda3/lib/python3.6/logging/__init__.py", line 996, in emit
    stream.write(msg)
  File "/Users/ruchiranga/anaconda3/lib/python3.6/site-packages/ipykernel/iostream.py", line 402, in write
  File "/Users/ruchiranga/anaconda3/lib/python3.6/site-pac

In [None]:
'''
Run the following script only if there were any failed URLS at the end of the previous script execution.
Run this several times if you still see failures until there are none. But if some dont succeed even 
after multiple attempts, its probably a problem on imdb side and we can just ignore them.
'''
# urls = failed_urls

# if len(urls) > 0:
#     retry_failed_urls = []

#     with concurrent.futures.ThreadPoolExecutor(max_workers=150) as executor:
#         session = get_tor_session(None)
#         chunk_indeces = np.arange(0, len(urls), step_size)

#         for idx, start in enumerate(chunk_indeces, start=1):
#             start_t = time.time()
#             finals = []
#             renew_connection()
#             session = get_tor_session(session)
#             new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
#             while new_ip in used_ips:
#                 print("Renewed IP {0} already used. Waiting 5s to renew...".format(new_ip))
#                 time.sleep(5)
#                 renew_connection()
#                 session = get_tor_session(session)
#                 new_ip = ast.literal_eval(session.get("http://httpbin.org/ip").text)["origin"].split(",")[0]
#             used_ips.append(new_ip)

#             url_chunk = urls[start:start + step_size] + retry_failed_urls
#             retry_failed_urls = []

#             future_to_url = {executor.submit(get_data, session, url): url for url in url_chunk}
#             for future in concurrent.futures.as_completed(future_to_url):
#                 url = future_to_url[future]
#                 try:
#                     data = future.result()
#                     finals.append(data)
#                 except Exception as exc:
#                     retry_failed_urls.append(url.strip("'"))
#                     print('%s generated an exception: %s. Added to retry list.' % (url, exc))

#             concurrent.futures.wait(list(future_to_url.keys()), timeout=None, return_when=concurrent.futures.ALL_COMPLETED)
#             end_t = time.time()
#             print("chunk {0} of {1} completed with IP {2}. Took {3} seconds.".format(idx, len(chunk_indeces), new_ip, end_t - start_t))   
#             actors_df = pd.DataFrame(data=finals)
#             if len(actors_df) > 0:
#                 pickle_path = "./pickles/actortor/{0}_{1}-{2}_{3}_retry.pkl".format(idx, start, start + step_size, postfix)
#                 actors_df.to_pickle(pickle_path)
#                 print("Wrote pickle {0} with {1} rows".format(pickle_path, len(actors_df)))

#         if len(retry_failed_urls) > 0:
#             print('\n\n\n\n!!!**********************************!!!')
#             print('Retry failed with the following URLs.\n')
#             print(retry_failed_urls)
#             print('\n!!!**********************************!!!')
#             print('Note: Run this script several times to see if all succeed.',
#             'If some dont, it probably means they wont ever. so just give them up.')
#         else:
#             print("\n\n\n\n!!!**********************************!!!\n")
#             print("Retry succeeded!")
#             print('\n!!!**********************************!!!')
#             print('''Note: Run the following script to combine all pickles''')

In [None]:
# Combine the created pickles into one Dataframe
# Ignore any sytem files starting with . or folders if any
pickles = [f for f in listdir(pickles_save_dir) if isfile(join(dir_path, f)) and not f.startswith('.')]

dfs = list(map(lambda x: pd.read_pickle("{0}/{1}".format(dir_path,x)), pickles))
combined_df = pd.concat(dfs)
combined_df.reset_index(drop=True, inplace=True)
if not os.path.exists('./pickles'):
    os.makedirs('./pickles')
combined_df.to_pickle("./pickles/complete_{1}_{0}.pkl".format(e_idx, postfix))
len(combined_df)