In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common import exceptions
import time
import random
import re
import pandas as pd
from multiprocessing import Pool
import multiprocessing

In [29]:
def get_links(products):
    """Return the `href` of the first `<a>` tag inside each element of `products`.

    `products` is an iterable of Selenium elements; the result is a list of
    link targets in the same order as the input.
    """
    return [
        item.find_element(By.TAG_NAME, 'a').get_attribute('href')
        for item in products
    ]

In [30]:
def _find_or_na(driver, xpath, attribute=None):
    """Return the text (or `attribute`) of the first element matching `xpath`, or pd.NA if none exists."""
    try:
        element = driver.find_element(By.XPATH, xpath)
    except exceptions.NoSuchElementException:
        return pd.NA
    if attribute is None:
        return element.text
    return element.get_attribute(attribute)


def _scroll_to(driver, xpath, attempts=5):
    """Scroll the page until the element matching `xpath` shows up; return it, or None after `attempts` tries."""
    for _ in range(attempts):
        try:
            element = WebDriverWait(driver, 2).until(lambda d: d.find_element(By.XPATH, xpath))
        except exceptions.TimeoutException:
            # Not rendered yet: move further down the page and try again.
            driver.execute_script("window.scrollBy(0, 1000);")
            time.sleep(1)
            continue
        driver.execute_script("arguments[0].scrollIntoView();", element)
        time.sleep(1)
        return element
    return None


def _joined_link_texts(section, xpath):
    """Return the ', '-joined innerHTML of the links matching `xpath` under `section`, or pd.NA if there are none."""
    if section is None:
        return pd.NA
    try:
        # find_elements returns an empty (falsy) list until something matches,
        # so the wait keeps polling and times out when nothing ever appears.
        anchors = WebDriverWait(section, 2).until(lambda s: s.find_elements(By.XPATH, xpath))
    except exceptions.TimeoutException:
        return pd.NA
    texts = [anchor.get_attribute('innerHTML') for anchor in anchors]
    return ', '.join(texts) if texts else pd.NA


def get_shows_info(url):
    """Scrape a single show page and return its details as a flat list.

    A dedicated Chrome instance is started for the call and is always shut
    down before returning, including when scraping fails part-way.

    Parameters
    ----------
    url : str
        Address of the show's page (the notebook passes the links collected
        from the IMDb search results).

    Returns
    -------
    list
        ``[title, years, certification, runtime, rating, votes, emmys,
        creators, actors, genres, origins, languages, productions, url]``.
        Fields that cannot be found are ``pd.NA``; ``emmys`` is an int that
        defaults to 0; multi-valued fields are ``', '``-joined strings.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the title element does not appear within 2 seconds.
    """
    CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
    # Common prefix of the header metadata list (years / certification / runtime).
    HEADER_LIST = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul'

    service = Service(executable_path=CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service)

    try:
        # Open the show's page
        driver.get(url)

        # Show's title
        title = WebDriverWait(driver, 2).until(
            lambda d: d.find_element(By.CLASS_NAME, 'hero__primary-text')
        ).get_attribute('innerHTML')

        # Show's start-end years
        years = _find_or_na(driver, HEADER_LIST + '/li[2]/a')

        # Show's certification
        certification = _find_or_na(driver, HEADER_LIST + '/li[3]/a')

        # Show's runtime
        runtime = _find_or_na(driver, HEADER_LIST + '/li[4]')

        # Show's rating
        rating = _find_or_na(driver, '//span[@class="sc-d541859f-1 imUuxf"]', 'innerHTML')

        # Show's number of votes
        votes = _find_or_na(driver, '//div[@class="sc-d541859f-3 dwhNqC"]', 'innerHTML')

        # Show's emmy awards (0 when the awards line is missing or mentions no Emmy win)
        awards = _find_or_na(driver, '//li[@data-testid="award_information"]//a', 'innerHTML')
        won = re.search(r'Won (\d{1,2}) Primetime Emmy', awards) if isinstance(awards, str) else None
        emmys = int(won.group(1)) if won else 0

        creators = []
        actors = []
        try:
            # Scroll to Cast section of the page
            title_cast = driver.find_element(By.XPATH, '//section[@data-testid="title-cast"]')
            driver.execute_script("arguments[0].scrollIntoView();", title_cast)
            time.sleep(1)

            try:
                # Show's creators
                creator = title_cast.find_element(By.XPATH, './/ul[contains(@class, "ipc-metadata-list")]//ul')
                creators = [c.get_attribute('innerHTML') for c in creator.find_elements(By.TAG_NAME, 'a')]
            except exceptions.NoSuchElementException:
                pass

            # Show's actors
            for item in title_cast.find_elements(By.XPATH, './/div[@data-testid="title-cast-item"]'):
                try:
                    actors.append(
                        item.find_element(By.XPATH, './/a[@data-testid="title-cast-item__actor"]').get_attribute('innerHTML')
                    )
                except exceptions.NoSuchElementException:
                    pass
        except exceptions.NoSuchElementException:
            pass

        creators = ', '.join(creators) if creators else pd.NA
        actors = ', '.join(actors) if actors else pd.NA

        # Scroll to Storyline section of the page; None means it never appeared,
        # in which case genres are reported as missing instead of crashing.
        storyline = _scroll_to(driver, '//li[@data-testid="storyline-genres"]')

        # Show's genres
        genres = []
        if storyline is not None:
            time.sleep(1)
            for link in storyline.find_elements(By.TAG_NAME, 'a'):
                try:
                    genres.append(link.get_attribute('innerHTML'))
                except exceptions.StaleElementReferenceException:
                    # Report which show lost a genre link to a page re-render.
                    print(title)
        genres = ', '.join(genres) if genres else pd.NA

        # Scroll to Details section of the page (None when it never appeared)
        details = _scroll_to(driver, '//section[@data-testid="Details"]')

        # Show's countries of origin
        origins = _joined_link_texts(details, './/li[@data-testid="title-details-origin"]//ul//a')

        # Show's languages
        languages = _joined_link_texts(details, './/li[@data-testid="title-details-languages"]//ul//a')

        # Show's production companies
        productions = _joined_link_texts(details, './/li[@data-testid="title-details-companies"]//ul//a')

        return [title, years, certification, runtime, rating, votes, emmys, creators, actors, genres, origins, languages, productions, url]
    finally:
        # quit() ends the whole browser session and its chromedriver process;
        # close() would only close the window and leave the process running.
        driver.quit()

In [None]:
# Set path Selenium
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'
base = 'https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&num_votes=25000,&sort=user_rating,desc'

service = Service(executable_path=CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service)

# try/finally guarantees the browser is shut down even if loading the page,
# clicking through the results or extracting the links raises.
try:
    driver.get(base)

    # Continuously scroll down and hit the "see more" button to display all shows;
    # the loop ends once the button can no longer be found.
    while True:
        try:
            button = driver.find_element(By.XPATH, '//button[@class="ipc-btn ipc-btn--single-padding ipc-btn--center-align-content ipc-btn--default-height ipc-btn--core-base ipc-btn--theme-base ipc-btn--button-radius ipc-btn--on-accent2 ipc-text-button ipc-see-more__button"]')
            button.click()
        except exceptions.ElementClickInterceptedException:
            # Something covers the button: bring it into view and retry on the next pass.
            driver.execute_script("arguments[0].scrollIntoView();", button)
        except exceptions.NoSuchElementException:
            break

        time.sleep(1.5)

    # Get list of shows
    products = driver.find_elements(By.XPATH, value='//li[@class="ipc-metadata-list-summary-item"]')

    # Get links to shows (must happen while the browser session is still alive)
    links = get_links(products)
finally:
    # quit() ends the browser session and the chromedriver process, not just the window.
    driver.quit()

print(len(links))

# Write links to file
with open('links.txt', 'w') as f:
    f.write("\n".join(links))

1138


In [None]:
if __name__ == '__main__':
    # Use half of the available cores, but never fewer than one worker:
    # cpu_count() // 2 is 0 on a single-core machine and Pool(0) raises ValueError.
    coresNr = max(1, multiprocessing.cpu_count() // 2)

    # Scrape every show page in parallel; each worker opens its own browser.
    with Pool(coresNr) as p:
        results = p.map(get_shows_info, links)
        tvshows = list(results)

    # One row per show, columns in the order returned by get_shows_info.
    df = pd.DataFrame(tvshows, columns=['Title', 'Years', 'Certification', 'Runtime', 'Rating', 'Number of Votes', 'Emmys', 'Creators', 'Actors', 'Genres', 'Coutries of origins', 'Languages', 'Production companies', 'Link'])

In [27]:
# Persist the scraped table: keep the column header row, drop the positional index.
df.to_csv('completeTvshows.csv', header=True, index=False)