In [1]:
import time
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

base_url = 'https://www.imdb.com'
chrome_driver_path = r"D:\NMKHDL\DATA-SCIENCE\data-collection\chromedriver.exe"

In [5]:
def load_all_shows(driver, pause_time = 1, max_failures = 3):
    scroll_count = 0
    fail_count = 0
    while fail_count < max_failures:
        try:
            button = driver.find_element(By.XPATH, '//button[contains(@class, "ipc-see-more__button")]')
            driver.execute_script("arguments[0].click();", button)
            scroll_count += 1
            time.sleep(pause_time)
            fail_count = 0
        except exceptions.NoSuchElementException:
            fail_count += 1
            time.sleep(pause_time)
        except exceptions.ElementClickInterceptedException:
            print("Scroll or element interaction issue. Retrying...")
            time.sleep(pause_time)
    return scroll_count

def get_show_links(driver):
    try:
        links = driver.find_elements(By.XPATH, '//li[contains(@class, "ipc-metadata-list-summary-item")]')
        return [link.find_element(By.TAG_NAME, 'a').get_attribute('href') for link in links]
    except exceptions.NoSuchElementException:
        print("No show links found.")
        return []
    
def clean_links(links):
    return [link.split('?')[0] for link in links]

In [6]:
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service)

url = f'{base_url}/search/title/?title_type=tv_series,tv_miniseries&num_votes=5000,&sort=user_rating,desc'
driver.get(url)
total_scrolls = load_all_shows(driver)
print("Total scrolls:", total_scrolls)

links = get_show_links(driver)
clean_links_list = clean_links(links)
print("Total links:", len(clean_links_list))

filename = 'imdb_links.txt'
with open(filename, 'w') as f:
    f.write("\n".join(clean_links_list))
print("Links saved to", filename)

driver.quit()

Total scrolls: 80
Total links: 3980
Links saved to imdb_links.txt


In [2]:
import re
import csv
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

In [11]:
url = 'https://www.imdb.com/title/tt0903747/'

def load_user_agents(filename):
    with open(filename, 'r') as file:
        return [line.strip() for line in file.readlines()]
    
user_agents = load_user_agents('user_agents.txt')

response = requests.get(url, headers={'User-Agent': random.choice(user_agents)})
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

title_div = soup.select_one('.sc-70a366cc-0.bxYZmb')
title = title_div.find('h1', {'data-testid': 'hero__pageTitle'}).get_text(strip=True)
original_title = title_div.get_text(strip=True)[16:] if title_div.find('div', class_='sc-ec65ba05-1 fUCCIx') else None
print("Title:", title)
print("Original Title:", original_title)

details_list = title_div.find('ul', class_='ipc-inline-list ipc-inline-list--show-dividers sc-ec65ba05-2 joVhBE baseAlt')
details_list = details_list.find_all('li')
series_type = details_list[0].get_text(strip=True) if len(details_list) > 0 else None
release_year = details_list[1].get_text(strip=True) if len(details_list) > 1 else None
certificate = details_list[2].get_text(strip=True) if len(details_list) > 2 else None
runtime = details_list[3].get_text(strip=True) if len(details_list) > 3 else None
print(f"Series Type: {series_type}")
print(f"Release Year: {release_year}")
print(f"Certificate: {certificate}")
print(f"Runtime: {runtime}")

rating_div = soup.find('div', class_='sc-3a4309f8-1 dOjKRs')
rating = rating_div.find('span', class_='sc-d541859f-1 imUuxf').get_text(strip=True)
num_ratings = rating_div.find('div', class_='sc-d541859f-3 dwhNqC').get_text(strip=True)
popularity_score = rating_div.find('div', class_='sc-39d285cf-1 dxqvqi')
popularity_score = popularity_score.get_text(strip=True) if popularity_score else None
popularity_delta = rating_div.find('div', class_='sc-39d285cf-2 bHiRqw')
popularity_delta = popularity_delta.get_text(strip=True) if popularity_delta else None
if rating_div.find('svg', class_='ipc-icon--arrow-drop-up'):
    popularity_delta = f"+{popularity_delta}"
elif rating_div.find('svg', class_='ipc-icon--arrow-drop-down'):
    popularity_delta = f"-{popularity_delta}"
print(f"IMDb Rating: {rating}")
print(f"Number of Ratings: {num_ratings}")
print(f"Popularity: {popularity_score}")
print(f"Popularity Delta: {popularity_delta}")

description_div = soup.select_one('.sc-9a2a0028-10.iUfJXd')
genres_text = description_div.find('div', class_='ipc-chip-list__scroller')
genres = [genre.get_text(strip=True) for genre in genres_text.find_all('span')]
creators_text = description_div.find('div', class_='ipc-metadata-list-item__content-container')
creators = [creator.get_text(strip=True) for creator in creators_text.find_all('a')]
stars_text = description_div.find_all('div', class_='ipc-metadata-list-item__content-container')[1]
stars = [star.get_text(strip=True) for star in stars_text.find_all('a')]
print("Genres:", genres)
print("Creators:", creators)
print("Stars:", stars)

reviews_div = soup.select_one('.sc-9a2a0028-11.ketnsO')
reviews_list = reviews_div.find('ul', class_='ipc-inline-list sc-b782214c-0 bllRjU baseAlt')
user_reviews = reviews_list.find_all('li')[0].find('span', class_='score').get_text(strip=True)
critic_reviews = reviews_list.find_all('li')[1].find('span', class_='score').get_text(strip=True)
print("User Reviews:", user_reviews)
print("Critic Reviews:", critic_reviews)

# awards = soup.select_one('.sc-710dd9d1-0.evrEcj')
tmp= soup.find('div', class_='ipc-page-content-container')
awards = tmp.find('section', class_='ipc-page-section')
print(awards)

# awards = soup.select_one('.sc-aa5ab255-0.evrEcj').find('span').get_text(strip=True)
# print("Awards:", awards)

# seasons = soup.select_one('.sc-8592ce7f-4.xDejQ').find('label').get_text(strip=True)
# print("Seasons:", seasons)

# episodes = soup.select_one('.ipc-title.ipc-title--base').find_all('span')[1].get_text(strip=True)
# print("Episodes:", episodes)
# details_div = soup.find('div', {'data-testid': 'title-details-section'}).find_all('li')
# for detail in details_div:
#     tmp = detail.find('li', class_='ipc-inline-list__item')
#     print(tmp)

# df = pd.DataFrame(tvshows, columns=['Title', 'Years', 'Certification', 'Runtime', 'Rating', 'Number of Votes', 'Emmys', 'Creators', 'Actors', 'Genres', 'Coutries of origins', 'Languages', 'Production companies', 'Link'])



Title: Tập Làm Người Xấu
Original Title: uOriginal title: Breaking BadTV Series2008–2013C1845m
Series Type: TV Series
Release Year: 2008–2013
Certificate: C18
Runtime: 45m
IMDb Rating: 9.5
Number of Ratings: 2.3M
Popularity: 25
Popularity Delta: -2
Genres: ['Desert Adventure', 'Drug Crime', 'Epic', 'Psychological Drama', 'Psychological Thriller', 'Tragedy', 'Crime', 'Drama', 'Thriller']
Creators: ['Vince Gilligan']
Stars: ['Bryan Cranston', 'Aaron Paul', 'Anna Gunn']
User Reviews: 5.1K
Critic Reviews: 176
None
