# 1. Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random

# 2. Defining the movies list

In [2]:
# Create a function to scrape movies for a given year
def scrape_movies_for_year(year):
    # Scrape movies from a given year
    url = f'https://www.imdb.com/search/title/?title_type=feature&release_date={year}-01-01,{year}-12-31'
    response = requests.get(url)

    if response.status_code == 200:
        # parses the HTML content of the IMDb page, making it easier to extract data
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Finds all the movies using the heather 3 from HTML and are located at this class
        movie_links = soup.find_all('h3', class_='lister-item-header')

        good_movie_urls = []
        bad_movie_urls = []

        for link in movie_links:
            title = link.a.text # Extracts title
            movie_url = 'https://www.imdb.com' + link.a['href'] # Extracts URL
            rating_container = link.find_next('div', class_='ratings-imdb-rating') # Convert rating to a float value, so we can compare using a condition.

            if rating_container:
                rating = float(rating_container.strong.text)
                if rating >= 6.0:
                    good_movie_urls.append(movie_url)
                else:
                    bad_movie_urls.append(movie_url)

        # Determine the number of movies to collect based on the minimum number of "Bad Movies"
        num_movies_to_collect = min(len(good_movie_urls), len(bad_movie_urls))

        good_movie_urls = good_movie_urls[:num_movies_to_collect]
        bad_movie_urls = bad_movie_urls[:num_movies_to_collect]

        return good_movie_urls, bad_movie_urls

    else:
        print(f'Request for {year} failed. Status code: {response.status_code}')
        return [], []

# Scrape movies for 2022 and 2023
good_movies_2022, bad_movies_2022 = scrape_movies_for_year(2022)
good_movies_2023, bad_movies_2023 = scrape_movies_for_year(2023)

# Combine the results
all_good_movies = good_movies_2022 + good_movies_2023
all_bad_movies = bad_movies_2022 + bad_movies_2023

In [3]:
all_good_movies

['https://www.imdb.com/title/tt10638522/',
 'https://www.imdb.com/title/tt13560574/',
 'https://www.imdb.com/title/tt18925334/',
 'https://www.imdb.com/title/tt11245972/',
 'https://www.imdb.com/title/tt15791034/',
 'https://www.imdb.com/title/tt5537002/',
 'https://www.imdb.com/title/tt18394190/',
 'https://www.imdb.com/title/tt15671028/',
 'https://www.imdb.com/title/tt21807222/',
 'https://www.imdb.com/title/tt15257160/',
 'https://www.imdb.com/title/tt17351924/',
 'https://www.imdb.com/title/tt13274016/',
 'https://www.imdb.com/title/tt5535276/',
 'https://www.imdb.com/title/tt5648882/',
 'https://www.imdb.com/title/tt11858890/',
 'https://www.imdb.com/title/tt11426232/']

In [4]:
all_bad_movies

['https://www.imdb.com/title/tt10665342/',
 'https://www.imdb.com/title/tt21307994/',
 'https://www.imdb.com/title/tt15978956/',
 'https://www.imdb.com/title/tt2180339/',
 'https://www.imdb.com/title/tt11755740/',
 'https://www.imdb.com/title/tt4589218/',
 'https://www.imdb.com/title/tt12921446/',
 'https://www.imdb.com/title/tt11951276/',
 'https://www.imdb.com/title/tt13287846/',
 'https://www.imdb.com/title/tt18363072/',
 'https://www.imdb.com/title/tt26787296/',
 'https://www.imdb.com/title/tt10160976/',
 'https://www.imdb.com/title/tt3291150/',
 'https://www.imdb.com/title/tt19623240/',
 'https://www.imdb.com/title/tt15744298/',
 'https://www.imdb.com/title/tt3427252/']

In [7]:
# Define the review clause
review_clause = 'reviews/'

# Modify the URLs in the all_good_movies list
all_good_movies = [url + review_clause for url in all_good_movies]

# Modify the URLs in the all_bad_movies list
all_bad_movies = [url + review_clause for url in all_bad_movies]

In [18]:
# Selecting 5 movies at random
random_good_movies = random.sample(all_good_movies, 5)
random_good_movies

['https://www.imdb.com/title/tt15257160/reviews/',
 'https://www.imdb.com/title/tt13560574/reviews/',
 'https://www.imdb.com/title/tt15791034/reviews/',
 'https://www.imdb.com/title/tt10638522/reviews/',
 'https://www.imdb.com/title/tt21807222/reviews/']

In [13]:
# Selecting 5 movies at random
random_bad_movies = random.sample(all_bad_movies, 5)
random_bad_movies

['https://www.imdb.com/title/tt26787296/reviews/',
 'https://www.imdb.com/title/tt10160976/reviews/',
 'https://www.imdb.com/title/tt21307994/reviews/',
 'https://www.imdb.com/title/tt4589218/reviews/',
 'https://www.imdb.com/title/tt11755740/reviews/']

# 3. Saving the comments from each chosen movie

In [20]:
def scrape_imdb_reviews(url):
    comments = []
    dates = []

    # Set up the Safari WebDriver
    driver = webdriver.Safari()

    try:
        driver.get(url)

        while True:
            # Wait for the "Load More" button to become clickable
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "ipl-load-more__button"))
            )

            # Click the "Load More" button
            load_more_button.click()

            # Wait for new reviews to load
            time.sleep(2)

            # Extract comments and dates
            review_containers = driver.find_elements(By.CLASS_NAME, "text.show-more__control")
            date_containers = driver.find_elements(By.CLASS_NAME, "review-date")

            for review, date in zip(review_containers, date_containers):
                comments.append(review.text.strip())
                dates.append(date.text.strip())

    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()

    return comments, dates

In [21]:
# Initialize lists to store all comments and dates
all_comments = []
all_dates = []

# Iterate through all movie URLs and scrape reviews
for movie_url in random_good_movies:
    comments, dates = scrape_imdb_reviews(movie_url)
    all_comments.extend(comments)
    all_dates.extend(dates)

# Create a DataFrame with all the reviews
data = {
    "comments": all_comments,
    "date": all_dates,
}

df_scrapped_good = pd.DataFrame(data)

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 



In [22]:
# Initialize lists to store all comments and dates
all_comments = []
all_dates = []

# Iterate through all movie URLs and scrape reviews
for movie_url in random_bad_movies:
    comments, dates = scrape_imdb_reviews(movie_url)
    all_comments.extend(comments)
    all_dates.extend(dates)

# Create a DataFrame with all the reviews
data = {
    "comments": all_comments,
    "date": all_dates,
}

df_scrapped_bad = pd.DataFrame(data)

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 



In [24]:
# Converting date columns format
df_scrapped_good_movies = df_scrapped_good.drop_duplicates()\
    .assign(date=lambda x: pd.to_datetime(x.date, format='%d %B %Y'))

print('Amount of comments on movies with average rate higher than 6: ', df_scrapped_good_movies.shape[0])

df_scrapped_bad_movies = df_scrapped_bad.drop_duplicates()\
    .assign(date=lambda x: pd.to_datetime(x.date, format='%d %B %Y'))

print('Amount of comments on movies with average rate lower than 6: ', df_scrapped_bad_movies.shape[0])

Amount of comments on movies with average rate higher than 6:  2950
Amount of comments on movies with average rate lower than 6:  3570


In [27]:
df_scrapped_good_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2950 entries, 0 to 52373
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   comments  2950 non-null   object        
 1   date      2950 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 69.1+ KB


In [28]:
df_scrapped_bad_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3570 entries, 0 to 87330
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   comments  3570 non-null   object        
 1   date      3570 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 83.7+ KB


In [29]:
df_scrapped_good_movies.date.min(), df_scrapped_good_movies.date.max()

(Timestamp('2022-03-16 00:00:00'), Timestamp('2023-11-02 00:00:00'))

In [30]:
df_scrapped_bad_movies.date.min(), df_scrapped_bad_movies.date.max()

(Timestamp('2022-02-18 00:00:00'), Timestamp('2023-11-02 00:00:00'))

# 4. Data export

In [34]:
# Exporting both dfs to csv
df_scrapped_good_movies.to_csv('Data/good_movie_reviews.csv', index=False)
df_scrapped_bad_movies.to_csv('Data/bad_movie_reviews.csv', index=False)