# 1. Imports

In [12]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import random
from langdetect import detect

# 2. Defining the movies list

In [2]:
# Create a function to scrape movies for a given year
def scrape_movies_for_year(year):
    # Scrape movies from a given year
    url = f'https://www.imdb.com/search/title/?title_type=feature&release_date={year}-01-01,{year}-12-31'
    response = requests.get(url)

    if response.status_code == 200:
        # parses the HTML content of the IMDb page, making it easier to extract data
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Finds all the movies using the heather 3 from HTML and are located at this class
        movie_links = soup.find_all('h3', class_='lister-item-header')

        good_movie_urls = []
        bad_movie_urls = []

        for link in movie_links:
            title = link.a.text # Extracts title
            movie_url = 'https://www.imdb.com' + link.a['href'] # Extracts URL
            rating_container = link.find_next('div', class_='ratings-imdb-rating') # Convert rating to a float value, so we can compare using a condition.

            if rating_container:
                rating = float(rating_container.strong.text)
                if rating >= 6.0:
                    good_movie_urls.append(movie_url)
                else:
                    bad_movie_urls.append(movie_url)

        # Determine the number of movies to collect based on the minimum number of "Bad Movies"
        num_movies_to_collect = min(len(good_movie_urls), len(bad_movie_urls))

        good_movie_urls = good_movie_urls[:num_movies_to_collect]
        bad_movie_urls = bad_movie_urls[:num_movies_to_collect]

        return good_movie_urls, bad_movie_urls

    else:
        print(f'Request for {year} failed. Status code: {response.status_code}')
        return [], []

# Scrape movies for 2022 and 2023
good_movies_2022, bad_movies_2022 = scrape_movies_for_year(2022)
good_movies_2023, bad_movies_2023 = scrape_movies_for_year(2023)

# Combine the results
all_good_movies = good_movies_2022 + good_movies_2023
all_bad_movies = bad_movies_2022 + bad_movies_2023

In [3]:
all_good_movies

['https://www.imdb.com/title/tt10638522/',
 'https://www.imdb.com/title/tt13560574/',
 'https://www.imdb.com/title/tt18925334/',
 'https://www.imdb.com/title/tt11245972/',
 'https://www.imdb.com/title/tt15791034/',
 'https://www.imdb.com/title/tt5537002/',
 'https://www.imdb.com/title/tt18394190/',
 'https://www.imdb.com/title/tt15671028/',
 'https://www.imdb.com/title/tt21807222/',
 'https://www.imdb.com/title/tt15257160/',
 'https://www.imdb.com/title/tt17351924/',
 'https://www.imdb.com/title/tt13274016/',
 'https://www.imdb.com/title/tt5535276/',
 'https://www.imdb.com/title/tt5648882/',
 'https://www.imdb.com/title/tt11858890/',
 'https://www.imdb.com/title/tt11426232/']

In [4]:
all_bad_movies

['https://www.imdb.com/title/tt10665342/',
 'https://www.imdb.com/title/tt21307994/',
 'https://www.imdb.com/title/tt15978956/',
 'https://www.imdb.com/title/tt2180339/',
 'https://www.imdb.com/title/tt11755740/',
 'https://www.imdb.com/title/tt4589218/',
 'https://www.imdb.com/title/tt12921446/',
 'https://www.imdb.com/title/tt11951276/',
 'https://www.imdb.com/title/tt13287846/',
 'https://www.imdb.com/title/tt18363072/',
 'https://www.imdb.com/title/tt26787296/',
 'https://www.imdb.com/title/tt10160976/',
 'https://www.imdb.com/title/tt3291150/',
 'https://www.imdb.com/title/tt19623240/',
 'https://www.imdb.com/title/tt15744298/',
 'https://www.imdb.com/title/tt3427252/']

In [5]:
# Define the review clause
review_clause = 'reviews/'

# Modify the URLs in the all_good_movies list
all_good_movies = [url + review_clause for url in all_good_movies]

# Modify the URLs in the all_bad_movies list
all_bad_movies = [url + review_clause for url in all_bad_movies]

In [None]:
# # Selecting 5 movies at random
# random_good_movies = random.sample(all_good_movies, 5)
# random_good_movies

In [None]:
# # Selecting 5 movies at random
# random_bad_movies = random.sample(all_bad_movies, 5)
# random_bad_movies

# 3. Saving the comments from each chosen movie

In [7]:
def scrape_imdb_reviews(url):
    comments = []
    dates = []

    # Set up the Safari WebDriver
    driver = webdriver.Safari()

    try:
        driver.get(url)

        while True:
            # Wait for the "Load More" button to become clickable
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "ipl-load-more__button"))
            )

            # Click the "Load More" button
            load_more_button.click()

            # Wait for new reviews to load
            time.sleep(2)

            # Extract comments and dates
            review_containers = driver.find_elements(By.CLASS_NAME, "text.show-more__control")
            date_containers = driver.find_elements(By.CLASS_NAME, "review-date")

            for review, date in zip(review_containers, date_containers):
                comments.append(review.text.strip())
                dates.append(date.text.strip())

    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()

    return comments, dates

In [9]:
%%time

# Initialize lists to store all comments and dates
all_comments = []
all_dates = []

# Iterate through all movie URLs and scrape reviews
for movie_url in all_good_movies:
    comments, dates = scrape_imdb_reviews(movie_url)
    all_comments.extend(comments)
    all_dates.extend(dates)

# Create a DataFrame with all the reviews
data = {
    "comments": all_comments,
    "date": all_dates,
}

df_scrapped_good = pd.DataFrame(data)

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

CPU times: user 3min 19s, sys: 24.4 s, total: 3min 43s
Wall time: 30min 19s


In [10]:
# Converting date columns format
df_scrapped_good_movies = df_scrapped_good.drop_duplicates()\
    .assign(date=lambda x: pd.to_datetime(x.date, format='%d %B %Y'))

print('Amount of comments on movies with average rate higher than 6: ', df_scrapped_good_movies.shape[0])

Amount of comments on movies with average rate higher than 6:  7229


In [11]:
df_scrapped_good_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7229 entries, 0 to 138314
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   comments  7229 non-null   object        
 1   date      7229 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 169.4+ KB


In [19]:
%%time

# Initialize lists to store all comments and dates
all_comments = []
all_dates = []

# Iterate through all movie URLs and scrape reviews
for movie_url in all_bad_movies:
    comments, dates = scrape_imdb_reviews(movie_url)
    all_comments.extend(comments)
    all_dates.extend(dates)

# Create a DataFrame with all the reviews
data = {
    "comments": all_comments,
    "date": all_dates,
}

df_scrapped_bad = pd.DataFrame(data)

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

An error occurred: Message: 

CPU times: user 3min 50s, sys: 26.7 s, total: 4min 16s
Wall time: 38min 5s


In [20]:
df_scrapped_bad_movies = df_scrapped_bad.drop_duplicates()\
    .assign(date=lambda x: pd.to_datetime(x.date, format='%d %B %Y'))

print('Amount of comments on movies with average rate lower than 6: ', df_scrapped_bad_movies.shape[0])

Amount of comments on movies with average rate lower than 6:  7259


In [21]:
df_scrapped_bad_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7259 entries, 0 to 160563
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   comments  7259 non-null   object        
 1   date      7259 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 170.1+ KB


# 4. Detecting comment languages

In [13]:
def detect_language(comment):
    """Function to detect the language of a comment
    """
    try:
        return detect(comment)
    except:
        return 'unknown'  # Handle any errors during language detection

## 4.1. Good movies

In [14]:
# Apply the language detection function to the DataFrame
df_scrapped_good_movies['language'] = df_scrapped_good_movies['comments'].apply(detect_language)

In [17]:
df_scrapped_good_movies.language.value_counts()

language
en    7224
id       1
so       1
sl       1
pt       1
fr       1
Name: count, dtype: int64

In [18]:
df_scrapped_good_movies.query('language != "en"')

Unnamed: 0,comments,date,language
59217,"Bad acting, bad writing... its bad bbbbbbbjsus...",2022-01-14,id
64368,If you're a horror fan your atleast going to l...,2022-01-15,so
69917,Ma pis pe filmu vostru da mi ba banii inapoi 2...,2022-01-16,sl
85524,Panico 5 é um filme maravilhoso.Esse filme é i...,2022-01-28,pt
131060,MAGGIE BETTS - THE BURIAL - AMAZON PRIME - 202...,2023-10-31,fr


## 4.2. Bad movies

In [22]:
# Apply the language detection function to the DataFrame
df_scrapped_bad_movies['language'] = df_scrapped_bad_movies['comments'].apply(detect_language)

In [23]:
df_scrapped_bad_movies.language.value_counts()

language
en    7256
pt       2
sl       1
Name: count, dtype: int64

In [24]:
df_scrapped_bad_movies.query('language != "en"')

Unnamed: 0,comments,date,language
63859,Not goood.move very not good\njust move.garbag...,2022-05-28,sl
63863,De positivo neste filme apenas Ana de Armas e ...,2022-03-25,pt
92901,Violento e preciso...um verdadeiro massacre.Um...,2022-02-19,pt


# 5. Data export

In [25]:
# Exporting both dfs to csv
df_scrapped_good_movies.to_csv('Data/good_movie_reviews.csv', index=False)
df_scrapped_bad_movies.to_csv('Data/bad_movie_reviews.csv', index=False)