In [46]:
def scroll_down():
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Scroll to bottom
        time.sleep(2)  # Wait for new content to load
        
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # If height hasn't changed, stop scrolling
            break
        last_height = new_height

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Setup WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Open the target website
driver.get("https://www.imdb.com/")
driver.maximize_window()


try:
    driver.execute_script("window.scrollBy(0, 600);") 
    time.sleep(3)
    driver.execute_script("window.scrollBy(0, 600);") 
    time.sleep(3)
    driver.execute_script("window.scrollBy(0, 600);") 
    time.sleep(3)
    driver.execute_script("window.scrollBy(0, 600);") 
    # Wait until movie elements load
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, "//a[@class='ipc-poster-card__title ipc-poster-card__title--clamp-2 ipc-poster-card__title--clickable']/span"))
    )

    # Extract all movie names
    movie_elements = driver.find_elements(By.XPATH, "//a[@class='ipc-poster-card__title ipc-poster-card__title--clamp-2 ipc-poster-card__title--clickable']/span")
    movies = [movie.text for movie in movie_elements if movie.text.strip()]

    # Debugging: Print page source if empty
    if not movies:
        print("No movies found. Check the page source:\n")
        print(driver.page_source[:1000])  # Print first 1000 characters of page source

except Exception as e:
    print("Error:", e)
    movies = []

# Close the browser
driver.quit()

# Print extracted movie names
print("Extracted Movie Names:", movies)

Extracted Movie Names: ['1. Adolescence', '2. The White Lotus', '3. Severance', '4. The Electric State', '5. Invincible', '6. Snow White', '7. Daredevil: Born Again', '8. Reacher', '9. The Wheel of Time', '10. Anora', 'Baida', 'Rekhachithram', 'Adolescence', 'Dragon', 'The Diplomat', 'Chhaava', 'Officer on Duty', 'Solo Leveling', 'Artiste', 'Am Ah', 'Crazxy', 'Anora', 'Sky Force', 'Ponman', 'Daredevil: Born Again', 'Dupahiya', 'Gandhi Tatha Chettu', 'Interstellar', 'Attack on Titan', 'Mandya', 'Severance', 'Breaking Bad', 'Power of Paanch', 'Dragon Ball Daima', 'Niram Marum Ulagil', 'Solo Leveling: ReAwakening', 'Kaushaljis vs Kaushal', 'Halka', 'Invincible', 'Dabba Cartel']


In [8]:
from imdb import IMDb

def get_imdb_review_link(movie_name):
    ia = IMDb()
    movies = ia.search_movie(movie_name)

    if movies:
        movie_id = movies[0].movieID
        review_url = f"https://www.imdb.com/title/tt{movie_id}/reviews"
        return review_url
    else:
        return "Movie not found on IMDb."
review_link = []
for movie_name in movies:
    review_link.append(get_imdb_review_link(movie_name))
print("IMDb Review Page:", review_link)

IMDb Review Page: ['https://www.imdb.com/title/tt31806037/reviews', 'https://www.imdb.com/title/tt13406094/reviews', 'https://www.imdb.com/title/tt11280740/reviews', 'https://www.imdb.com/title/tt7766378/reviews', 'https://www.imdb.com/title/tt6741278/reviews', 'https://www.imdb.com/title/tt6208148/reviews', 'https://www.imdb.com/title/tt18923754/reviews', 'https://www.imdb.com/title/tt9288030/reviews', 'https://www.imdb.com/title/tt7462410/reviews', 'https://www.imdb.com/title/tt28607951/reviews', 'https://www.imdb.com/title/tt35217961/reviews', 'https://www.imdb.com/title/tt32284154/reviews', 'https://www.imdb.com/title/tt31806037/reviews', 'https://www.imdb.com/title/tt32080876/reviews', 'https://www.imdb.com/title/tt26229612/reviews', 'https://www.imdb.com/title/tt27922706/reviews', 'https://www.imdb.com/title/tt34388152/reviews', 'https://www.imdb.com/title/tt21209876/reviews', 'https://www.imdb.com/title/tt36106718/reviews', 'https://www.imdb.com/title/tt34852931/reviews', 'https

In [10]:
import csv

csv_filename = "imdb_review_links.csv"

# Save to CSV file
with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Review Links"])  # Header
    for link in review_link:
        writer.writerow([link])  # Write each link as a row

print(f"CSV file '{csv_filename}' has been saved successfully!")


CSV file 'imdb_review_links.csv' has been saved successfully!


In [57]:
csv_filename = "imdb_review_links.csv"
review_links = []

with open(csv_filename, mode="r", encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader)  # Skip header
    for row in reader:
        review_links.append(row[0])

In [59]:
def scroll_until_button_visible(xpath):
    while True:
        try:
            # Check if the button is visible
            button = WebDriverWait(driver, 2).until(
                EC.visibility_of_element_located((By.XPATH, xpath))
            )
            print("Button is now visible, stopping scroll.")
            break  # Stop scrolling once button is found
        except:
            # If button is not found, scroll down
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

In [61]:
def extract_reviews_and_ratings():
    articles = driver.find_elements(By.XPATH, "//section[1]/article")  # Get all review articles
    reviews_data = []

    for article in articles:
        try:
            # Extract review title
            h3_element = article.find_element(By.XPATH, ".//div[1]/div[1]/div[2]/div/a/h3")
            review_title = h3_element.text.strip()
        except:
            review_title = "No Title"

        try:
            # Extract rating (if exists)
            rating_element = article.find_element(By.XPATH, ".//div[1]/div[1]/div[1]/span/span[1]")
            rating = rating_element.text.strip()
        except:
            rating = "No Rating"

        # Append extracted data
        reviews_data.append({"Review Title": review_title, "Rating": rating})

    return reviews_data

In [75]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

# Setup WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# List to store all review data
all_reviews = []

for link in review_links:
    driver.get(link)
    driver.maximize_window()

    # button_xpath = "/html/body/div[2]/main/div/section/div/section/div/div[1]/section[1]/div[3]/div/span[2]/button"
    # scroll_until_button_visible(button_xpath)
    time.sleep(5)
    input("Click enter for confirmation")
    time.sleep(3)
    scroll_down()

    # Extract reviews and append to list
    reviews = extract_reviews_and_ratings()
    all_reviews.extend(reviews)  # Append extracted reviews to the list

# Convert list to DataFrame
df = pd.DataFrame(all_reviews, columns=["Review Title", "Rating"])

# Save to CSV
df.to_csv("imdb_reviews.csv", index=False)
print(df)

# Close WebDriver
driver.quit()


Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 
Click enter for confirmation 


                                        Review Title     Rating
0     I liked it for the same reason people hated it          8
1                          A Film Made By Its Ending          9
2                                           No Title  No Rating
3                      Pretty Woman meets Uncut Gems          9
4                            A review from a Russian          8
...                                              ...        ...
4802         Terrible storyline, factually incorrect          1
4803                        Bad Bad Bad. Don't watch          1
4804         Dabba Cartel - A Thrilling Binge-Watch!          4
4805                             Not what I expected          4
4806                                         Concept         10

[4807 rows x 2 columns]


In [79]:
import pandas as pd

df1 = pd.read_csv("imdb_reviews1.csv")
df2 = pd.read_csv("imdb_reviews.csv")

merged_df = pd.concat([df1, df2], ignore_index=True)  # Merge and reset index
merged_df.to_csv("imdb_final.csv", index=False)  # Save the merged file

In [101]:
df = pd.read_csv("imdb_final.csv")

In [103]:
df.shape

(11019, 2)

In [105]:
df['Rating'].replace('No Rating', pd.NA, inplace=True)
df.dropna(subset=['Rating'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].replace('No Rating', pd.NA, inplace=True)


In [111]:
df['Rating'] = df['Rating'].astype(int)
df['Sentiment'] = df['Rating'].apply(lambda x: 'Positive' if x >= 7 else 
                                     'Neutral' if 4 <= x <= 6 else 
                                     'Negative')

In [117]:
df['Sentiment'].value_counts()

Sentiment
Positive    7756
Neutral     1617
Negative    1353
Name: count, dtype: int64

In [119]:
df.to_csv("imdb_reviews_with_classes.csv", index=False)