## Import

In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from imdb import IMDb

## Example of bs4 Usage

In [2]:
# IMDb chart page (top 250 movies) used for the BeautifulSoup walkthrough below
url = "https://www.imdb.com/chart/top/"

In [3]:
# User-Agent header passed to the `requests.get` calls that send `headers=HEADERS`
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
    )
}

### Request the page

In [4]:
# A timeout keeps the cell from hanging indefinitely if the server never answers
page = requests.get(url, headers=HEADERS, timeout=10)
# Echo the Response object so the HTTP status is visible in the cell output
page

<Response [200]>

### Getting the page content

In [5]:
# Parse the downloaded response body with the "html.parser" backend
soup = BeautifulSoup(page.content, "html.parser")

I'm trying to scrape the title and rank of the top 250 movies from the IMDb chart.

In the page's HTML, each title sits inside an `<h3>` element with the class `ipc-title__text`.

In [6]:
# Collect every <h3 class="ipc-title__text"> element on the chart page.
# Note: the first match is the page heading ("IMDb Charts"), not a movie;
# the ranked titles ("1. The Shawshank Redemption", ...) follow it.
scraped_movies = soup.find_all('h3', class_='ipc-title__text')
scraped_movies

[<h3 class="ipc-title__text">IMDb Charts</h3>,
 <h3 class="ipc-title__text">1. The Shawshank Redemption</h3>,
 <h3 class="ipc-title__text">2. The Godfather</h3>,
 <h3 class="ipc-title__text">3. The Dark Knight</h3>,
 <h3 class="ipc-title__text">4. The Godfather: Part II</h3>,
 <h3 class="ipc-title__text">5. 12 Angry Men</h3>,
 <h3 class="ipc-title__text">6. Schindler's List</h3>,
 <h3 class="ipc-title__text">7. The Lord of the Rings: The Return of the King</h3>,
 <h3 class="ipc-title__text">8. Pulp Fiction</h3>,
 <h3 class="ipc-title__text">9. The Lord of the Rings: The Fellowship of the Ring</h3>,
 <h3 class="ipc-title__text">10. The Good, the Bad and the Ugly</h3>,
 <h3 class="ipc-title__text">11. Forrest Gump</h3>,
 <h3 class="ipc-title__text">12. Dune: Part Two</h3>,
 <h3 class="ipc-title__text">13. The Lord of the Rings: The Two Towers</h3>,
 <h3 class="ipc-title__text">14. Fight Club</h3>,
 <h3 class="ipc-title__text">15. Inception</h3>,
 <h3 class="ipc-title__text">16. Star Wars

## Get IMDB movie ID based on movie title

In [7]:
def get_imdb_id(movie_title):
    """Look up the IMDb ID of the first search match for a movie title.

    Args:
        movie_title: Title to search for, e.g. "Avengers: Endgame".

    Returns:
        The ``movieID`` of the first search result (for the example title in
        this notebook that is 4154796, i.e. without a "tt" prefix), or None
        when the title is empty/blank or the search returns no results.
    """
    # Nothing to search for: skip the lookup instead of querying with an empty title
    if not movie_title or not str(movie_title).strip():
        return None

    # Create an IMDb client and search for the movie by title
    ia = IMDb()
    results = ia.search_movie(movie_title)

    # Only the first hit is used; an empty result list means "not found"
    return results[0].movieID if results else None

In [8]:
# Example: resolve a title to its IMDb ID and report the outcome
movie_title = "Avengers: Endgame"
imdb_id = get_imdb_id(movie_title)
print(
    f"IMDb ID for '{movie_title}': {imdb_id}"
    if imdb_id
    else f"No results found for '{movie_title}'"
)

IMDb ID for 'Avengers: Endgame': 4154796


## Scrape 25 Reviews sorted by most Total Votes

In [9]:
def scrape_imdb_reviews(imdb_id, headers=None, timeout=10):
    """Scrape review texts from a movie's IMDb reviews page, sorted by total votes.

    Args:
        imdb_id: IMDb title ID, with or without the "tt" prefix
            (e.g. 4154796, "4154796" or "tt4154796").
        headers: Optional dict of HTTP headers for the request. The default
            (None) sends the request exactly as ``requests`` would on its own.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list with the text of every ``div`` of class
        "text show-more__control" found on the page; empty when the page
        contains no such elements.
    """
    # Accept both ID styles: bare digits and the "tt"-prefixed form.
    # Unconditionally prepending "tt" would turn "tt4154796" into "tttt4154796".
    title_id = str(imdb_id)
    if not title_id.startswith("tt"):
        title_id = f"tt{title_id}"

    # Construct the URL for the movie's reviews page
    url = f"https://www.imdb.com/title/{title_id}/reviews?sort=totalVotes"

    # Bound the wait so a stalled connection cannot hang the notebook cell
    response = requests.get(url, headers=headers, timeout=timeout)

    # Parse the page and pull out the text of each review element
    soup = BeautifulSoup(response.content, "html.parser")
    reviews = soup.find_all("div", class_="text show-more__control")
    return [review.get_text() for review in reviews]

In [10]:
# Example: look up the movie, fetch its reviews and show the first five
imdb_id = get_imdb_id("Avengers: Endgame")
reviews = scrape_imdb_reviews(imdb_id)

# Each review is printed as a numbered header, the text, then a blank line
for i, review in enumerate(reviews[:5], start=1):
    print(f"Review {i}:", review, "", sep="\n")

Review 1:
I have to say, my first reaction walking out of the cinema was that it was great. Probably an 8/10. You know there's so much fan service in this movie and I particularly loved the "I can do this all day" CA line from one CA to another. It was almost Toy Story 2, enlightened Buzz to naive Buzz banter.I loved Clever Hulk, found Thor hilarious, though a bit annoying at times, and loved the references to past movies. Cap swinging Mjollnir around was beautiful.The deaths in this movie were also pretty surprising but I agree with some of the more balanced reviews in here that it was bizarre that so much time was spent on Hawkeye - does anyone really care about him? Not really.Killing off Black Widow was a surprising touch, but I, like many others, probably felt more of an emotional reaction to Banner's relationship with her and not Hawkeye (because no one actually cares about him). Renner, as an actor, just doesn't cut it much, unfortunately.The real problem with End Game, is the t

## Scrape the movie basic information

In [54]:
def _text_or_none(element):
    """Return the stripped text of a parsed element, or None when it was not found."""
    return element.text.strip() if element else None


def scrape_movie_details(imdb_id, timeout=10):
    """Scrape basic facts about a movie from its IMDb title and plot-summary pages.

    Args:
        imdb_id: IMDb title ID, with or without the "tt" prefix
            (e.g. "tt4154796" or "4154796").
        timeout: Seconds to wait for each response before ``requests`` gives up.

    Returns:
        A dict with the keys 'title', 'summaries', 'rating', 'genres',
        'release_year' and 'runtime'. 'genres' is a comma-separated string
        (possibly empty); every other value is the stripped text of the
        matching element, or None when that element was not found.
    """
    # Accept both ID styles: a bare numeric ID would otherwise produce a URL
    # without the "tt" prefix that the title pages use.
    title_id = str(imdb_id)
    if not title_id.startswith("tt"):
        title_id = f"tt{title_id}"

    # Construct the URLs for the movie's main page and its plot-summary page
    url = f"https://www.imdb.com/title/{title_id}/"
    summaries_url = f"https://www.imdb.com/title/{title_id}/plotsummary/"

    # Fetch both pages; the timeout keeps a stalled connection from hanging the cell
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response_sum = requests.get(summaries_url, headers=HEADERS, timeout=timeout)

    # Parse the HTML content of both pages
    soup = BeautifulSoup(response.content, "html.parser")
    soup_sum = BeautifulSoup(response_sum.content, "html.parser")

    details = {}

    # Get title
    details['title'] = _text_or_none(soup.find('span', class_='hero__primary-text'))

    # Get summaries (only the first summary block on the plot-summary page is used)
    details['summaries'] = _text_or_none(
        soup_sum.find('div', class_='ipc-html-content-inner-div')
    )

    # Get overall rating. The class below looks build-generated and may change,
    # so fall back to a data-testid hook when it no longer matches.
    # NOTE(review): confirm the fallback selector against the live page.
    rating = soup.find('span', class_='sc-bde20123-1 cMEQkK')
    if rating is None:
        rating = soup.select_one(
            '[data-testid="hero-rating-bar__aggregate-rating__score"] span'
        )
    details['rating'] = _text_or_none(rating)

    # Get genres. The last chip is deliberately dropped
    # (presumably it is not a genre - TODO confirm against the page).
    genres = soup.find_all("span", class_='ipc-chip__text')
    details['genres'] = ', '.join(genre.text for genre in genres[:-1])

    # Get release year
    release_year = soup.select_one(
        'a[href*="releaseinfo"]:not([aria-label]):not([class*="icon-link"])'
    )
    details['release_year'] = _text_or_none(release_year)

    # Get runtime. A strict pattern ("3h 1m", "2h" or "45m") is used rather than
    # just looking for the letters 'h' and 'm', so whole-hour and sub-hour
    # runtimes are found and unrelated list items are not matched.
    runtime_pattern = re.compile(r"\d+h(?:\s*\d+m)?|\d+m")
    runtime = soup.find(
        'li',
        string=lambda text: bool(text and runtime_pattern.fullmatch(text.strip())),
    )
    details['runtime'] = _text_or_none(runtime)

    return details

In [56]:
# Example: scrape one title and print each field on its own line
imdb_id = "tt4154796"
movie_details = scrape_movie_details(imdb_id)

print("Movie Details:")
# (printed label, key in the details dict), in display order
for label, key in (
    ("Title", "title"),
    ("Summaries", "summaries"),
    ("Rating", "rating"),
    ("Genres", "genres"),
    ("Release Year", "release_year"),
    ("Runtime", "runtime"),
):
    print(f"{label}: {movie_details[key]}")

Movie Details:
Title: Avengers: Endgame
Summaries: After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.
Rating: 8.4
Genres: Action, Adventure, Drama
Release Year: 2019
Runtime: 3h 1m


In [57]:
# Wrap the single details dict in a list so pandas builds a one-row DataFrame
# whose columns are the dict keys
movie_details_list = [movie_details]
movie_df = pd.DataFrame(movie_details_list)

In [58]:
# Display the one-row DataFrame built from the scraped details
movie_df

Unnamed: 0,title,summaries,rating,genres,release_year,runtime
0,Avengers: Endgame,After the devastating events of Avengers: Infi...,8.4,"Action, Adventure, Drama",2019,3h 1m


## Trial and error

In [44]:
# Scratch check: fetch one title's plot-summary page to try out selectors.
# NOTE: this rebinds the notebook-level `url` and `soup` set in earlier cells.
url = 'https://www.imdb.com/title/tt0111161/plotsummary/'

# Send a GET request to the URL; the timeout keeps a stalled connection
# from hanging the cell
response = requests.get(url, headers=HEADERS, timeout=10)

soup = BeautifulSoup(response.content, "html.parser")

In [45]:
# First <div> carrying the class "ipc-html-content-inner-div" on the parsed page
storyline_elem = soup.find('div', class_='ipc-html-content-inner-div')
storyline_elem

<div class="ipc-html-content-inner-div">Over the course of several years, two convicts form a friendship, seeking consolation and, eventually, redemption through basic compassion.</div>

In [113]:
# Text content of the element bound to `storyline_elem`, with surrounding whitespace removed
storyline_elem.text.strip()

'2019'