In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
import pandas as pd

In [3]:
#Gather information about the most popular 100 movies from IMDB
url = "https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html')

In [4]:
#Gather the list section from the HTML that contains information about the movies
listOfMovies = soup.find('ul', class_='ipc-metadata-list ipc-metadata-list--dividers-between sc-a1e81754-0 eBRbsI compact-list-view ipc-metadata-list--base')

In [5]:
#Convert the HTML markup to a list
listOfMovies = listOfMovies.find_all('li', class_='ipc-metadata-list-summary-item sc-10233bc-0 iherUv cli-parent')

In [6]:
#Form a table of the movie titles and links to their individual pages
headings = ['Title', 'Link']
tableOfMovies = pd.DataFrame(columns = headings)

for movie in listOfMovies:
    title = movie.find('h3').text.strip()
    title = title.lstrip('1234567890. ') 
    link = movie.find('a')
    link = "https://imdb.com" + link.get('href')
    movieData = [title, link]

    lengthOfTableOfMovies = len(tableOfMovies)
    tableOfMovies.loc[lengthOfTableOfMovies] = movieData

In [7]:
#Form the data frame with full information about the movies
headings = ["Title", "Year", "Genre", "Age_Rating", "Runtime", "Director", "Stars", "Plot_Synopsis", "IMDb_Rating", "Metascore"]
df = pd.DataFrame(columns = headings)

for index, movie in tableOfMovies.iterrows():
    link = movie['Link']
    page = requests.get(link, headers=headers)
    soup = BeautifulSoup(page.text, 'html')

    try:
        releaseInformation = soup.find('ul', class_='ipc-inline-list ipc-inline-list--show-dividers sc-d8941411-2 cdJsTz baseAlt')
        releaseInformation = releaseInformation.find_all('li', class_='ipc-inline-list__item')
        year = releaseInformation[0].text.strip()
        ageRating = releaseInformation[1].text.strip()
        runtime = releaseInformation[2].text.strip()
    except Exception as e:
        year = ""
        runtime = ""
        runtime = ""

    try:
        listOfGenres = soup.find('div', attrs={'data-testid': 'genres'})
        listOfGenres = listOfGenres.find_all('span', class_='ipc-chip__text')
        genres = ""
        firstGenre = True
        for genre in listOfGenres:
            if firstGenre:
                genres = genres + genre.text.strip()
                firstGenre = False
            else:
                genres = genres + "/" + genre.text.strip()
    except Exception as e:
        genres = ""

    try:
        IMDbRating = soup.find('div', attrs={'data-testid': 'hero-rating-bar__aggregate-rating__score'})
        IMDbRating = IMDbRating.find('span', class_='sc-bde20123-1 cMEQkK').text.strip()
    except Exception as e:
        IMDbRating= ""

    try:
        productionInformation = soup.find('div', class_='sc-b7c53eda-3 vXcqY')
        productionInformation = productionInformation.find_all('li', class_='ipc-metadata-list__item')
        director = productionInformation[0].find('a', class_='ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link').text.strip()
        listOfStars = productionInformation[2].find('ul', class_='ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--inline ipc-metadata-list-item__list-content baseAlt')
        listOfStars = listOfStars.find_all('li', class_='ipc-inline-list__item')
        stars = ""
        firstStar = True
        for star in listOfStars:
            if firstStar:
                stars = stars + star.text.strip()
                firstStar = False
            else:
                stars = stars + "/" + star.text.strip()
    except Exception as e:
        director = ""
        stars = ""

    try:
        plotSynopsis = soup.find('p', attrs={'data-testid': 'plot'})
        plotSynopsis = plotSynopsis.find('span').text.strip()
    except Exception as e:
        plotSynopsis= ""

    try:
        metaScore = soup.find('span', class_='sc-b0901df4-0 bcQdDJ metacritic-score-box').text.strip()
    except Exception as e:
        metaScore = ""

    movieData = [movie['Title'], year, genres, ageRating, runtime, director, stars, plotSynopsis, IMDbRating, metaScore]
    print("Now processing: ", movieData)
    
    length = len(df)
    df.loc[length] = movieData

In [8]:
df

Unnamed: 0,Title,Year,Genre,Age_Rating,Runtime,Director,Stars,Plot_Synopsis,IMDb_Rating,Metascore
0,Civil War,2024,Action/Thriller,R,1h 49m,Alex Garland,Kirsten Dunst/Wagner Moura/Cailee Spaeny,"A journey across a dystopian future America, f...",7.6,75
1,Dune: Part Two,2024,Action/Adventure/Drama,PG-13,2h 46m,Denis Villeneuve,Timothée Chalamet/Zendaya/Rebecca Ferguson,Paul Atreides unites with Chani and the Fremen...,8.7,79
2,Joker: Folie à Deux,,Crime/Drama/Musical,R,,Todd Phillips,Zazie Beetz/Joaquin Phoenix/Lady Gaga,"Sequel to the film ""Joker"" from 2019.",,
3,Monkey Man,2024,Action/Thriller,R,2h 1m,Dev Patel,Dev Patel/Sharlto Copley/Pitobash,An anonymous young man unleashes a campaign of...,7.3,70
4,Godzilla x Kong: The New Empire,2024,Action/Adventure/Sci-Fi,PG-13,1h 55m,Adam Wingard,Rebecca Hall/Brian Tyree Henry/Dan Stevens,"Two ancient titans, Godzilla and Kong, clash i...",6.5,47
...,...,...,...,...,...,...,...,...,...,...
95,Música,2024,Comedy/Music/Romance,PG-13,1h 31m,Rudy Mancuso,Rudy Mancuso/Camila Mendes/Francesca Reale,A coming-of-age love story that follows an asp...,6.5,74
96,Snack Shack,2024,Comedy,R,1h 52m,Adam Rehmeier,Conor Sherry/Gabriel LaBelle/Mika Abdalla,"Nebraska City, 1991, two best friends get the ...",7.0,64
97,The Time Travelers,1964,Sci-Fi,Approved,1h 24m,Ib Melchior,Preston Foster/Philip Carey/Merry Anders,"In 1964, a group of scientists create a portal...",5.2,
98,Top Gun: Maverick,2022,Action/Drama,PG-13,2h 10m,Joseph Kosinski,Tom Cruise/Jennifer Connelly/Miles Teller,"After thirty years, Maverick is still pushing ...",8.2,78


In [9]:
#Export it
df.to_csv('MostPopularIMDbMovies.csv', index=False)