In [8]:
# Scrape multiple result pages of the IMDb website in a single script to fetch data for the top 1000 movies, specifically:
    
# 1. Movie Name 
# 2. Release Year
# 3. Watch Time 
# 4. IMDb Rating
# 5. Metascore
# 6. Votes
# 7. Gross Collection
# 8. Description

In [10]:
import pandas as pd    #to create dataframe
import numpy as np     # to count the values (in our case)
import requests        #to send the HTTP request to the URL
from bs4 import BeautifulSoup  #to get the content in the form of HTML
from time import sleep
from random import randint

In [11]:
# HTTP request headers: state a preference for US English content,
# falling back to any English variant at lower priority (q=0.5).
headers = {
    "Accept-Language": "en-US,en;q=0.5",
}

In [12]:
# One empty accumulator list per output column; the scraping loop appends
# one entry to each list for every movie it parses.
# Add more lists here (e.g. star or director names) if more fields are needed.

movie_name, year, time, rating = [], [], [], []
metascore, votes, gross, description = [], [], [], []

In [13]:
# Start offsets for the paginated search URL: 1, 101, 201, ..., 901.
# Each value is substituted into the `start=` query parameter of one request.
pages = np.arange(start=1, stop=1000, step=100)

# It is important to analyze which part of the URL is static and which part is dynamic, i.e. which part changes from link to link.

In [15]:
#the whole core of the script
#
# For every start offset in `pages`, download one search-result page,
# parse it, and append one value per movie to each accumulator list
# (movie_name, year, time, rating, metascore, votes, gross, description).
# Every list receives exactly one entry per movie, so all columns stay the
# same length; fields missing from the page are stored as placeholder strings.


def _tag_text(tag, default):
    """Return ``tag.text``, or ``default`` when ``tag`` is None.

    BeautifulSoup's ``find`` returns None when no element matches, so calling
    ``.text`` on the result directly would raise AttributeError.
    """
    return default if tag is None else tag.text


# Empty the accumulators first so that re-running this cell does not append
# a second copy of every row.
for _column in (movie_name, year, time, rating, metascore, votes, gross, description):
    _column.clear()

for start in pages:
    # Keep the loop variable (`start`) and the HTTP response in separate names.
    response = requests.get(
        "https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start="+str(start)+"&ref_=adv_nxt",
        headers=headers,   # send the Accept-Language preference declared above
        timeout=30,        # do not hang forever on a stalled connection
    )
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # into zero rows.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): these selectors depend on IMDb's page markup; if
    # `movie_data` comes back empty, the markup has presumably changed.
    movie_data = soup.find_all('div', attrs = {'class': 'lister-item mode-advanced'})

    # Random pause between requests to avoid hitting the site in a tight loop.
    sleep(randint(2,8))

    for store in movie_data:
        name = store.h3.a.text
        movie_name.append(name)

        year_tag = store.h3.find('span', class_ = "lister-item-year text-muted unbold")
        year.append(_tag_text(year_tag, "****"))

        runtime_tag = store.p.find("span", class_ = 'runtime')
        time.append(_tag_text(runtime_tag, "****"))

        rating_tag = store.find('div', class_ = "inline-block ratings-imdb-rating")
        rating.append(_tag_text(rating_tag, "****").replace('\n', ''))

        meta_tag = store.find('span', class_ = "metascore")
        metascore.append(_tag_text(meta_tag, "****"))

        # The spans named "nv" hold the vote count first and, when present,
        # the gross collection second.
        value = store.find_all('span', attrs = {'name': "nv"})

        vote = value[0].text if len(value) > 0 else "****"
        votes.append(vote)

        grosses = value[1].text if len(value)>1 else '%^%^%^'
        gross.append(grosses)

        # The second muted paragraph of the item is used as the description.
        describe = store.find_all('p', class_ = 'text-muted')
        description_ = describe[1].text.replace('\n', '') if len(describe) >1 else '*****'
        description.append(description_)

In [16]:
# Assemble the accumulator lists into a single table, one column per list.
# All lists must have the same length, otherwise pandas raises a ValueError.
movie_list = pd.DataFrame(
    data={
        "Movie Name": movie_name,
        "Year of Release": year,
        "Watch Time": time,
        "Movie Rating": rating,
        "Metascore of movie": metascore,
        "Votes": votes,
        "Gross": gross,
        "Description": description,
    }
)

In [17]:
# Show the scraped table: the notebook renders the value of a cell's last expression.
movie_list

Unnamed: 0,Movie Name,Year of Release,Watch Time,Movie Rating,Metascore of movie,Votes,Gross,Description
0,Jai Bhim,(2021),164 min,9.4,****,184400,%^%^%^,When a tribal man is arrested for a case of al...
1,Soorarai Pottru,(2020),153 min,9.3,****,109095,%^%^%^,"Nedumaaran Rajangam ""Maara"" sets out to make t..."
2,The Shawshank Redemption,(1994),142 min,9.3,81,2568848,$28.34M,Two imprisoned men bond over a number of years...
3,The Godfather,(1972),175 min,9.2,100,1768572,$134.97M,The aging patriarch of an organized crime dyna...
4,The Dark Knight,(2008),152 min,9.1,84,2533779,$534.86M,When the menace known as the Joker wreaks havo...
...,...,...,...,...,...,...,...,...
995,Les yeux sans visage,(1960),90 min,7.6,90,30833,$0.05M,A surgeon causes an accident which leaves his ...
996,Sabrina,(1954),113 min,7.6,72,63789,%^%^%^,A playboy becomes interested in the daughter o...
997,From Here to Eternity,(1953),118 min,7.6,85,46309,$30.50M,"At a U.S. Army base in 1941 Hawaii, a private ..."
998,Snow White and the Seven Dwarfs,(1937),83 min,7.6,95,194461,$184.93M,Exiled into the dangerous forest by her wicked...


In [18]:
# Persist the scraped table to disk in two formats, written to the current
# working directory. With pandas' defaults the DataFrame index is also written,
# as the first (unnamed) column of each file.

# Excel workbook
movie_list.to_excel(excel_writer="Top 1000 IMDb movies.xlsx")

# CSV copy of the same data
movie_list.to_csv(path_or_buf="Top 1000 IMDb movies.csv")