# WebScraping IMDB for the Top rated 250 movies

## Information to scrape from the website

- movie title
- rank
- year 
- rating

In [183]:
#Import the necessary libararies for scraping.

from bs4 import BeautifulSoup
import requests
import os
import pandas as pd
import openpyxl
import csv

In [3]:
#create a variable and do a request on the url.

source = requests.get('https://www.imdb.com/chart/top/')

In [6]:
#run status_code to make sure the response is successful
#200-299 is the successful range.

source.raise_for_status()
source.status_code

200

In [92]:
#used beautiful soup to parse the html text from website.

soup = BeautifulSoup(source.text, 'html.parser')

In [93]:
#each of the movie details are within these tags.

movies = soup.find('tbody', {'class': "lister-list"}).find_all('tr')

In [94]:
#We can see that 250 movies have been found as shown in the url

len(movies)

250

In [99]:
#time to find the name of the movie
#movie title is in the 'a' tag so we did a '.a' to narrow it down.

name = movie.find('td', {'class': 'titleColumn'}).a
print(name)

<a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>


In [97]:
#to get the text attribute from the tag we did a '.text'.

name = movie.find('td', {'class': 'titleColumn'}).a.text
print(name)

The Shawshank Redemption


In [98]:
#rank is getting movie rank
#.get_text and strip to format it correctly

#rank = movie.find('td', {'class': 'titleColumn'}).get_text(strip=True)
print(rank)

1


In [75]:
#add a .split to separate the 1, .split puts it into a list, so we can now get what we want by using indexes
#index of [0] to get only the number.

rank = movie.find('td', {'class': 'titleColumn'}).get_text(strip=True).split('.')[0]

print(rank)

1


In [53]:
#now to get the year

year = movie.find('td', {'class': 'titleColumn'}).span.text.strip('()')
print(year)

1994


In [58]:
#now for rating
#this time we have to look a little further into the td tag for the rating
#class is a little different from title column.

rating = movie.find('td', {'class': 'ratingColumn imdbRating'}).strong.text
#print(rating)

9.2


In [184]:
#create excel workbook

excel = openpyxl.Workbook()
print(excel.sheetnames)

['Sheet']


In [185]:
#rename excel sheet and make active

sheet = excel.active
sheet.title = 'Top250 movies IMDB'
print(excel.sheetnames)

['Top250 movies IMDB']


In [186]:
#add and name the columns as you want

sheet.append(['Movie Rank', 'Movie Name', 'Year of Release', 'IMDB Rating'])

In [204]:
#Put all the code into a try and except block for any errors

try:
    source = requests.get('https://www.imdb.com/chart/top/')
    source.raise_for_status()
    
    soup = BeautifulSoup(source.text, 'html.parser')
    
    #lister-list has all of the 250 movies in
    #we do a find_all on the 'tr' tag because it has all of the details for EACH movie.
    movies = soup.find('tbody', {'class': "lister-list"}).find_all('tr')
    
    
    #created a loop to loop through the 'td' tag and print out the move title tag
    #movie title tag is 'a', that's why we added '.a' after.
    for movie in movies:
        
        name = movie.find('td', {'class': 'titleColumn'}).a.text
        
        rank = movie.find('td', {'class': 'titleColumn'}).get_text(strip=True).split('.')[0]
        
        year = movie.find('td', {'class': 'titleColumn'}).span.text.strip('()')
        
        rating = movie.find('td', {'class': 'ratingColumn imdbRating'}).strong.text
        
        print(rank, name, year, rating)
        
        #add in the sheet with columns
        sheet.append([rank, name, year, rating])
    
except Exception as e:
    print(e)
    
#save as csv or xlsx
excel.save('IMDB Top 250 movies test.csv')

1 The Shawshank Redemption 1994 9.2
2 The Godfather 1972 9.1
3 The Godfather: Part II 1974 9.0
4 The Dark Knight 2008 9.0
5 12 Angry Men 1957 8.9
6 Schindler's List 1993 8.9
7 The Lord of the Rings: The Return of the King 2003 8.9
8 Pulp Fiction 1994 8.8
9 The Good, the Bad and the Ugly 1966 8.8
10 The Lord of the Rings: The Fellowship of the Ring 2001 8.8
11 Fight Club 1999 8.7
12 Forrest Gump 1994 8.7
13 Inception 2010 8.7
14 The Lord of the Rings: The Two Towers 2002 8.7
15 Star Wars: Episode V - The Empire Strikes Back 1980 8.7
16 The Matrix 1999 8.7
17 Goodfellas 1990 8.6
18 One Flew Over the Cuckoo's Nest 1975 8.6
19 Seven Samurai 1954 8.6
20 Se7en 1995 8.6
21 The Silence of the Lambs 1991 8.6
22 City of God 2002 8.6
23 Life Is Beautiful 1997 8.6
24 It's a Wonderful Life 1946 8.6
25 Saving Private Ryan 1998 8.6
26 Star Wars 1977 8.5
27 Interstellar 2014 8.5
28 Spirited Away 2001 8.5
29 The Green Mile 1999 8.5
30 Spider-Man: No Way Home 2021 8.5
31 Parasite 2019 8.5
32 Léon: The P

In [205]:
#imported excel file into a Dataframe.

movie_df = pd.read_csv(r'.\IMDB Top 250 movies.csv')

In [206]:
movie_df

Unnamed: 0,Movie Rank,Movie Name,Year of Release,IMDB Rating
0,1,The Shawshank Redemption,1994,9.2
1,2,The Godfather,1972,9.1
2,3,The Godfather: Part II,1974,9.0
3,4,The Dark Knight,2008,9.0
4,5,12 Angry Men,1957,8.9
...,...,...,...,...
245,246,Miracle in Cell No. 7,2019,8.0
246,247,Fanny and Alexander,1982,8.0
247,248,Hera Pheri,2000,8.0
248,249,Nights of Cabiria,1957,8.0
