# Web Scraping project of IMDB top 1000 movies 

### Importing Required Libraries

In [1]:
import pandas as pd   #to create dataframe
import requests       #to send the request to the URL
from bs4 import BeautifulSoup #to get the content in the form of HTML
import numpy as np  # to count the values

### Declaring the Headers

In [2]:
# Request headers: prefer US English, then any English variant with weight 0.5.
headers = {
    "Accept-Language": "en-US,en;q=0.5",
}

### Assigning the URL with variable name url

In [3]:
# Results page scraped in this run: the top_1000 group, sorted by user rating
# (descending), 100 titles per page, starting at position 401.
url = (
    'https://www.imdb.com/search/title/'
    '?groups=top_1000&sort=user_rating,desc&count=100&start=401&ref_=adv_nxt'
)

### Because this technique only works on a single results page and we need several pages scraped, the entire procedure needs to be run once per page before merging the results into one.

### Links to the individual pages are provided below


1-100
https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating

101-200
https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=101&ref_=adv_nxt

201-300
https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=201&ref_=adv_nxt

301-400
https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=301&ref_=adv_nxt

401-500
https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=401&ref_=adv_nxt

In [4]:
# Send the HTTP GET request for the page. The declared headers are passed along so
# the Accept-Language preference actually reaches the server, and a timeout keeps
# the cell from hanging forever on a stalled connection.
source = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of silently parsing an error page.
source.raise_for_status()
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(source.content, 'html.parser')

### Creating empty lists so that we can append the values later

In [5]:
# One independent, initially empty list per scraped field; the cells below fill them.
movie_name, year, time, rating, metascore = [], [], [], [], []
votes, gross, description, director, stars = [], [], [], [], []

### Storing the required movie data in a variable

In [6]:
# Collect every <div> whose class attribute is 'lister-item mode-advanced'; the
# loops in the following cells treat each of these elements as one movie entry.
# find_all is the current Beautiful Soup method name (findAll is the legacy alias).
movie_data = soup.find_all('div', attrs={'class': 'lister-item mode-advanced'})

### Storing the movie names in a variable


In [7]:
# Take the text of the link inside each entry's <h3> heading as the movie name.
movie_name.extend(entry.h3.a.text for entry in movie_data)
# Print the first eleven names as a quick sanity check.
print(movie_name[:11])

['Secrets & Lies', 'Twelve Monkeys', 'Dilwale Dulhania Le Jayenge', 'Chung Hing sam lam', 'Andaz Apna Apna', 'Scent of a Woman', 'Aladdin', 'JFK', 'Beauty and the Beast', 'Dances with Wolves', 'Rain Man']


### Storing Release year of the movie into variable year

In [8]:
for entry in movie_data:
    year_tag = entry.h3.find('span', class_='lister-item-year text-muted unbold')
    # Remove every '(' and ')' from the span text before storing it.
    cleaned_year = year_tag.text.replace('(', '').replace(')', '')
    year.append(cleaned_year)

### Storing the Runtime of Movie into a Variable

In [9]:
# Read the 'runtime' span found under the entry's first <p> and remove ' min'
# from its text, then add the result to the time list.
time.extend(
    entry.p.find('span', class_='runtime').text.replace(' min', '')
    for entry in movie_data
)

### Storing the rating of the movie into a variable

In [10]:
for entry in movie_data:
    rating_box = entry.find('div', class_='inline-block ratings-imdb-rating')
    # The rating text is wrapped in newlines; remove them before storing.
    rating.append(rating_box.text.replace('\n', ''))
    

### Storing the Metascore into a variable

In [11]:
for entry in movie_data:
    score_tag = entry.find('span', class_='metascore')
    if score_tag:
        # Remove the spaces from the score text.
        metascore.append(score_tag.text.replace(' ', ''))
    else:
        # No metascore span on this entry: store the '000' placeholder.
        metascore.append('000')
    # Note: votes and gross share the same span attribute (name="nv"), so the next cell fetches them together and separates them by index
    

### Storing the votes and gross collection into variables

In [12]:
for entry in movie_data:
    # Both values live in spans carrying name="nv"; fetch them together.
    nv_spans = entry.find_all('span', attrs={'name': 'nv'})

    # First span: stored as the vote count with the commas removed.
    votes.append(nv_spans[0].text.replace(',', ''))

    # Second span, when present: keep the text after the last '$' and before the
    # following 'M'. Otherwise store the '000' placeholder.
    if len(nv_spans) > 1:
        gross.append(nv_spans[1].text.split('$')[-1].split('M')[0])
    else:
        gross.append('000')

###  Storing the Description of the movie into a Variable

In [13]:
for entry in movie_data:
    muted_paragraphs = entry.find_all('p', class_='text-muted')
    if len(muted_paragraphs) > 1:
        # The second 'text-muted' paragraph is used as the description.
        description.append(muted_paragraphs[1].text.replace('\n', ''))
    else:
        # Fewer than two such paragraphs: store the '*****' placeholder.
        description.append('*****')

### Storing the Directors and Cast into two separate Variables 

In [14]:
for entry in movie_data:
    # The <p> with an empty class holds the credits; '|' separates its two parts.
    credits_tag = entry.find("p", class_='')
    parts = [piece.strip() for piece in credits_tag.text.replace('\n', '').split('|')]
    # Case-sensitive removal of the lowercase labels from part 0 and part 1.
    parts = [
        parts[position].replace(label, "")
        for position, label in enumerate(["director:", "stars:"])
    ]
    # Whatever follows the last ':' in each part is the value that gets stored.
    director.append(parts[0].split(':')[-1])
    # Stars are stored as a one-element list holding the remaining string.
    stars.append([parts[1].split(':')[-1]])

## Creating a Dataframe 

In [15]:
# Assemble the scraped lists into one frame, one column per field. The index is
# set to 401..500 for this page.
df = pd.DataFrame(
    {
        'Name of movie': movie_name,
        'Year of release': year,
        'Watchtime': time,
        'Movie Rating': rating,
        'Metascore': metascore,
        'Votes': votes,
        'Gross collection in millions': gross,
        'Description': description,
        "Director": director,
        'Star': stars,
    },
    index=range(401, 501),
)

### Converting the DataFrame into the desired format, i.e. CSV or XLSX

In [16]:
#df.to_excel("100.xlsx")
#df.to_excel("200.xlsx")
#df.to_excel("300.xlsx")
#df.to_excel("400.xlsx")

### Reading the data using pandas read method 

In [17]:
# Load the four previously saved per-page workbooks, in page order.
df1, df2, df3, df4 = map(
    pd.read_excel,
    ('100.xlsx', '200.xlsx', '300.xlsx', '400.xlsx'),
)

### Combining the scraped files into one using the concatenate method

In [18]:
# Frames to be stacked, in page order.
todo = [df1, df2, df3, df4]

In [19]:
# Stack the frames row-wise and renumber the index from 0.
final = pd.concat(todo, axis=0, ignore_index=True)

### Adding Slno column to the DF

In [20]:
# Add a 1-based serial-number column as the first column. The range is derived
# from the frame's actual length, so this works for any number of merged pages.
# The guard makes the cell safe to re-run: it never adds a second SLNO column.
if "SLNO" not in final.columns:
    final.insert(0, "SLNO", range(1, len(final) + 1))

In [21]:
# Show the combined frame as this cell's output.
final

Unnamed: 0.1,SLNO,Unnamed: 0,Movie Name,Year of release,Run time,Movie Rating,Metascore,Votes,Gross collection in millions,Description,Director,Star
0,1,0,The Shawshank Redemption,1994,142,9.3,81,2646498,28.34,Two imprisoned men bond over a number of years...,Frank Darabont,"['Tim Robbins, Morgan Freeman, Bob Gunton, Wil..."
1,2,1,The Godfather,1972,175,9.2,100,1834453,134.97,The aging patriarch of an organized crime dyna...,Francis Ford Coppola,"['Marlon Brando, Al Pacino, James Caan, Diane ..."
2,3,2,The Dark Knight,2008,152,9.0,84,2618406,534.86,When the menace known as the Joker wreaks havo...,Christopher Nolan,"['Christian Bale, Heath Ledger, Aaron Eckhart,..."
3,4,3,The Lord of the Rings: The Return of the King,2003,201,9.0,94,1822452,377.85,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,"['Elijah Wood, Viggo Mortensen, Ian McKellen, ..."
4,5,4,Schindler's List,1993,195,9.0,94,1341386,96.90,"In German-occupied Poland during World War II,...",Steven Spielberg,"['Liam Neeson, Ralph Fiennes, Ben Kingsley, Ca..."
...,...,...,...,...,...,...,...,...,...,...,...,...
395,396,95,Pâfekuto burû,1997,81,8.0,67,75967,0.78,A pop singer gives up her career to become an ...,Satoshi Kon,"['Junko Iwao, Rica Matsumoto, Shinpachi Tsuji,..."
396,397,96,Central do Brasil,1998,110,8.0,80,39781,5.60,The emotive journey of a former schoolteacher ...,Walter Salles,"['Fernanda Montenegro, Vinícius de Oliveira, M..."
397,398,97,La leggenda del pianista sull'oceano,1998,169,8.0,58,64506,0.26,"A baby boy, discovered in 1900 on an ocean lin...",Giuseppe Tornatore,"['Tim Roth, Pruitt Taylor Vince, Mélanie Thier..."
398,399,98,"Crna macka, beli macor",1998,127,8.0,73,53984,0.35,Matko and his son Zare live on the banks of th...,Emir Kusturica,"[""Bajram Severdzan, Srdjan 'Zika' Todorovic, B..."


### Dropping the Unwanted column

In [22]:
# Remove the 'Unnamed: 0' column from the combined frame, modifying it in place.
final.drop(columns=['Unnamed: 0'], inplace=True)

In [23]:
# Write the combined frame to an Excel workbook.
final.to_excel(excel_writer='Imdb 400.xlsx')

In [24]:
# Write the combined frame to a CSV file.
final.to_csv(path_or_buf='Imdb 400.csv')

### `final` is the required end output; it contains the top 400 IMDb movies based on user ratings

 >>THANK YOU 

In [25]:
# Show the frame again as this cell's output.
final

Unnamed: 0,SLNO,Movie Name,Year of release,Run time,Movie Rating,Metascore,Votes,Gross collection in millions,Description,Director,Star
0,1,The Shawshank Redemption,1994,142,9.3,81,2646498,28.34,Two imprisoned men bond over a number of years...,Frank Darabont,"['Tim Robbins, Morgan Freeman, Bob Gunton, Wil..."
1,2,The Godfather,1972,175,9.2,100,1834453,134.97,The aging patriarch of an organized crime dyna...,Francis Ford Coppola,"['Marlon Brando, Al Pacino, James Caan, Diane ..."
2,3,The Dark Knight,2008,152,9.0,84,2618406,534.86,When the menace known as the Joker wreaks havo...,Christopher Nolan,"['Christian Bale, Heath Ledger, Aaron Eckhart,..."
3,4,The Lord of the Rings: The Return of the King,2003,201,9.0,94,1822452,377.85,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,"['Elijah Wood, Viggo Mortensen, Ian McKellen, ..."
4,5,Schindler's List,1993,195,9.0,94,1341386,96.90,"In German-occupied Poland during World War II,...",Steven Spielberg,"['Liam Neeson, Ralph Fiennes, Ben Kingsley, Ca..."
...,...,...,...,...,...,...,...,...,...,...,...
395,396,Pâfekuto burû,1997,81,8.0,67,75967,0.78,A pop singer gives up her career to become an ...,Satoshi Kon,"['Junko Iwao, Rica Matsumoto, Shinpachi Tsuji,..."
396,397,Central do Brasil,1998,110,8.0,80,39781,5.60,The emotive journey of a former schoolteacher ...,Walter Salles,"['Fernanda Montenegro, Vinícius de Oliveira, M..."
397,398,La leggenda del pianista sull'oceano,1998,169,8.0,58,64506,0.26,"A baby boy, discovered in 1900 on an ocean lin...",Giuseppe Tornatore,"['Tim Roth, Pruitt Taylor Vince, Mélanie Thier..."
398,399,"Crna macka, beli macor",1998,127,8.0,73,53984,0.35,Matko and his son Zare live on the banks of th...,Emir Kusturica,"[""Bajram Severdzan, Srdjan 'Zika' Todorovic, B..."
