# Importing libraries

In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# In this section we are going to use the requests and BeautifulSoup libraries to scrape the top 250 movies from IMDb

### `url` is the link to the web page from which we want to scrape the data

In [32]:
# Address of the IMDb chart page that the request below downloads.
url = "https://www.imdb.com/chart/top/"

## Sending the HTTP request to the server at `url` and storing the response object in the `response` variable

In [33]:
# Download the chart page.
# - timeout: without it requests waits indefinitely if the server never answers,
#   which would hang the notebook on "Run All".
# - raise_for_status(): turns an HTTP error (4xx/5xx) into an exception here,
#   instead of letting an error page be parsed silently into an empty result.
response = requests.get(url, timeout=30)
response.raise_for_status()

Let's see what we got in the response object. Its `text` attribute holds the HTML document that the server sent back to us.

In [82]:
# Uncomment to dump the raw HTML text of the response (left disabled here).
# print(response.text)

### The raw output above is very messy, so let's use the BeautifulSoup library to parse it into a properly structured HTML document

In [37]:
# Parse the downloaded HTML text with Python's built-in "html.parser" backend
# so the document can be searched by tag and attribute.
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [83]:
# Uncomment to print the parsed document via soup.prettify() (left disabled here).
# print(soup.prettify())

## Let's try to find the titles of movies on the website

In [63]:
# Every <td> whose class attribute is "titleColumn"; each one is later read
# for its text to obtain a movie title entry.
titles = soup.find_all("td", class_="titleColumn")

In [79]:
# Text content of every title cell, in page order (one string per cell).
movie_title = [cell.text for cell in titles]

### Now let's find the IMDb ratings of the movies on the website (this notebook calls them "reviews")

In [65]:
# Every <td> whose class attribute is exactly "ratingColumn imdbRating";
# each one is later read for its text to obtain a rating entry.
reviews = soup.find_all("td", class_="ratingColumn imdbRating")

In [80]:
# Text content of every rating cell, in page order (one string per cell).
movie_reviews = [cell.text for cell in reviews]

### Storing the data into Pandas DataFrame

In [67]:
# Column name -> list of values; both lists come from the same page and are
# passed to pandas as the two columns of the table.
dictionary = {
    "Movie Name": movie_title,
    "Movie Reviews": movie_reviews,
}

In [68]:
# Build the table: one column per dictionary key, one row per list position.
df = pd.DataFrame(data=dictionary)

In [70]:
# A bare expression on the last line of a cell is rendered by Jupyter as a table.
df

Unnamed: 0,Movie Name,Movie Reviews
0,\n 1.\n The Shawshank Redemption\n(1...,\n9.2\n
1,\n 2.\n The Godfather\n(1972)\n,\n9.2\n
2,\n 3.\n The Dark Knight\n(2008)\n,\n9.0\n
3,\n 4.\n The Godfather Part II\n(1974)\n,\n9.0\n
4,\n 5.\n 12 Angry Men\n(1957)\n,\n9.0\n
...,...,...
245,\n 246.\n The Iron Giant\n(1999)\n,\n8.0\n
246,\n 247.\n Aladdin\n(1992)\n,\n8.0\n
247,\n 248.\n The Help\n(2011)\n,\n8.0\n
248,\n 249.\n Gandhi\n(1982)\n,\n8.0\n


### In the DataFrame above unwanted `\n` characters show up, so let's clean the dataset a little bit

In [74]:
# Remove the newline characters from the scraped text in every column.
# The result is rebound to `df` instead of using inplace=True: the contents
# are the same, but the frame that an earlier cell already displayed is not
# mutated behind the reader's back, and the call can be chained if needed.
df = df.replace("\n", "", regex=True)

In [75]:
# Show the frame again to check that the newline characters are gone.
df

Unnamed: 0,Movie Name,Movie Reviews
0,1. The Shawshank Redemption(1994),9.2
1,2. The Godfather(1972),9.2
2,3. The Dark Knight(2008),9.0
3,4. The Godfather Part II(1974),9.0
4,5. 12 Angry Men(1957),9.0
...,...,...
245,246. The Iron Giant(1999),8.0
246,247. Aladdin(1992),8.0
247,248. The Help(2011),8.0
248,249. Gandhi(1982),8.0


### Once the data is in a DataFrame we can store it anywhere and in any format we want. I'll leave that up to you; for the sake of completeness, I am storing it as an Excel file

In [81]:
# Write the table to an Excel workbook in the current working directory;
# index=False leaves the row index out of the sheet.
df.to_excel(excel_writer="Top 250 movies on IMDb.xlsx", index=False)