### Web scraping - IMDB using BeautifulSoup

In [1]:
import pandas as pd
import requests  # used to requests URL
from bs4 import BeautifulSoup

### Request page source from URl

In [2]:
url='https://www.imdb.com/chart/top/'

In [3]:
page = requests.get(url)
page
# validating the page before moving on - 200 responses 

<Response [200]>

In [4]:
# display the page source code
page.content

b'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    \n    \n    \n\n    \n    \n    \n\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n      uet("be", "LoadTitle", {wb: 1});\n    }\n</script>\n<script>\n    if (typeof uex == \'function\') {\n      uex("ld", "LoadTitle", {wb: 1});\n    }\n</script>\n\n    

In [6]:
soup=BeautifulSoup(page.content,'html.parser')
# depend on the page we should select the parser
print(soup.prettify()) #it remove the newline and display in the form of html

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/chart/top/" rel="canonical"/>
  <meta content="http://www.imdb.com/chart

In [7]:
scraped_movie=soup.find_all('td',class_='titleColumn')
scraped_movie
# note id (td) and class in website itself

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>,
 <td class="titleColumn">
       2.
       <a href="/title/tt0068646/" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">The Godfather</a>
 <span class="secondaryInfo">(1972)</span>
 </td>,
 <td class="titleColumn">
       3.
       <a href="/title/tt0071562/" title="Francis Ford Coppola (dir.), Al Pacino, Robert De Niro">The Godfather: Part II</a>
 <span class="secondaryInfo">(1974)</span>
 </td>,
 <td class="titleColumn">
       4.
       <a href="/title/tt0468569/" title="Christopher Nolan (dir.), Christian Bale, Heath Ledger">The Dark Knight</a>
 <span class="secondaryInfo">(2008)</span>
 </td>,
 <td class="titleColumn">
       5.
       <a href="/title/tt0050083/" title="Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb">12 Angry Men</a>
 <span class="secondaryInfo">

In [9]:
#parse movie name
movies=[]
for movie in scraped_movie:
    movie=movie.get_text().replace('\n','') # replacing newline
    movie=movie.strip(' ') # to remove all white spaces, tabs and newline
    movies.append(movie)
movies

['1.      The Shawshank Redemption(1994)',
 '2.      The Godfather(1972)',
 '3.      The Godfather: Part II(1974)',
 '4.      The Dark Knight(2008)',
 '5.      12 Angry Men(1957)',
 "6.      Schindler's List(1993)",
 '7.      The Lord of the Rings: The Return of the King(2003)',
 '8.      Pulp Fiction(1994)',
 '9.      Il buono, il brutto, il cattivo(1966)',
 '10.      The Lord of the Rings: The Fellowship of the Ring(2001)',
 '11.      Fight Club(1999)',
 '12.      Forrest Gump(1994)',
 '13.      Inception(2010)',
 '14.      The Lord of the Rings: The Two Towers(2002)',
 '15.      Star Wars: Episode V - The Empire Strikes Back(1980)',
 '16.      The Matrix(1999)',
 '17.      Goodfellas(1990)',
 "18.      One Flew Over the Cuckoo's Nest(1975)",
 '19.      Shichinin no samurai(1954)',
 '20.      Se7en(1995)',
 '21.      The Silence of the Lambs(1991)',
 '22.      Cidade de Deus(2002)',
 '23.      La vita è bella(1997)',
 "24.      It's a Wonderful Life(1946)",
 '25.      Star Wars(1977)

In [10]:
# scrap rating for movies
scraped_ratings=soup.find_all('td',class_='ratingColumn imdbRating')
scraped_ratings

[<td class="ratingColumn imdbRating">
 <strong title="9.2 based on 2,485,724 user ratings">9.2</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.1 based on 1,716,069 user ratings">9.1</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.0 based on 1,191,909 user ratings">9.0</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.0 based on 2,439,594 user ratings">9.0</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 735,428 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,275,350 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,721,501 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.8 based on 1,923,000 user ratings">8.8</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.8 based on 722,503 user ratings">8.8</strong>
 </t

In [11]:
ratings=[]
for rating in scraped_ratings:
    rating=rating.get_text().replace('\n','')
    rating=rating.strip(' ')
    ratings.append(rating)
ratings

['9.2',
 '9.1',
 '9.0',
 '9.0',
 '8.9',
 '8.9',
 '8.9',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',


### Storing Scarped Data

In [12]:
data=pd.DataFrame()
data['Movie Name'] = movies
data['Ratings']=ratings
data.head()

Unnamed: 0,Movie Name,Ratings
0,1. The Shawshank Redemption(1994),9.2
1,2. The Godfather(1972),9.1
2,3. The Godfather: Part II(1974),9.0
3,4. The Dark Knight(2008),9.0
4,5. 12 Angry Men(1957),8.9


In [13]:
data.to_csv('IMDB Top Movies.csv',index=False)

### Web Scraping - Country name using Selenium

In [14]:
from selenium import webdriver
import pandas as pd

#### Setting path for chrome Driver

In [18]:
path=r"C:\Users\rani\Downloads\chromedriver_win32\chromedriver.exe"

In [20]:
# set path for the driver
browser=webdriver.Chrome(executable_path=path)

  browser=webdriver.Chrome(executable_path=path)


In [21]:
url='http://www.scrapethissite.com/pages/simple/'

In [22]:
# open page in the chrome
browser.get(url) #using webdriver the url is loaded

#### Scrapping data

In [25]:
# get country names
country_list=browser.find_elements_by_xpath("//h3[@class='country-name']")
country_list

  country_list=browser.find_elements_by_xpath("//h3[@class='country-name']")


[<selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="5c53a456-447d-42a8-95e8-2563926833f1")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="3aa4f92d-528a-490b-ac49-7978b7c01136")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="74472352-c09f-4036-926a-79f320b4dc3e")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="1b1400d8-621a-4908-a6be-0f8551575d13")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="a701b0a0-e247-420e-8d5d-7c0a4bb14952")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="1c239c03-afbb-4757-9cdb-99ae8069a7eb")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="02bfe3be-1de4-4611-9ac6-72

In [28]:
#parse the data
countries=[]
for country in country_list:
    country=country.text
    countries.append(country)
countries

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina',
 'American Samoa',
 'Austria',
 'Australia',
 'Aruba',
 'Åland',
 'Azerbaijan',
 'Bosnia and Herzegovina',
 'Barbados',
 'Bangladesh',
 'Belgium',
 'Burkina Faso',
 'Bulgaria',
 'Bahrain',
 'Burundi',
 'Benin',
 'Saint Barthélemy',
 'Bermuda',
 'Brunei',
 'Bolivia',
 'Bonaire',
 'Brazil',
 'Bahamas',
 'Bhutan',
 'Bouvet Island',
 'Botswana',
 'Belarus',
 'Belize',
 'Canada',
 'Cocos [Keeling] Islands',
 'Democratic Republic of the Congo',
 'Central African Republic',
 'Republic of the Congo',
 'Switzerland',
 'Ivory Coast',
 'Cook Islands',
 'Chile',
 'Cameroon',
 'China',
 'Colombia',
 'Costa Rica',
 'Cuba',
 'Cape Verde',
 'Curacao',
 'Christmas Island',
 'Cyprus',
 'Czech Republic',
 'Germany',
 'Djibouti',
 'Denmark',
 'Dominica',
 'Dominican Republic',
 'Algeria',
 'Ecuador',
 'Estonia',
 'Egypt',
 'Western Sahara',
 'Eritrea',
 

In [30]:
# getting population of the country
population_list=browser.find_elements_by_class_name('country-population')
population_list

  population_list=browser.find_elements_by_class_name('country-population')


[<selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="77558bf8-54f9-4813-93e3-db9c08c30bd1")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="daa24755-ed5e-47ba-9c06-e39d584256c9")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="869dd8c9-1338-423a-9172-0255f5aa17ea")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="9d197745-8bdd-44a3-b73e-a673faece22f")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="cda52e8a-39a2-48cd-9932-135659ca982e")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="bbdea223-7763-4bc4-9bf1-424e54570753")>,
 <selenium.webdriver.remote.webelement.WebElement (session="cb747569e2b5180b516c93d3be7a3b37", element="e8268eea-5091-4f9e-95eb-9d

In [31]:
populations=[]
for population in population_list:
    population=population.text
    populations.append(population)
populations

['84000',
 '4975593',
 '29121286',
 '86754',
 '13254',
 '2986952',
 '2968000',
 '13068161',
 '0',
 '41343201',
 '57881',
 '8205000',
 '21515754',
 '71566',
 '26711',
 '8303512',
 '4590000',
 '285653',
 '156118464',
 '10403000',
 '16241811',
 '7148785',
 '738004',
 '9863117',
 '9056010',
 '8450',
 '65365',
 '395027',
 '9947418',
 '18012',
 '201103330',
 '301790',
 '699847',
 '0',
 '2029307',
 '9685000',
 '314522',
 '33679000',
 '628',
 '70916439',
 '4844927',
 '3039126',
 '7581000',
 '21058798',
 '21388',
 '16746491',
 '19294149',
 '1330044000',
 '47790000',
 '4516220',
 '11423000',
 '508659',
 '141766',
 '1500',
 '1102677',
 '10476000',
 '81802257',
 '740528',
 '5484000',
 '72813',
 '9823821',
 '34586184',
 '14790608',
 '1291170',
 '80471869',
 '273008',
 '5792984',
 '46505963',
 '88013491',
 '5244000',
 '875983',
 '2638',
 '107708',
 '48228',
 '64768389',
 '1545255',
 '62348447',
 '107818',
 '4630000',
 '195506',
 '65228',
 '24339838',
 '27884',
 '56375',
 '1593256',
 '10324025',
 '44

#### Dataset-country with population

In [32]:
country_data=pd.DataFrame()
country_data['Country_Name']=countries
country_data['Population']=populations
country_data.head()

Unnamed: 0,Country_Name,Population
0,Andorra,84000
1,United Arab Emirates,4975593
2,Afghanistan,29121286
3,Antigua and Barbuda,86754
4,Anguilla,13254


In [34]:
#saving data
country_data.to_csv('Country with Population.csv',index=False)

#### Closing webdriver

In [35]:
browser.quit()

In [36]:
#import time - use this when you are using this as a python file
# time.sleep --since the file is executed completely, execution need to be stopped when the page is loading

In [73]:
# finding the image
#find will scrap the first source that is availabe
img_tag=soup.find('div',{'class':'story1-img focuspoint'})
img_tag

<div class="story1-img focuspoint" data-focus-x="0" data-focus-y="0">
<img alt="The phase 3 trial findings indicate that Covaxin induces a robust antibody response with no severe vaccine-related adverse events or deaths reported among the trial participants. File image." class="media-object lazy adaptive placeholder" data-device-variant="LANDSCAPE~LANDSCAPE~LANDSCAPE" data-src-template="https://th.thgim.com/sci-tech/health/d6iy3h/article37451803.ece/BINARY/thumbnail/COVAXIN-VACCINE-DELHI" data-variant="LANDSCAPE" src="https://th.thgim.com/static/img/1x1_spacer.png" title="The phase 3 trial findings indicate that Covaxin induces a robust antibody response with no severe vaccine-related adverse events or deaths reported among the trial participants. File image."/> </div>

### Extracting Reviews from flipkart website - BeautifulScoup

In [1]:
import requests
from bs4 import BeautifulSoup

In [5]:
url='https://www.flipkart.com/asian-elasto-02-sports-shoes-women-running-girls-stylish-latest-design-new-fashion-casual-sneakers-ladies-lace-up-lightweight-peach-jogging-walking-gym-party/product-reviews/itm048e42e0d1702?pid=SHOFM75W4JHBHZFB&lid=LSTSHOFM75W4JHBHZFBI1OTAO&marketplace=FLIPKART'

In [6]:
page = requests.get(url)
page

<Response [200]>

In [7]:
page.text

'<!doctype html><html lang="en"><head><link href="https://rukminim1.flixcart.com" rel="preconnect"/><link rel="stylesheet" href="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/css/app.chunk.8a1772.css"/><meta http-equiv="Content-type" content="text/html; charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta property="fb:page_id" content="102988293558"/><meta property="fb:admins" content="658873552,624500995,100000233612389"/><meta name="robots" content="noodp"/><link rel="shortcut icon" href="https://static-assets-web.flixcart.com/www/promos/new/20150528-140547-favicon-retina.ico"/><link type="application/opensearchdescription+xml" rel="search" href="/osdd.xml?v=2"/><meta property="og:type" content="website"/><meta name="og_site_name" property="og:site_name" content="Flipkart.com"/><link rel="apple-touch-icon" sizes="57x57" href="/apple-touch-icon-57x57.png"/><link rel="apple-touch-icon" sizes="72x72" href="/apple-touch-icon-72x72.png"/><link rel="apple-

In [8]:
soup=BeautifulSoup(page.text,'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <link href="https://rukminim1.flixcart.com" rel="preconnect"/>
  <link href="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/css/app.chunk.8a1772.css" rel="stylesheet"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <meta content="102988293558" property="fb:page_id"/>
  <meta content="658873552,624500995,100000233612389" property="fb:admins"/>
  <meta content="noodp" name="robots"/>
  <link href="https://static-assets-web.flixcart.com/www/promos/new/20150528-140547-favicon-retina.ico" rel="shortcut icon"/>
  <link href="/osdd.xml?v=2" rel="search" type="application/opensearchdescription+xml"/>
  <meta content="website" property="og:type"/>
  <meta content="Flipkart.com" name="og_site_name" property="og:site_name"/>
  <link href="/apple-touch-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
  <link href="/apple-touch-icon-72x72.png" rel="apple-tou

In [13]:
reviews=soup.find_all('div',class_='_6K-7Co')

In [14]:
reviews

[<div class="_6K-7Co">Fabulous..must buy, vry comfortable i lv it</div>,
 <div class="_6K-7Co">Gd Quality</div>,
 <div class="_6K-7Co">nice 🥰🥰</div>,
 <div class="_6K-7Co">Vry nice</div>,
 <div class="_6K-7Co">fabulous.. very comfortable</div>,
 <div class="_6K-7Co">Good and comfirtable</div>,
 <div class="_6K-7Co">NYC product... so comfortable.. I m happy with this</div>,
 <div class="_6K-7Co">great purchase.. fast delivery... loved the product</div>,
 <div class="_6K-7Co">nice shoes, comfortable,looks good.</div>,
 <div class="_6K-7Co">Nice😎</div>]

In [19]:
review=[]
for item in reviews:
    item=item.get_text().replace('\n','')
    item=item.strip(' ')
    review.append(item)

In [20]:
review

['Fabulous..must buy, vry comfortable i lv it',
 'Gd Quality',
 'nice 🥰🥰',
 'Vry nice',
 'fabulous.. very comfortable',
 'Good and comfirtable',
 'NYC product... so comfortable.. I m happy with this',
 'great purchase.. fast delivery... loved the product',
 'nice shoes, comfortable,looks good.',
 'Nice😎']

In [22]:
ratings=soup.find_all('div',class_='_3LWZlK _1BLPMq _3B8WaH')
ratings

[<div class="_3LWZlK _1BLPMq _3B8WaH">5</div>,
 <div class="_3LWZlK _1BLPMq _3B8WaH">5</div>,
 <div class="_3LWZlK _1BLPMq _3B8WaH">5<img class="_1wB99o" src="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxMyIgaGVpZ2h0PSIxMiI+PHBhdGggZmlsbD0iI0ZGRiIgZD0iTTYuNSA5LjQzOWwtMy42NzQgMi4yMy45NC00LjI2LTMuMjEtMi44ODMgNC4yNTQtLjQwNEw2L

In [23]:
rating=[]
for item in ratings:
    item=item.get_text().replace('\n','')
    item=item.strip(' ')
    rating.append(item)

In [24]:
rating

['5', '5', '5', '5', '5', '4', '5', '5', '5', '4']