# Web Scraping

### 1. Write a python program to display all the header tags from ‘en.wikipedia.org/wiki/Main_Page’.

In [1]:
# import required libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# send a request to wegpage server for get a data.
page = requests.get("https://en.wikipedia.org/wiki/Main_Page")
page

<Response [200]>

In [3]:
page.content

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Wikipedia, the free encyclopedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"d7c15476-57ce-479b-84f3-5bb94893ae0b","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Main_Page","wgTitle":"Main Page","wgCurRevisionId":1004593520,"wgRevisionId":1004593520,"wgArticleId":15580374,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Main_Page","wgRelevantArticleId":15580374,"wgIsProbablyEditable":!1,"wgRelev

In [4]:
soup = BeautifulSoup(page.content,"html.parser")
soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Wikipedia, the free encyclopedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"d7c15476-57ce-479b-84f3-5bb94893ae0b","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Main_Page","wgTitle":"Main Page","wgCurRevisionId":1004593520,"wgRevisionId":1004593520,"wgArticleId":15580374,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Main_Page","wgRelevantArticleId":15580374,"wgIsProbablyEditable":!1,"wgRelevantPag

In [5]:
headers = soup.find_all('span',class_="mw-headline")
headers

[<span class="mw-headline" id="From_today's_featured_article">From today's featured article</span>,
 <span class="mw-headline" id="Did_you_know_...">Did you know ...</span>,
 <span class="mw-headline" id="In_the_news">In the news</span>,
 <span class="mw-headline" id="On_this_day">On this day</span>,
 <span class="mw-headline" id="From_today's_featured_list">From today's featured list</span>,
 <span class="mw-headline" id="Today's_featured_picture">Today's featured picture</span>,
 <span class="mw-headline" id="Other_areas_of_Wikipedia">Other areas of Wikipedia</span>,
 <span class="mw-headline" id="Wikipedia's_sister_projects">Wikipedia's sister projects</span>,
 <span class="mw-headline" id="Wikipedia_languages">Wikipedia languages</span>]

In [6]:
# Now, we extract the text
titles = []
for i in headers:
    titles.append(i.text)
titles

["From today's featured article",
 'Did you know\xa0...',
 'In the news',
 'On this day',
 "From today's featured list",
 "Today's featured picture",
 'Other areas of Wikipedia',
 "Wikipedia's sister projects",
 'Wikipedia languages']

In [7]:
titles = []
for i in headers:
    titles.append(i.text.replace("\xa0",""))
titles

["From today's featured article",
 'Did you know...',
 'In the news',
 'On this day',
 "From today's featured list",
 "Today's featured picture",
 'Other areas of Wikipedia',
 "Wikipedia's sister projects",
 'Wikipedia languages']

In [8]:
df = pd.DataFrame({})
df['headers'] = titles
df

Unnamed: 0,headers
0,From today's featured article
1,Did you know...
2,In the news
3,On this day
4,From today's featured list
5,Today's featured picture
6,Other areas of Wikipedia
7,Wikipedia's sister projects
8,Wikipedia languages


#### 2. Write a python program to display IMDB’s Top rated 100 movies’ data (i.e. Name, IMDB rating, Year of release) and save it in form of a CSV file.

In [9]:
page = requests.get("https://www.imdb.com/chart/top")
page

<Response [200]>

In [10]:
page.content

b'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    \n    \n    \n\n    \n    \n    \n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">\n            <style>\n                body#styleguide-v2 {\n                    background: no-repeat fixed center top #000;\n                }\n            </style>\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>IMDb Top 250 - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_tit

In [11]:
soup = BeautifulSoup(page.content,"html.parser")
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<style>
                body#styleguide-v2 {
                    background: no-repeat fixed center top #000;
                }
            </style>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>IMDb Top 250 - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>


In [12]:
movie_name = soup.find_all('td',class_='titleColumn')
movie_name [0:100]

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>,
 <td class="titleColumn">
       2.
       <a href="/title/tt0068646/" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">The Godfather</a>
 <span class="secondaryInfo">(1972)</span>
 </td>,
 <td class="titleColumn">
       3.
       <a href="/title/tt0071562/" title="Francis Ford Coppola (dir.), Al Pacino, Robert De Niro">The Godfather: Part II</a>
 <span class="secondaryInfo">(1974)</span>
 </td>,
 <td class="titleColumn">
       4.
       <a href="/title/tt0468569/" title="Christopher Nolan (dir.), Christian Bale, Heath Ledger">The Dark Knight</a>
 <span class="secondaryInfo">(2008)</span>
 </td>,
 <td class="titleColumn">
       5.
       <a href="/title/tt0050083/" title="Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb">12 Angry Men</a>
 <span class="secondaryInfo">

In [13]:
Movie_names = []
for x in movie_name[0:100]:
    for y in x.find_all('a'):
        Movie_names.append(y.text)
Movie_names

['The Shawshank Redemption',
 'The Godfather',
 'The Godfather: Part II',
 'The Dark Knight',
 '12 Angry Men',
 "Schindler's List",
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 'Il buono, il brutto, il cattivo',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Fight Club',
 'Forrest Gump',
 'Inception',
 'The Lord of the Rings: The Two Towers',
 'Star Wars: Episode V - The Empire Strikes Back',
 'The Matrix',
 'Goodfellas',
 "One Flew Over the Cuckoo's Nest",
 'Shichinin no samurai',
 'Se7en',
 'La vita è bella',
 'Cidade de Deus',
 'The Silence of the Lambs',
 "It's a Wonderful Life",
 'Star Wars',
 'Saving Private Ryan',
 'The Green Mile',
 'Sen to Chihiro no kamikakushi',
 'Interstellar',
 'Gisaengchung',
 'Léon',
 'Seppuku',
 'The Usual Suspects',
 'The Lion King',
 'The Pianist',
 'Terminator 2: Judgment Day',
 'Back to the Future',
 'American History X',
 'Modern Times',
 'Gladiator',
 'Psycho',
 'The Departed',
 'City Lights',
 'The Intouchables',
 

In [14]:
IMDB_rating = soup.find_all('td', class_='ratingColumn imdbRating')
IMDB_rating [0:100]

[<td class="ratingColumn imdbRating">
 <strong title="9.2 based on 2,365,025 user ratings">9.2</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.1 based on 1,637,371 user ratings">9.1</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.0 based on 1,140,667 user ratings">9.0</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.0 based on 2,325,844 user ratings">9.0</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 697,619 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,223,415 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,656,188 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.8 based on 1,842,970 user ratings">8.8</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.8 based on 693,972 user ratings">8.8</strong>
 </t

In [15]:
imdb_rating=[]
for j in IMDB_rating[0:100]:
    imdb_rating.append(j.text.replace('\n',""))
imdb_rating

['9.2',
 '9.1',
 '9.0',
 '9.0',
 '8.9',
 '8.9',
 '8.9',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.7',
 '8.7',
 '8.7',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.2',
 '8.2',
 '8.2']

In [16]:
year_of_release = soup.find_all('span',class_='secondaryInfo')
year_of_release

[<span class="secondaryInfo">(1994)</span>,
 <span class="secondaryInfo">(1972)</span>,
 <span class="secondaryInfo">(1974)</span>,
 <span class="secondaryInfo">(2008)</span>,
 <span class="secondaryInfo">(1957)</span>,
 <span class="secondaryInfo">(1993)</span>,
 <span class="secondaryInfo">(2003)</span>,
 <span class="secondaryInfo">(1994)</span>,
 <span class="secondaryInfo">(1966)</span>,
 <span class="secondaryInfo">(2001)</span>,
 <span class="secondaryInfo">(1999)</span>,
 <span class="secondaryInfo">(1994)</span>,
 <span class="secondaryInfo">(2010)</span>,
 <span class="secondaryInfo">(2002)</span>,
 <span class="secondaryInfo">(1980)</span>,
 <span class="secondaryInfo">(1999)</span>,
 <span class="secondaryInfo">(1990)</span>,
 <span class="secondaryInfo">(1975)</span>,
 <span class="secondaryInfo">(1954)</span>,
 <span class="secondaryInfo">(1995)</span>,
 <span class="secondaryInfo">(1997)</span>,
 <span class="secondaryInfo">(2002)</span>,
 <span class="secondaryInfo">(19

In [17]:
Year_of_release=[]
for k in year_of_release[0:100]:
    Year_of_release.append(k.text.replace("(",""))
Year_of_release

['1994)',
 '1972)',
 '1974)',
 '2008)',
 '1957)',
 '1993)',
 '2003)',
 '1994)',
 '1966)',
 '2001)',
 '1999)',
 '1994)',
 '2010)',
 '2002)',
 '1980)',
 '1999)',
 '1990)',
 '1975)',
 '1954)',
 '1995)',
 '1997)',
 '2002)',
 '1991)',
 '1946)',
 '1977)',
 '1998)',
 '1999)',
 '2001)',
 '2014)',
 '2019)',
 '1994)',
 '1962)',
 '1995)',
 '1994)',
 '2002)',
 '1991)',
 '1985)',
 '1998)',
 '1936)',
 '2000)',
 '1960)',
 '2006)',
 '1931)',
 '2011)',
 '2014)',
 '1988)',
 '2006)',
 '1968)',
 '1942)',
 '1988)',
 '1954)',
 '1979)',
 '1979)',
 '2000)',
 '1940)',
 '1981)',
 '2012)',
 '2006)',
 '2020)',
 '1957)',
 '2019)',
 '2008)',
 '1980)',
 '2018)',
 '1950)',
 '1957)',
 '2003)',
 '2018)',
 '1997)',
 '1964)',
 '2012)',
 '1984)',
 '2016)',
 '1986)',
 '2017)',
 '2019)',
 '2018)',
 '1999)',
 '1995)',
 '1963)',
 '1981)',
 '1995)',
 '2009)',
 '1984)',
 '2009)',
 '1997)',
 '1983)',
 '2007)',
 '1992)',
 '1968)',
 '2000)',
 '2012)',
 '1958)',
 '1931)',
 '2004)',
 '1941)',
 '2016)',
 '1952)',
 '1948)',
 '1921)']

In [18]:
Years_of_release=[]
for j in Year_of_release:
    Years_of_release.append(j.replace(")",""))
Years_of_release

['1994',
 '1972',
 '1974',
 '2008',
 '1957',
 '1993',
 '2003',
 '1994',
 '1966',
 '2001',
 '1999',
 '1994',
 '2010',
 '2002',
 '1980',
 '1999',
 '1990',
 '1975',
 '1954',
 '1995',
 '1997',
 '2002',
 '1991',
 '1946',
 '1977',
 '1998',
 '1999',
 '2001',
 '2014',
 '2019',
 '1994',
 '1962',
 '1995',
 '1994',
 '2002',
 '1991',
 '1985',
 '1998',
 '1936',
 '2000',
 '1960',
 '2006',
 '1931',
 '2011',
 '2014',
 '1988',
 '2006',
 '1968',
 '1942',
 '1988',
 '1954',
 '1979',
 '1979',
 '2000',
 '1940',
 '1981',
 '2012',
 '2006',
 '2020',
 '1957',
 '2019',
 '2008',
 '1980',
 '2018',
 '1950',
 '1957',
 '2003',
 '2018',
 '1997',
 '1964',
 '2012',
 '1984',
 '2016',
 '1986',
 '2017',
 '2019',
 '2018',
 '1999',
 '1995',
 '1963',
 '1981',
 '1995',
 '2009',
 '1984',
 '2009',
 '1997',
 '1983',
 '2007',
 '1992',
 '1968',
 '2000',
 '2012',
 '1958',
 '1931',
 '2004',
 '1941',
 '2016',
 '1952',
 '1948',
 '1921']

In [19]:
df1 = pd.DataFrame({})
df1['Rank'] = list(range(1,101))
df1['movie_titles'] = Movie_names
df1['imdb_ratings'] = imdb_rating
df1['year'] = Years_of_release
df1

Unnamed: 0,Rank,movie_titles,imdb_ratings,year
0,1,The Shawshank Redemption,9.2,1994
1,2,The Godfather,9.1,1972
2,3,The Godfather: Part II,9.0,1974
3,4,The Dark Knight,9.0,2008
4,5,12 Angry Men,8.9,1957
...,...,...,...,...
95,96,Citizen Kane,8.3,1941
96,97,Dangal,8.3,2016
97,98,Singin' in the Rain,8.2,1952
98,99,Ladri di biciclette,8.2,1948


In [20]:
# Save data in csv file
IMDB_movies = pd.DataFrame(df1)
IMDB_movies.to_csv("imdb_top_100_movies.csv")

#### 3. Write a python program to display IMDB’s Top rated 100 Indian movies’ data (i.e. Name, IMDB rating, Year of release) and save it in form of a CSV file.

In [21]:
page = requests.get("https://www.imdb.com/india/top-rated-indian-movies/")
page

<Response [200]>

In [22]:
page.content

b'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    \n    \n    \n\n    \n    \n    \n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">\n            <style>\n                body#styleguide-v2 {\n                    background: no-repeat fixed center top #000;\n                }\n            </style>\n        <style>\n            body#styleguide-v2 #root {\n                box-shadow: none;\n            }\n        </style>\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);

In [23]:
soup = BeautifulSoup(page.content,'html.parser')
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<style>
                body#styleguide-v2 {
                    background: no-repeat fixed center top #000;
                }
            </style>
<style>
            body#styleguide-v2 #root {
                box-shadow: none;
            }
        </style>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Top Rated Indian Movies - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); }

In [24]:
movie_name = soup.find_all('td',class_='titleColumn')
movie_name [0:100]

[<td class="titleColumn">
       1.
       <a href="/title/tt0048473/" title="Satyajit Ray (dir.), Kanu Bannerjee, Karuna Bannerjee">Pather Panchali</a>
 <span class="secondaryInfo">(1955)</span>
 </td>,
 <td class="titleColumn">
       2.
       <a href="/title/tt0079221/" title="Hrishikesh Mukherjee (dir.), Amol Palekar, Bindiya Goswami">Gol Maal</a>
 <span class="secondaryInfo">(1979)</span>
 </td>,
 <td class="titleColumn">
       3.
       <a href="/title/tt0093603/" title="Mani Ratnam (dir.), Kamal Haasan, Saranya Ponvannan">Nayakan</a>
 <span class="secondaryInfo">(1987)</span>
 </td>,
 <td class="titleColumn">
       4.
       <a href="/title/tt0367495/" title="Sundar C. (dir.), Kamal Haasan, Madhavan">Anbe Sivam</a>
 <span class="secondaryInfo">(2003)</span>
 </td>,
 <td class="titleColumn">
       5.
       <a href="/title/tt12361178/" title="Jeethu Joseph (dir.), Mohanlal, Meena">Drishyam 2</a>
 <span class="secondaryInfo">(2021)</span>
 </td>,
 <td class="titleColumn">
    

In [25]:
Movie_names = []
for x in movie_name[0:100]:
    for y in x.find_all('a'):
        Movie_names.append(y.text)
Movie_names

['Pather Panchali',
 'Gol Maal',
 'Nayakan',
 'Anbe Sivam',
 'Drishyam 2',
 'Apur Sansar',
 'Kireedam',
 'Natsamrat',
 'Pariyerum Perumal',
 'Black Friday',
 'Manichitrathazhu',
 'Thevar Magan',
 '96',
 'Ratsasan',
 'Kumbalangi Nights',
 '3 Idiots',
 'Taare Zameen Par',
 'Visaaranai',
 'Dangal',
 'Thalapathi',
 'Aparajito',
 'Jaane Bhi Do Yaaro',
 'Pyaasa',
 'Asuran',
 'Guide',
 'Kannathil Muthamittal',
 'Anand',
 'Jersey',
 'Chupke Chupke',
 'Kaithi',
 'Thani Oruvan',
 'Drishyam',
 'Vikram Vedha',
 'Vada Chennai',
 'Soorarai Pottru',
 'Peranbu',
 'Aruvi',
 'Khosla Ka Ghosla!',
 'Super Deluxe',
 'Mahanati',
 'Agent Sai Srinivasa Athreya',
 'Tumbbad',
 'Andhadhun',
 'Kaakkaa Muttai',
 'Premam',
 'Dhuruvangal Pathinaaru',
 'Satya',
 'Shahid',
 'Bangalore Days',
 'Mudhalvan',
 'Soodhu Kavvum',
 'Anniyan',
 'Papanasam',
 'Gangs of Wasseypur',
 'Jigarthanda',
 'Bhaag Milkha Bhaag',
 'Paan Singh Tomar',
 'Talvar',
 'Swades: We, the People',
 'Sholay',
 'Hera Pheri',
 'Sairat',
 'Nil Battey S

In [26]:
IMDB_rating = soup.find_all('td', class_='ratingColumn imdbRating')
IMDB_rating [0:100]

[<td class="ratingColumn imdbRating">
 <strong title="8.5 based on 23,911 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 18,069 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 16,557 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 17,135 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 21,253 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 12,244 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.4 based on 6,261 user ratings">8.4</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.4 based on 5,380 user ratings">8.4</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.4 based on 9,050 user ratings">8.4</strong>
 </td>,
 <td class="ratingColu

In [27]:
imdb_rating=[]
for j in IMDB_rating[0:100]:
    imdb_rating.append(j.text.replace('\n',""))
imdb_rating

['8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.1',
 '8.0',
 '8.0',
 '8.0',
 '8.0',
 '8.0',
 '8.0']

In [28]:
year_of_release = soup.find_all('span',class_='secondaryInfo')
year_of_release

[<span class="secondaryInfo">(1955)</span>,
 <span class="secondaryInfo">(1979)</span>,
 <span class="secondaryInfo">(1987)</span>,
 <span class="secondaryInfo">(2003)</span>,
 <span class="secondaryInfo">(2021)</span>,
 <span class="secondaryInfo">(1959)</span>,
 <span class="secondaryInfo">(1989)</span>,
 <span class="secondaryInfo">(2016)</span>,
 <span class="secondaryInfo">(2018)</span>,
 <span class="secondaryInfo">(2004)</span>,
 <span class="secondaryInfo">(1993)</span>,
 <span class="secondaryInfo">(1992)</span>,
 <span class="secondaryInfo">(2018)</span>,
 <span class="secondaryInfo">(2018)</span>,
 <span class="secondaryInfo">(2019)</span>,
 <span class="secondaryInfo">(2009)</span>,
 <span class="secondaryInfo">(2007)</span>,
 <span class="secondaryInfo">(2015)</span>,
 <span class="secondaryInfo">(2016)</span>,
 <span class="secondaryInfo">(1991)</span>,
 <span class="secondaryInfo">(1956)</span>,
 <span class="secondaryInfo">(1983)</span>,
 <span class="secondaryInfo">(19

In [29]:
Year_of_release=[]
for k in year_of_release[0:100]:
    Year_of_release.append(k.text.replace("(",""))

Years_of_release=[]
for j in Year_of_release:
    Years_of_release.append(j.replace(")",""))
print(Years_of_release)

['1955', '1979', '1987', '2003', '2021', '1959', '1989', '2016', '2018', '2004', '1993', '1992', '2018', '2018', '2019', '2009', '2007', '2015', '2016', '1991', '1956', '1983', '1957', '2019', '1965', '2002', '1971', '2019', '1975', '2019', '2015', '2013', '2017', '2018', '2020', '2018', '2016', '2006', '2019', '2018', '2019', '2018', '2018', '2014', '2015', '2016', '1998', '2012', '2014', '1999', '2013', '2005', '2015', '2012', '2014', '2013', '2012', '2015', '2004', '1975', '2000', '2016', '2015', '2015', '2003', '2005', '2007', '1964', '1992', '1960', '2012', '2010', '2006', '2013', '2011', '2019', '2008', '2003', '2001', '1999', '2015', '2016', '1994', '2012', '1992', '2001', '2000', '2017', '2012', '1995', '2014', '2018', '2005', '2013', '2016', '2012', '2002', '2006', '2003', '1995']


In [30]:
df1 = pd.DataFrame({})
df1['Rank'] = list(range(1,101))
df1['movie_titles'] = Movie_names
df1['imdb_ratings'] = imdb_rating
df1['year'] = Years_of_release
df1

Unnamed: 0,Rank,movie_titles,imdb_ratings,year
0,1,Pather Panchali,8.5,1955
1,2,Gol Maal,8.5,1979
2,3,Nayakan,8.5,1987
3,4,Anbe Sivam,8.5,2003
4,5,Drishyam 2,8.5,2021
...,...,...,...,...
95,96,Barfi!,8.0,2012
96,97,The Legend of Bhagat Singh,8.0,2002
97,98,Bommarillu,8.0,2006
98,99,Maqbool,8.0,2003


In [31]:
# Save data in csv file
IMDB_movies = pd.DataFrame(df1)
IMDB_movies.to_csv("imdb_top_100_indian_movies.csv")

#### 4. Write a python program to scrap book name, author name, genre and book review of any 5 books from ‘www.bookpage.com’

In [32]:
page = requests.get("https://bookpage.com/reviews?book_genre=science&page=1")
page

<Response [200]>

In [33]:
page.content

b'<!DOCTYPE html>\n<html lang=\'en\' xmlns:fb=\'http://www.facebook.com/2008/fbml\' xmlns:og=\'http://opengraphprotocol.org/schema/\'>\n<head>\n<title>Science Book Reviews | BookPage</title>\n<meta name="description" content="Science book recommendations of the best new books and more.">\n<meta name="keywords" content="book reviews, books and literature, writing and writers">\n<link rel="canonical" href="https://bookpage.com/reviews?book_genre=science&amp;page=1">\n<meta property="og:site_name" content="BookPage.com">\n<meta property="og:title" content="Science Book Reviews">\n<meta property="og:description" content="Science book recommendations of the best new books and more.">\n<meta property="og:type" content="website">\n<meta property="og:url" content="https://bookpage.com/reviews?book_genre=science&amp;page=1">\n<meta property="og:image" content="//www.bookpage.com/default_image.jpg">\n<meta name="twitter:card" content="summary">\n<meta name="twitter:site" content="@bookpage">\n<m

In [34]:
soup = BeautifulSoup(page.content,'html.parser')
soup

<!DOCTYPE html>

<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<title>Science Book Reviews | BookPage</title>
<meta content="Science book recommendations of the best new books and more." name="description"/>
<meta content="book reviews, books and literature, writing and writers" name="keywords"/>
<link href="https://bookpage.com/reviews?book_genre=science&amp;page=1" rel="canonical"/>
<meta content="BookPage.com" property="og:site_name"/>
<meta content="Science Book Reviews" property="og:title"/>
<meta content="Science book recommendations of the best new books and more." property="og:description"/>
<meta content="website" property="og:type"/>
<meta content="https://bookpage.com/reviews?book_genre=science&amp;page=1" property="og:url"/>
<meta content="//www.bookpage.com/default_image.jpg" property="og:image"/>
<meta content="summary" name="twitter:card"/>
<meta content="@bookpage" name="twitter:site"/>
<meta content

In [35]:
book_name = soup.find_all('h4', class_='italic')
book_name [0:5]

[<h4 class="italic">
 <a href="/reviews/26071-walter-isaacson-code-breaker-biography"> <span style="font-style:normal;">★ </span>The Code Breaker</a>
 </h4>,
 <h4 class="italic">
 <a href="/reviews/25912-annalee-newitz-four-lost-cities-history">Four Lost Cities</a>
 </h4>,
 <h4 class="italic">
 <a href="/reviews/25859-john-colapinto-this-voice-nonfiction">This Is the Voice</a>
 </h4>,
 <h4 class="italic">
 <a href="/reviews/25845-david-w-brown-mission-history">The Mission</a>
 </h4>,
 <h4 class="italic">
 <a href="/reviews/25794-michael-e-mann-new-climate-war-nonfiction">The New Climate War</a>
 </h4>]

In [36]:
books= []
for i in book_name[0:5]:
    books.append(i.text.replace('\n',""))
books

[' ★ The Code Breaker',
 'Four Lost Cities',
 'This Is the Voice',
 'The Mission',
 'The New Climate War']

In [37]:
books_name = []
for i in books:
    books_name.append(i.replace(' ★ ',""))
books_name

['The Code Breaker',
 'Four Lost Cities',
 'This Is the Voice',
 'The Mission',
 'The New Climate War']

In [38]:
author = soup.find_all('p',class_='sans bold')
author[0:5]

[<p class="sans bold">
 Walter Isaacson
 </p>,
 <p class="sans bold">
 Annalee Newitz
 </p>,
 <p class="sans bold">
 John Colapinto
 </p>,
 <p class="sans bold">
 David W. Brown
 </p>,
 <p class="sans bold">
 Michael E. Mann
 </p>]

In [39]:
authors_name = []
for i in author[0:5]:
    authors_name.append(i.text.replace('\n',""))
authors_name

['Walter Isaacson',
 'Annalee Newitz',
 'John Colapinto',
 'David W. Brown',
 'Michael E. Mann']

In [40]:
genre = soup.find_all('p',class_='genre-links hidden-phone')
genre[0:5]

[<p class="genre-links hidden-phone">
 <a href="/search?book_genre=nonfiction">Nonfiction</a>
  / 
 <a href="/search?book_genre=biography">Biography</a>
  / 
 <a href="/search?book_genre=science">Science</a>
 </p>,
 <p class="genre-links hidden-phone">
 <a href="/search?book_genre=nonfiction">Nonfiction</a>
  / 
 <a href="/search?book_genre=history">History</a>
  / 
 <a href="/search?book_genre=science">Science</a>
 </p>,
 <p class="genre-links hidden-phone">
 <a href="/search?book_genre=nonfiction">Nonfiction</a>
  / 
 <a href="/search?book_genre=science">Science</a>
  / 
 <a href="/search?book_genre=anatomy">Anatomy</a>
 </p>,
 <p class="genre-links hidden-phone">
 <a href="/search?book_genre=nonfiction">Nonfiction</a>
  / 
 <a href="/search?book_genre=science">Science</a>
  / 
 <a href="/search?book_genre=history">History</a>
 </p>,
 <p class="genre-links hidden-phone">
 <a href="/search?book_genre=nonfiction">Nonfiction</a>
  / 
 <a href="/search?book_genre=science">Science</a>
  /

In [41]:
genres =  []
for i in genre[0:5]:
    genres.append(i.text.replace("\n",""))
genres

['Nonfiction / Biography / Science',
 'Nonfiction / History / Science',
 'Nonfiction / Science / Anatomy',
 'Nonfiction / Science / History',
 'Nonfiction / Science / Science & Nature']

In [42]:
# extract review of each book. Every reviews are in different web pages.
page1 = requests.get("https://bookpage.com/reviews/26071-walter-isaacson-code-breaker-nonfiction#.YFSh9K8zbIU")
page2 = requests.get("https://bookpage.com/reviews/25912-annalee-newitz-four-lost-cities-nonfiction#.YFSiFq8zbIU")
page3 = requests.get("https://bookpage.com/reviews/25859-john-colapinto-this-voice-nonfiction#.YFSiK68zbIU")
page4 = requests.get("https://bookpage.com/reviews/25845-david-w-brown-mission-nonfiction#.YFSiPK8zbIU")
page5 = requests.get("https://bookpage.com/reviews/25794-michael-e-mann-new-climate-war-nonfiction#.YFSiS68zbIU")
print(page1)
print(page2)
print(page3)
print(page4)
print(page5)

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


In [43]:
print(page1.content)
print(page2.content)
print(page3.content)
print(page4.content)
print(page5.content)

b'<!DOCTYPE html>\n<html lang=\'en\' xmlns:fb=\'http://www.facebook.com/2008/fbml\' xmlns:og=\'http://opengraphprotocol.org/schema/\'>\n<head>\n<title>Book Review - The Code Breaker by Walter Isaacson | BookPage</title>\n<meta name="description" content="Like Lab Girl on steroids, The Code Breaker paints a detailed picture of how Nobel Prize-winning scientist Jennifer Douda works.">\n<link rel="canonical" href="https://bookpage.com/reviews/26071-walter-isaacson-code-breaker-biography">\n<meta property="og:site_name" content="BookPage.com">\n<meta property="og:title" content="Book Review - The Code Breaker by Walter Isaacson">\n<meta property="og:description" content="Like Lab Girl on steroids, The Code Breaker paints a detailed picture of how Nobel Prize-winning scientist Jennifer Douda works.">\n<meta property="og:type" content="article">\n<meta property="og:url" content="https://bookpage.com/reviews/26071-walter-isaacson-code-breaker-biography">\n<meta property="og:image" content="/i

In [44]:
soup1 = BeautifulSoup(page1.content,'html.parser')
soup2 = BeautifulSoup(page2.content,'html.parser')
soup3 = BeautifulSoup(page3.content,'html.parser')
soup4 = BeautifulSoup(page4.content,'html.parser')
soup5 = BeautifulSoup(page5.content,'html.parser')
print(soup1)
print(soup2)
print(soup3)
print(soup4)
print(soup5)

<!DOCTYPE html>

<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<title>Book Review - The Code Breaker by Walter Isaacson | BookPage</title>
<meta content="Like Lab Girl on steroids, The Code Breaker paints a detailed picture of how Nobel Prize-winning scientist Jennifer Douda works." name="description"/>
<link href="https://bookpage.com/reviews/26071-walter-isaacson-code-breaker-biography" rel="canonical"/>
<meta content="BookPage.com" property="og:site_name"/>
<meta content="Book Review - The Code Breaker by Walter Isaacson" property="og:title"/>
<meta content="Like Lab Girl on steroids, The Code Breaker paints a detailed picture of how Nobel Prize-winning scientist Jennifer Douda works." property="og:description"/>
<meta content="article" property="og:type"/>
<meta content="https://bookpage.com/reviews/26071-walter-isaacson-code-breaker-biography" property="og:url"/>
<meta content="/images/medium/missing.png" prope

In [45]:
review1 = soup1.find_all('div',class_='article-body')
review2 = soup2.find_all('div',class_='article-body')
review3 = soup3.find_all('div',class_='article-body')
review4 = soup4.find_all('div',class_='article-body')
review5 = soup5.find_all('div',class_='article-body')
print(review1)
print(review2)
print(review3)
print(review4)
print(review5)

[<div class="article-body">
<p>
<p dir="ltr">Thank goodness Jennifer Doudna didn’t listen to her high school guidance counselor, who told her that girls don’t do science. Instead, Doudna followed her passion and pursued biochemistry, inspired by her childhood explorations of beaches, meadows and lava flow caves in her hometown of Hilo, Hawaii. When Doudna read James Watson’s book <em>The Double Helix</em> as a sixth grader, she realized that “science can be very exciting, like being on a trail of a cool mystery and you’re getting a clue here and a clue there. And then you put the pieces together.”</p>
<p dir="ltr">That’s exactly the feeling you’ll have while reading Walter Isaacson’s marvelous biography <strong>The Code Breaker: Jennifer Doudna, Gene Editing, and the Future of the Human Race</strong>. It’s a hefty but inspiring book that chronicles Doudna’s and others’ development of the gene-editing tool CRISPR. With his dynamic and formidable style, Isaacson explains the long scienti

In [46]:
books_review=[]
for i in review1:
    books_review.append(i.text.replace('\n',""))
for j in review2:
    books_review.append(j.text.replace('\n',""))
for k in review3:
    books_review.append(k.text.replace('\n',""))
for l in review4:
    books_review.append(l.text.replace('\n',""))
for m in review5:
    books_review.append(m.text.replace('\n',""))
books_review

['Thank goodness Jennifer Doudna didn’t listen to her high school guidance counselor, who told her that girls don’t do science. Instead, Doudna followed her passion and pursued biochemistry, inspired by her childhood explorations of beaches, meadows and lava flow caves in her hometown of Hilo, Hawaii. When Doudna read James Watson’s book The Double Helix as a sixth grader, she realized that “science can be very exciting, like being on a trail of a cool mystery and you’re getting a clue here and a clue there. And then you put the pieces together.”That’s exactly the feeling you’ll have while reading Walter Isaacson’s marvelous biography The Code Breaker: Jennifer Doudna, Gene Editing, and the Future of the Human Race. It’s a hefty but inspiring book that chronicles Doudna’s and others’ development of the gene-editing tool CRISPR. With his dynamic and formidable style, Isaacson explains the long scientific journey that led to this tool’s discovery and the exciting developments that have f

In [47]:
df2 = pd.DataFrame({})
df2['Books_name'] = books_name
df2["Authors_name"] = authors_name
df2["Genres"] = genres
df2["Review"] = books_review
df2

Unnamed: 0,Books_name,Authors_name,Genres,Review
0,The Code Breaker,Walter Isaacson,Nonfiction / Biography / Science,Thank goodness Jennifer Doudna didn’t listen t...
1,Four Lost Cities,Annalee Newitz,Nonfiction / History / Science,When Hurricane Katrina devastated New Orleans ...
2,This Is the Voice,John Colapinto,Nonfiction / Science / Anatomy,After almost ruining his voice while trying to...
3,The Mission,David W. Brown,Nonfiction / Science / History,The possibility that other life could exist wi...
4,The New Climate War,Michael E. Mann,Nonfiction / Science / Science & Nature,Given the recent increase in extreme weather e...


### 5. Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’. You have to scrape:

#### i) Top 10 ODI teams in men’s cricket along with the records for matches, points and rating.

In [48]:
page = requests.get("https://www.icc-cricket.com/rankings/mens/team-rankings/odi")
page

<Response [200]>

In [49]:
page.content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n    <meta name="twitter:title" content="ICC Ranking for ODI teams International Cricket Council"/>\n<meta property="og:type" content="website"/>\n<meta property="twitter:card" content="summary_large_image"/>\n<meta name="description" content="Official International Cricket Council ranking for One Day International (ODI) cricket teams. Discover latest ICC rankings table, predict upcoming matches, see points and ratings for all teams."/>\n<meta property="twitter:site" content="@icc"/>\n<meta name="twitter:description" content="Official International Cricket Council ranking for One Day International (ODI) cricket teams. Discover latest ICC rankings table, predict upcoming matches, see points and ratings for all teams."/>\n<meta name="twitter:image" content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg"/>\n<meta property="og:title" content="ICC Ranking for ODI teams International Cricket Council"/>\n<meta property

In [50]:
soup = BeautifulSoup(page.content,'html.parser')
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="ICC Ranking for ODI teams International Cricket Council" name="twitter:title"/>
<meta content="website" property="og:type"/>
<meta content="summary_large_image" property="twitter:card"/>
<meta content="Official International Cricket Council ranking for One Day International (ODI) cricket teams. Discover latest ICC rankings table, predict upcoming matches, see points and ratings for all teams." name="description"/>
<meta content="@icc" property="twitter:site"/>
<meta content="Official International Cricket Council ranking for One Day International (ODI) cricket teams. Discover latest ICC rankings table, predict upcoming matches, see points and ratings for all teams." name="twitter:description"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" name="twitter:image"/>
<meta content="ICC Ranking for ODI teams International Cricket Council" property="og:title"/>
<meta content="https://www.icc-c

In [51]:
teams = soup.find_all('span',class_='u-hide-phablet')
print(teams[0:19])

matches_1 = soup.find_all('td',class_="rankings-block__banner--matches")
print(matches_1)
matches = soup.find_all('td',class_="table-body__cell u-center-text")
print(matches[0:18])

points_1= soup.find_all('td',class_="rankings-block__banner--points")
print(points_1)
points = soup.find_all('td',class_="table-body__cell u-center-text")
print(points[0:18])

ratings_1 = soup.find_all('td',class_="rankings-block__banner--rating u-text-right")
print(ratings_1)
ratings = soup.find_all('td',class_="table-body__cell u-text-right rating")
print(ratings[0:18])

[<span class="u-hide-phablet">England</span>, <span class="u-hide-phablet">India</span>, <span class="u-hide-phablet">New Zealand</span>, <span class="u-hide-phablet">Australia</span>, <span class="u-hide-phablet">South Africa</span>, <span class="u-hide-phablet">Pakistan</span>, <span class="u-hide-phablet">Bangladesh</span>, <span class="u-hide-phablet">Sri Lanka</span>, <span class="u-hide-phablet">West Indies</span>, <span class="u-hide-phablet">Afghanistan</span>, <span class="u-hide-phablet">Ireland</span>, <span class="u-hide-phablet">Netherlands</span>, <span class="u-hide-phablet">Zimbabwe</span>, <span class="u-hide-phablet">Oman</span>, <span class="u-hide-phablet">Scotland</span>, <span class="u-hide-phablet">UAE</span>, <span class="u-hide-phablet">Nepal</span>, <span class="u-hide-phablet">Namibia</span>, <span class="u-hide-phablet">United States</span>]
[<td class="rankings-block__banner--matches">44</td>]
[<td class="table-body__cell u-center-text">52</td>, <td class="

In [52]:
team=[]
for i in teams[0:19]:
    team.append(i.text)
print(team)

match=[]
for m in matches_1:
    match.append(m.text)
for j in matches[0:18]:
    match.append(j.text)
print(match)

point=[]
for n in points_1:
    point.append(n.text)
for k in points[0:18]:
    point.append(k.text)
print(point)

rating=[]
for o in ratings_1:
    rating.append(o.text.replace("\n                            123\n                            \n\n","123"))
for l in ratings[0:18]:
    rating.append(l.text)
print(rating)

['England', 'India', 'New Zealand', 'Australia', 'South Africa', 'Pakistan', 'Bangladesh', 'Sri Lanka', 'West Indies', 'Afghanistan', 'Ireland', 'Netherlands', 'Zimbabwe', 'Oman', 'Scotland', 'UAE', 'Nepal', 'Namibia', 'United States']
['44', '52', '6,102', '33', '3,857', '39', '4,344', '31', '3,345', '35', '3,490', '38', '3,432', '42', '3,372', '49', '3,802', '31', '1,844']
['5,405', '52', '6,102', '33', '3,857', '39', '4,344', '31', '3,345', '35', '3,490', '38', '3,432', '42', '3,372', '49', '3,802', '31', '1,844']
['123', '117', '117', '111', '108', '100', '90', '80', '78', '59', '45', '44', '42', '40', '26', '21', '18', '17', '13']


In [53]:
del team[10:]
del match[2:20:2]
del point[1:20:2]
del rating[10:]

In [54]:
df = pd.DataFrame({})
df['Ranking'] = list(range(1,11))
df['Teams'] = team
df['Matches'] = match
df['Points'] = point
df['Ratings'] = rating
df

Unnamed: 0,Ranking,Teams,Matches,Points,Ratings
0,1,England,44,5405,123
1,2,India,52,6102,117
2,3,New Zealand,33,3857,117
3,4,Australia,39,4344,111
4,5,South Africa,31,3345,108
5,6,Pakistan,35,3490,100
6,7,Bangladesh,38,3432,90
7,8,Sri Lanka,42,3372,80
8,9,West Indies,49,3802,78
9,10,Afghanistan,31,1844,59


#### ii) Top 10 ODI Batsmen in men along with the records of their team and rating.

In [55]:
page = requests.get("https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting")
page

<Response [200]>

In [56]:
page.content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n    <meta name="twitter:title" content="Live Cricket Scores & News International Cricket Council"/>\n<meta property="og:type" content="website"/>\n<meta property="twitter:card" content="summary_large_image"/>\n<meta name="description" content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council."/>\n<meta property="twitter:site" content="@icc"/>\n<meta name="twitter:description" content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council."/>\n<meta name="twitter:image" content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg"/>\n<meta property="og:title" content="Live Cricket Scores & News International Cricket Council"/>\n<meta property="og:image" content="https://www.icc-cricket.com/resources/ver/i/elements/defaul

In [57]:
soup = BeautifulSoup(page.content,'html.parser')
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="Live Cricket Scores &amp; News International Cricket Council" name="twitter:title"/>
<meta content="website" property="og:type"/>
<meta content="summary_large_image" property="twitter:card"/>
<meta content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council." name="description"/>
<meta content="@icc" property="twitter:site"/>
<meta content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council." name="twitter:description"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" name="twitter:image"/>
<meta content="Live Cricket Scores &amp; News International Cricket Council" property="og:title"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" property="og:

In [58]:
name_1 = soup.find_all('div',class_='rankings-block__banner--name-large')
print(name_1)
name = soup.find_all('td',class_="table-body__cell rankings-table__name name")
print(name[0:9])

country_1 = soup.find_all('div',class_='rankings-block__banner--nationality')
print(country_1)
country = soup.find_all('span',class_="table-body__logo-text")
print(country[0:9])

rating_1 = soup.find_all('div',class_='rankings-block__banner--rating')
print(rating_1)
rating = soup.find_all('td',class_="table-body__cell rating")
print(rating[0:9])

record_1 = soup.find_all('span',class_="rankings-block__career-best-text")
print(record_1)
record = soup.find_all('td',class_="table-body__cell u-text-right u-hide-phablet")
print(record[0:9])

[<div class="rankings-block__banner--name-large">Virat Kohli</div>]
[<td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/107">Rohit Sharma</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/2759">Babar Azam</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/226">Ross Taylor</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/167">Aaron Finch</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/24">Francois du Plessis</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/170">David Warner</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/2751">Shai Hope</a>
</td>, <td class="table-body__cell rankings-table__name name">

In [59]:
Player_names=[]
for i in name_1:
    Player_names.append(i.text)
for j in name[0:9]:
    Player_names.append(j.text.replace("\n",""))
print(Player_names)

Country=[]
for k in country_1:
    Country.append(k.text.replace("\n\nIND\n","IND"))
for l in country[0:9]:
    Country.append(l.text)
print(Country)

Ratings=[]
for m in rating_1:
    Ratings.append(m.text)
for n in rating[0:9]:
    Ratings.append(n.text)
print(Ratings)

records=[]
for o in record_1:
    records.append(o.text.replace("\n                                911 v England, 12/07/2018\n                        ","911 v England, 12/07/2018"))
for p in record[0:9]:
    records.append(p.text.replace("\n",""))
Records_1=[]
for q in records:
    Records_1.append(q.replace("                        ",""))
Records=[]
for r in Records_1:
    Records.append(r.replace("        ",""))
print(Records)

['Virat Kohli', 'Rohit Sharma', 'Babar Azam', 'Ross Taylor', 'Aaron Finch', 'Francois du Plessis', 'David Warner', 'Shai Hope', 'Kane Williamson', 'Quinton de Kock']
['IND', 'IND', 'PAK', 'NZ', 'AUS', 'SA', 'AUS', 'WI', 'NZ', 'SA']
['870', '842', '837', '818', '791', '790', '773', '773', '765', '755']
['911 v England, 12/07/2018', '885 v Sri Lanka, 06/07/2019', '846 v Sri Lanka, 20/10/2017', '841 v Bangladesh, 05/06/2019', '798 v England, 25/06/2019', '820 v Australia, 06/07/2019', '880 v Pakistan, 26/01/2017', '808 v Bangladesh, 17/05/2019', '799 v India, 09/07/2019', '813 v Sri Lanka, 10/03/2019']


In [60]:
df = pd.DataFrame({})
df['Rank'] = list(range(1,11))
df['Player_names'] = Player_names
df['Country'] = Country
df['Ratings'] = Ratings
df['Records'] = Records
df

Unnamed: 0,Rank,Player_names,Country,Ratings,Records
0,1,Virat Kohli,IND,870,"911 v England, 12/07/2018"
1,2,Rohit Sharma,IND,842,"885 v Sri Lanka, 06/07/2019"
2,3,Babar Azam,PAK,837,"846 v Sri Lanka, 20/10/2017"
3,4,Ross Taylor,NZ,818,"841 v Bangladesh, 05/06/2019"
4,5,Aaron Finch,AUS,791,"798 v England, 25/06/2019"
5,6,Francois du Plessis,SA,790,"820 v Australia, 06/07/2019"
6,7,David Warner,AUS,773,"880 v Pakistan, 26/01/2017"
7,8,Shai Hope,WI,773,"808 v Bangladesh, 17/05/2019"
8,9,Kane Williamson,NZ,765,"799 v India, 09/07/2019"
9,10,Quinton de Kock,SA,755,"813 v Sri Lanka, 10/03/2019"


#### iii) Top 10 ODI bowlers along with the records of their team and rating.

In [61]:
page = requests.get("https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling")
page

<Response [200]>

In [62]:
page.content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n    <meta name="twitter:title" content="Live Cricket Scores & News International Cricket Council"/>\n<meta property="og:type" content="website"/>\n<meta property="twitter:card" content="summary_large_image"/>\n<meta name="description" content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council."/>\n<meta property="twitter:site" content="@icc"/>\n<meta name="twitter:description" content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council."/>\n<meta name="twitter:image" content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg"/>\n<meta property="og:title" content="Live Cricket Scores & News International Cricket Council"/>\n<meta property="og:image" content="https://www.icc-cricket.com/resources/ver/i/elements/defaul

In [63]:
soup = BeautifulSoup(page.content,'html.parser')
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="Live Cricket Scores &amp; News International Cricket Council" name="twitter:title"/>
<meta content="website" property="og:type"/>
<meta content="summary_large_image" property="twitter:card"/>
<meta content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council." name="description"/>
<meta content="@icc" property="twitter:site"/>
<meta content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council." name="twitter:description"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" name="twitter:image"/>
<meta content="Live Cricket Scores &amp; News International Cricket Council" property="og:title"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" property="og:

In [64]:
name_1 = soup.find_all('div',class_='rankings-block__banner--name-large')
print(name_1)
name = soup.find_all('td',class_="table-body__cell rankings-table__name name")
print(name[0:9])

country_1 = soup.find_all('div',class_='rankings-block__banner--nationality')
print(country_1)
country = soup.find_all('span',class_="table-body__logo-text")
print(country[0:9])

rating_1 = soup.find_all('div',class_='rankings-block__banner--rating')
print(rating_1)
rating = soup.find_all('td',class_="table-body__cell rating")
print(rating[0:9])

record_1 = soup.find_all('span',class_="rankings-block__career-best-text")
print(record_1)
record = soup.find_all('td',class_="table-body__cell u-text-right u-hide-phablet")
print(record[0:9])

[<div class="rankings-block__banner--name-large">Trent Boult</div>]
[<td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/4572">Mujeeb Ur Rahman</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/1124">Jasprit Bumrah</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/1597">Mehedi Hasan</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/967">Chris Woakes</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/1664">Kagiso Rabada</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/857">Josh Hazlewood</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/mens/player-rankings/1594">Mustafizur Rahman</a>
</td>, <td class="table-body__cell rankings-

In [65]:
Player_names=[]
for i in name_1:
    Player_names.append(i.text)
for j in name[0:9]:
    Player_names.append(j.text.replace("\n",""))
print(Player_names)

Country=[]
for k in country_1:
    Country.append(k.text.replace("\n\nNZ\n","NZ"))
for l in country[0:9]:
    Country.append(l.text)
print(Country)

Ratings=[]
for m in rating_1:
    Ratings.append(m.text)
for n in rating[0:9]:
    Ratings.append(n.text)
print(Ratings)

records=[]
for o in record_1:
    records.append(o.text.replace("\n                                770 v West Indies, 22/06/2019\n                        ","770 v West Indies, 22/06/2019"))
for p in record[0:9]:
    records.append(p.text.replace("\n",""))
Records_1=[]
for q in records:
    Records_1.append(q.replace("                        ",""))
Records=[]
for r in Records_1:
    Records.append(r.replace("        ",""))
print(Records)

['Trent Boult', 'Mujeeb Ur Rahman', 'Jasprit Bumrah', 'Mehedi Hasan', 'Chris Woakes', 'Kagiso Rabada', 'Josh Hazlewood', 'Mustafizur Rahman', 'Mohammad Amir', 'Pat Cummins']
['NZ', 'AFG', 'IND', 'BAN', 'ENG', 'SA', 'AUS', 'BAN', 'PAK', 'AUS']
['722', '708', '700', '694', '675', '665', '660', '658', '647', '646']
['770 v West Indies, 22/06/2019', '712 v Ireland, 24/01/2021', '841 v West Indies, 01/11/2018', '694 v West Indies, 25/01/2021', '676 v New Zealand, 14/07/2019', '724 v England, 29/05/2017', '733 v England, 26/01/2018', '695 v West Indies, 14/12/2018', '663 v Sri Lanka, 02/10/2019', '729 v Pakistan, 12/06/2019']


In [66]:
df = pd.DataFrame({})
df['Rank'] = list(range(1,11))
df['Player_names'] = Player_names
df['Country'] = Country
df['Ratings'] = Ratings
df['Records'] = Records
df

Unnamed: 0,Rank,Player_names,Country,Ratings,Records
0,1,Trent Boult,NZ,722,"770 v West Indies, 22/06/2019"
1,2,Mujeeb Ur Rahman,AFG,708,"712 v Ireland, 24/01/2021"
2,3,Jasprit Bumrah,IND,700,"841 v West Indies, 01/11/2018"
3,4,Mehedi Hasan,BAN,694,"694 v West Indies, 25/01/2021"
4,5,Chris Woakes,ENG,675,"676 v New Zealand, 14/07/2019"
5,6,Kagiso Rabada,SA,665,"724 v England, 29/05/2017"
6,7,Josh Hazlewood,AUS,660,"733 v England, 26/01/2018"
7,8,Mustafizur Rahman,BAN,658,"695 v West Indies, 14/12/2018"
8,9,Mohammad Amir,PAK,647,"663 v Sri Lanka, 02/10/2019"
9,10,Pat Cummins,AUS,646,"729 v Pakistan, 12/06/2019"


### 6. Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’. You have to scrape:

#### i) Top 10 ODI teams in women’s cricket along with the records for matches, points and rating.

In [67]:
page = requests.get("https://www.icc-cricket.com/rankings/womens/team-rankings/odi")
page

<Response [200]>

In [68]:
page.content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n    <meta name="twitter:title" content="ICC Ranking for ODI teams International Cricket Council"/>\n<meta property="og:type" content="website"/>\n<meta property="twitter:card" content="summary_large_image"/>\n<meta name="description" content="Official International Cricket Council rankings for test match cricket teams. Discover latest ICC rankings table, predict upcoming matches, see points and ratings for all teams."/>\n<meta property="twitter:site" content="@icc"/>\n<meta name="twitter:description" content="Official International Cricket Council rankings for test match cricket teams. Discover latest ICC rankings table, predict upcoming matches, see points and ratings for all teams."/>\n<meta name="twitter:image" content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg"/>\n<meta property="og:title" content="ICC Ranking for ODI teams International Cricket Council"/>\n<meta property="og:image" content="https://www

In [69]:
soup = BeautifulSoup(page.content,'html.parser')
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="ICC Ranking for ODI teams International Cricket Council" name="twitter:title"/>
<meta content="website" property="og:type"/>
<meta content="summary_large_image" property="twitter:card"/>
<meta content="Official International Cricket Council rankings for test match cricket teams. Discover latest ICC rankings table, predict upcoming matches, see points and ratings for all teams." name="description"/>
<meta content="@icc" property="twitter:site"/>
<meta content="Official International Cricket Council rankings for test match cricket teams. Discover latest ICC rankings table, predict upcoming matches, see points and ratings for all teams." name="twitter:description"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" name="twitter:image"/>
<meta content="ICC Ranking for ODI teams International Cricket Council" property="og:title"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/eleme

In [70]:
teams = soup.find_all('span',class_='u-hide-phablet')
print(teams[0:19])

matches_1 = soup.find_all('td',class_="rankings-block__banner--matches")
print(matches_1)
matches = soup.find_all('td',class_="table-body__cell u-center-text")
print(matches[0:18])

points_1= soup.find_all('td',class_="rankings-block__banner--points")
print(points_1)
points = soup.find_all('td',class_="table-body__cell u-center-text")
print(points[0:18])

ratings_1 = soup.find_all('td',class_="rankings-block__banner--rating u-text-right")
print(ratings_1)
ratings = soup.find_all('td',class_="table-body__cell u-text-right rating")
print(ratings[0:18])

[<span class="u-hide-phablet">Australia</span>, <span class="u-hide-phablet">South Africa</span>, <span class="u-hide-phablet">England</span>, <span class="u-hide-phablet">India</span>, <span class="u-hide-phablet">New Zealand</span>, <span class="u-hide-phablet">West Indies</span>, <span class="u-hide-phablet">Pakistan</span>, <span class="u-hide-phablet">Bangladesh</span>, <span class="u-hide-phablet">Sri Lanka</span>, <span class="u-hide-phablet">Ireland</span>, <span class="icn icn-facebook-circle u-hide-phablet"></span>, <span class="icn icn-twitter-circle u-hide-phablet"></span>, <span class="icn icn-instagram-circle u-hide-phablet"></span>, <span class="icn icn-youtube-circle u-hide-phablet"></span>, <span class="icn icn-tiktok-circle u-hide-phablet"></span>]
[<td class="rankings-block__banner--matches">15</td>]
[<td class="table-body__cell u-center-text">24</td>, <td class="table-body__cell u-center-text">2,828</td>, <td class="table-body__cell u-center-text">17</td>, <td class

In [71]:
team=[]
for i in teams[0:19]:
    team.append(i.text)
print(team)

match=[]
for m in matches_1:
    match.append(m.text)
for j in matches[0:18]:
    match.append(j.text)
print(match)

point=[]
for n in points_1:
    point.append(n.text)
for k in points[0:18]:
    point.append(k.text)
print(point)

rating=[]
for o in ratings_1:
    rating.append(o.text.replace("\n                            162\n                            \n\n","162"))
for l in ratings[0:18]:
    rating.append(l.text)
print(rating)

['Australia', 'South Africa', 'England', 'India', 'New Zealand', 'West Indies', 'Pakistan', 'Bangladesh', 'Sri Lanka', 'Ireland', '', '', '', '', '']
['15', '24', '2,828', '17', '1,993', '20', '2,226', '18', '1,696', '12', '1,025', '15', '1,101', '5', '306', '11', '519', '2', '25']
['2,436', '24', '2,828', '17', '1,993', '20', '2,226', '18', '1,696', '12', '1,025', '15', '1,101', '5', '306', '11', '519', '2', '25']
['162', '118', '117', '111', '94', '85', '73', '61', '47', '13']


In [72]:
del team[10:]
del match[2:20:2]
del point[1:20:2]
del rating[10:]

In [73]:
df = pd.DataFrame({})
df['Ranking'] = list(range(1,11))
df['Teams'] = team
df['Matches'] = match
df['Points'] = point
df['Ratings'] = rating
df

Unnamed: 0,Ranking,Teams,Matches,Points,Ratings
0,1,Australia,15,2436,162
1,2,South Africa,24,2828,118
2,3,England,17,1993,117
3,4,India,20,2226,111
4,5,New Zealand,18,1696,94
5,6,West Indies,12,1025,85
6,7,Pakistan,15,1101,73
7,8,Bangladesh,5,306,61
8,9,Sri Lanka,11,519,47
9,10,Ireland,2,25,13


#### ii) Top 10 women’s ODI players along with the records of their team and rating.

In [74]:
page=requests.get("https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting")
page

<Response [200]>

In [75]:
page.content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n    <meta name="twitter:title" content="Live Cricket Scores & News International Cricket Council"/>\n<meta property="og:type" content="website"/>\n<meta property="twitter:card" content="summary_large_image"/>\n<meta name="description" content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council."/>\n<meta property="twitter:site" content="@icc"/>\n<meta name="twitter:description" content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council."/>\n<meta name="twitter:image" content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg"/>\n<meta property="og:title" content="Live Cricket Scores & News International Cricket Council"/>\n<meta property="og:image" content="https://www.icc-cricket.com/resources/ver/i/elements/defaul

In [76]:
soup = BeautifulSoup(page.content,"html.parser")
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="Live Cricket Scores &amp; News International Cricket Council" name="twitter:title"/>
<meta content="website" property="og:type"/>
<meta content="summary_large_image" property="twitter:card"/>
<meta content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council." name="description"/>
<meta content="@icc" property="twitter:site"/>
<meta content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council." name="twitter:description"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" name="twitter:image"/>
<meta content="Live Cricket Scores &amp; News International Cricket Council" property="og:title"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" property="og:

In [77]:
name_1 = soup.find_all('div',class_='rankings-block__banner--name-large')
print(name_1)
name = soup.find_all('td',class_="table-body__cell rankings-table__name name")
print(name[0:9])

country_1 = soup.find_all('div',class_='rankings-block__banner--nationality')
print(country_1)
country = soup.find_all('span',class_="table-body__logo-text")
print(country[0:9])

rating_1 = soup.find_all('div',class_='rankings-block__banner--rating')
print(rating_1)
rating = soup.find_all('td',class_="table-body__cell rating")
print(rating[0:9])

record_1 = soup.find_all('span',class_="rankings-block__career-best-text")
print(record_1)
record = soup.find_all('td',class_="table-body__cell u-text-right u-hide-phablet")
print(record[0:9])

[<div class="rankings-block__banner--name-large">Lizelle Lee</div>]
[<td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/545">Tammy Beaumont</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/469">Meg Lanning</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/573">Stafanie Taylor</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/466">Alyssa Healy</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/487">Amy Satterthwaite</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/1809">Smriti Mandhana</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/3176">Laura Wolvaardt</a>
</td>, <td class="table-body__ce

In [78]:
Player_names=[]
for i in name_1:
    Player_names.append(i.text)
for j in name[0:9]:
    Player_names.append(j.text.replace("\n",""))
print(Player_names)

Country=[]
for k in country_1:
    Country.append(k.text.replace("\n\nSA\n","SA"))
for l in country[0:9]:
    Country.append(l.text)
print(Country)

Ratings=[]
for m in rating_1:
    Ratings.append(m.text)
for n in rating[0:9]:
    Ratings.append(n.text)
print(Ratings)

records=[]
for o in record_1:
    records.append(o.text.replace("\n                                773 v India, 14/03/2021\n                        ","773 v India, 14/03/2021"))
for p in record[0:9]:
    records.append(p.text.replace("\n",""))
Records_1=[]
for q in records:
    Records_1.append(q.replace("                        ",""))
Records=[]
for r in Records_1:
    Records.append(r.replace("        ",""))
print(Records)

['Lizelle Lee', 'Tammy Beaumont', 'Meg Lanning', 'Stafanie Taylor', 'Alyssa Healy', 'Amy Satterthwaite', 'Smriti Mandhana', 'Laura Wolvaardt', 'Mithali Raj', 'Ellyse Perry']
['SA', 'ENG', 'AUS', 'WI', 'AUS', 'NZ', 'IND', 'SA', 'IND', 'AUS']
['773', '765', '749', '746', '741', '740', '719', '699', '693', '691']
['773 v India, 14/03/2021', '765 v New Zealand, 28/02/2021', '834 v New Zealand, 24/02/2016', '765 v India, 02/03/2012', '741 v New Zealand, 07/10/2020', '756 v Australia, 02/03/2017', '797 v England, 28/02/2019', '725 v India, 07/03/2021', '839 v Australia, 24/12/2004', '766 v West Indies, 11/09/2019']


In [79]:
df = pd.DataFrame({})
df['Rank'] = list(range(1,11))
df['Player_names'] = Player_names
df['Country'] = Country
df['Ratings'] = Ratings
df['Records'] = Records
df

Unnamed: 0,Rank,Player_names,Country,Ratings,Records
0,1,Lizelle Lee,SA,773,"773 v India, 14/03/2021"
1,2,Tammy Beaumont,ENG,765,"765 v New Zealand, 28/02/2021"
2,3,Meg Lanning,AUS,749,"834 v New Zealand, 24/02/2016"
3,4,Stafanie Taylor,WI,746,"765 v India, 02/03/2012"
4,5,Alyssa Healy,AUS,741,"741 v New Zealand, 07/10/2020"
5,6,Amy Satterthwaite,NZ,740,"756 v Australia, 02/03/2017"
6,7,Smriti Mandhana,IND,719,"797 v England, 28/02/2019"
7,8,Laura Wolvaardt,SA,699,"725 v India, 07/03/2021"
8,9,Mithali Raj,IND,693,"839 v Australia, 24/12/2004"
9,10,Ellyse Perry,AUS,691,"766 v West Indies, 11/09/2019"


#### iii) Top 10 women’s ODI all-rounder along with the records of their team and rating.

In [80]:
page=requests.get("https://www.icc-cricket.com/rankings/womens/player-rankings/odi/all-rounder")
page

<Response [200]>

In [81]:
page.content

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n    <meta name="twitter:title" content="Live Cricket Scores & News International Cricket Council"/>\n<meta property="og:type" content="website"/>\n<meta property="twitter:card" content="summary_large_image"/>\n<meta name="description" content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council."/>\n<meta property="twitter:site" content="@icc"/>\n<meta name="twitter:description" content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council."/>\n<meta name="twitter:image" content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg"/>\n<meta property="og:title" content="Live Cricket Scores & News International Cricket Council"/>\n<meta property="og:image" content="https://www.icc-cricket.com/resources/ver/i/elements/defaul

In [82]:
soup = BeautifulSoup(page.content,"html.parser")
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="Live Cricket Scores &amp; News International Cricket Council" name="twitter:title"/>
<meta content="website" property="og:type"/>
<meta content="summary_large_image" property="twitter:card"/>
<meta content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council." name="description"/>
<meta content="@icc" property="twitter:site"/>
<meta content="Official ICC Cricket website - live matches, scores, news, highlights, commentary, rankings, videos and fixtures from the International Cricket Council." name="twitter:description"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" name="twitter:image"/>
<meta content="Live Cricket Scores &amp; News International Cricket Council" property="og:title"/>
<meta content="https://www.icc-cricket.com/resources/ver/i/elements/default-thumbnail.jpg" property="og:

In [83]:
name_1 = soup.find_all('div',class_='rankings-block__banner--name-large')
print(name_1)
name = soup.find_all('td',class_="table-body__cell rankings-table__name name")
print(name[0:9])

country_1 = soup.find_all('div',class_='rankings-block__banner--nationality')
print(country_1)
country = soup.find_all('span',class_="table-body__logo-text")
print(country[0:9])

rating_1 = soup.find_all('div',class_='rankings-block__banner--rating')
print(rating_1)
rating = soup.find_all('td',class_="table-body__cell rating")
print(rating[0:9])

record_1 = soup.find_all('span',class_="rankings-block__career-best-text")
print(record_1)
record = soup.find_all('td',class_="table-body__cell u-text-right u-hide-phablet")
print(record[0:9])

[<div class="rankings-block__banner--name-large">Ellyse Perry</div>]
[<td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/573">Stafanie Taylor</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/424">Marizanne Kapp</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/3192">Deepti Sharma</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/1800">Natalie Sciver</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/468">Jess Jonassen</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/478">Sophie Devine</a>
</td>, <td class="table-body__cell rankings-table__name name">
<a href="/rankings/womens/player-rankings/430">Dane van Niekerk</a>
</td>, <td class="table-body__ce

In [84]:
Player_names=[]
for i in name_1:
    Player_names.append(i.text)
for j in name[0:9]:
    Player_names.append(j.text.replace("\n",""))
print(Player_names)

Country=[]
for k in country_1:
    Country.append(k.text.replace("\n\nAUS\n","AUS"))
for l in country[0:9]:
    Country.append(l.text)
print(Country)

Ratings=[]
for m in rating_1:
    Ratings.append(m.text)
for n in rating[0:9]:
    Ratings.append(n.text)
print(Ratings)

records=[]
for o in record_1:
    records.append(o.text.replace("\n                                548 v West Indies, 11/09/2019\n                        ","548 v West Indies, 11/09/2019"))
for p in record[0:9]:
    records.append(p.text.replace("\n",""))
Records_1=[]
for q in records:
    Records_1.append(q.replace("                        ",""))
Records=[]
for r in Records_1:
    Records.append(r.replace("        ",""))
print(Records)

['Ellyse Perry', 'Stafanie Taylor', 'Marizanne Kapp', 'Deepti Sharma', 'Natalie Sciver', 'Jess Jonassen', 'Sophie Devine', 'Dane van Niekerk', 'Katherine Brunt', 'Ashleigh Gardner']
['AUS', 'WI', 'SA', 'IND', 'ENG', 'AUS', 'NZ', 'SA', 'ENG', 'AUS']
['460', '410', '390', '357', '349', '301', '274', '252', '236', '223']
['548 v West Indies, 11/09/2019', '559 v New Zealand, 10/10/2013', '412 v Pakistan, 23/01/2021', '397 v South Africa, 09/10/2019', '349 v New Zealand, 28/02/2021', '308 v West Indies, 11/09/2019', '305 v Australia, 05/10/2020', '421 v Sri Lanka, 11/02/2019', '270 v Sri Lanka, 16/03/2019', '241 v West Indies, 11/09/2019']


In [85]:
df = pd.DataFrame({})
df['Rank'] = list(range(1,11))
df['Player_names'] = Player_names
df['Country'] = Country
df['Ratings'] = Ratings
df['Records'] = Records
df

Unnamed: 0,Rank,Player_names,Country,Ratings,Records
0,1,Ellyse Perry,AUS,460,"548 v West Indies, 11/09/2019"
1,2,Stafanie Taylor,WI,410,"559 v New Zealand, 10/10/2013"
2,3,Marizanne Kapp,SA,390,"412 v Pakistan, 23/01/2021"
3,4,Deepti Sharma,IND,357,"397 v South Africa, 09/10/2019"
4,5,Natalie Sciver,ENG,349,"349 v New Zealand, 28/02/2021"
5,6,Jess Jonassen,AUS,301,"308 v West Indies, 11/09/2019"
6,7,Sophie Devine,NZ,274,"305 v Australia, 05/10/2020"
7,8,Dane van Niekerk,SA,252,"421 v Sri Lanka, 11/02/2019"
8,9,Katherine Brunt,ENG,236,"270 v Sri Lanka, 16/03/2019"
9,10,Ashleigh Gardner,AUS,223,"241 v West Indies, 11/09/2019"


### 7. Write a python program to scrape details of all the mobile phones under Rs. 20,000 listed on Amazon.in. The scraped data should include Product Name, Price, Image URL and Average Rating.

In [88]:
page = requests.get("https://www.amazon.in/s?k=mobile+phone+under+20000+rupees&crid=35G64V34QTZNL&sprefix=mobile+phone+under+20000%2Caps%2C295&ref=nb_sb_ss_ts-doa-p_1_24")
page

<Response [200]>

In [89]:
page.content

b'<!doctype html><html lang="en-in" class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>\n\n<!-- sp:feature:cs-optimization -->\n<meta http-equiv=\'x-dns-prefetch-control\' content=\'on\'>\n<link rel="dns-prefetch" href="https://images-eu.ssl-images-amazon.com">\n<link rel="dns-prefetch" href="https://m.media-amazon.com">\n<link rel="dns-prefetch" href="https://completion.amazon.com">\n\n<!-- sp:feature:aui-assets -->\n<link rel="stylesheet" href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41DO8IyHTdL.css,31qGOnSAToL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11gKzVUTNZL.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,31q1y1irc5L.css,21j0IlW7xKL.css,01XPHJk60-L.css,014OeDQisGL.css,21aPhFy+riL.css,11gneA3MtJL.css,21fecG8pUzL.css,01RddH8vm-L.css,01CFUgsA-Y

In [90]:
soup = BeautifulSoup(page.content,"html.parser")
soup

<!DOCTYPE html>
<html class="a-no-js" data-19ax5a9jf="dingo" lang="en-in"><!-- sp:feature:head-start -->
<head><script>var aPageStart = (new Date()).getTime();</script><meta charset="utf-8"/>
<!-- sp:feature:cs-optimization -->
<meta content="on" http-equiv="x-dns-prefetch-control"/>
<link href="https://images-eu.ssl-images-amazon.com" rel="dns-prefetch"/>
<link href="https://m.media-amazon.com" rel="dns-prefetch"/>
<link href="https://completion.amazon.com" rel="dns-prefetch"/>
<!-- sp:feature:aui-assets -->
<link href="https://images-eu.ssl-images-amazon.com/images/I/11EIQ5IGqaL._RC|01ZTHTZObnL.css,41DO8IyHTdL.css,31qGOnSAToL.css,013z33uKh2L.css,017DsKjNQJL.css,0131vqwP5UL.css,41EWOOlBJ9L.css,11gKzVUTNZL.css,01ElnPiDxWL.css,11bGSgD5pDL.css,01Dm5eKVxwL.css,01IdKcBuAdL.css,01y-XAlI+2L.css,21N4kUH7pxL.css,01oDR3IULNL.css,31q1y1irc5L.css,21j0IlW7xKL.css,01XPHJk60-L.css,014OeDQisGL.css,21aPhFy+riL.css,11gneA3MtJL.css,21fecG8pUzL.css,01RddH8vm-L.css,01CFUgsA-YL.css,31C80IiXalL.css,11qour3N

In [91]:
name = soup.find_all('span',class_="a-size-medium a-color-base a-text-normal")
print(name)

price = soup.find_all('span',class_="a-price-whole")
print(price)

image_url = soup.find_all('div',class_='a-section aok-relative s-image-fixed-height')
print(image_url)

average_rating = soup.find_all('span',class_="a-icon-alt")
print(average_rating)

[<span class="a-size-medium a-color-base a-text-normal" dir="auto">Redmi 9A (Nature Green, 2GB Ram, 32GB Storage) | 2GHz Octa-core Helio G25 Processor</span>, <span class="a-size-medium a-color-base a-text-normal" dir="auto">Redmi 9 (Sky Blue, 4GB RAM, 64GB Storage)</span>, <span class="a-size-medium a-color-base a-text-normal" dir="auto">Samsung Galaxy M31 (Ocean Blue, 6GB RAM, 128GB Storage)</span>, <span class="a-size-medium a-color-base a-text-normal" dir="auto">Redmi 9A (Sea Blue, 2GB Ram, 32GB Storage) | 2GHz Octa-core Helio G25 Processor</span>, <span class="a-size-medium a-color-base a-text-normal" dir="auto">Redmi 9A (Sea Blue, 3GB Ram, 32GB Storage) | 2GHz Octa-core Helio G25 Processor</span>, <span class="a-size-medium a-color-base a-text-normal" dir="auto">Redmi 9 Prime (Space Blue, 4GB RAM, 64GB Storage)- Full HD+ Display &amp; AI Quad Camera</span>, <span class="a-size-medium a-color-base a-text-normal" dir="auto">Samsung Galaxy M01 Core (Black, 2GB RAM, 32GB Storage) wit

In [92]:
product_name = []
for i in name:
    product_name.append(i.text)
print(product_name)

m_price = []
for j in price:
    m_price.append(j.text)
print(m_price)

image = []
for k in image_url:
    for y in k.find_all('img'):
        image.append(y.attrs['src'])
print(image)

rating =[]
for l in average_rating:
    rating.append(l.text.replace(" out of 5 stars",""))
print(rating)

['Redmi 9A (Nature Green, 2GB Ram, 32GB Storage) | 2GHz Octa-core Helio G25 Processor', 'Redmi 9 (Sky Blue, 4GB RAM, 64GB Storage)', 'Samsung Galaxy M31 (Ocean Blue, 6GB RAM, 128GB Storage)', 'Redmi 9A (Sea Blue, 2GB Ram, 32GB Storage) | 2GHz Octa-core Helio G25 Processor', 'Redmi 9A (Sea Blue, 3GB Ram, 32GB Storage) | 2GHz Octa-core Helio G25 Processor', 'Redmi 9 Prime (Space Blue, 4GB RAM, 64GB Storage)- Full HD+ Display & AI Quad Camera', 'Samsung Galaxy M01 Core (Black, 2GB RAM, 32GB Storage) with No Cost EMI/Additional Exchange Offers', 'Redmi Note 9 (Pebble Grey, 4GB RAM 64GB Storage) - 48MP Quad Camera & Full HD+ Display | Extra Upto INR 1000 Off on Exchange', 'Oppo A31 (Mystery Black, 6GB RAM, 128GB Storage) with No Cost EMI/Additional Exchange Offers', 'Samsung Galaxy M12 (Blue,6GB RAM, 128GB Storage) 6000 mAh with 8nm Processor | True 48 MP Quad Camera | 90Hz Refresh Rate', 'Samsung Galaxy M12 (Blue,4GB RAM, 64GB Storage) 6000 mAh with 8nm Processor | True 48 MP Quad Camera |

In [93]:
rating.insert(8,'4.2')

In [94]:
del rating[16:]

In [95]:
Product_Name=[]
for name in product_name:
    Product_Name.append(name.split('(')[0])
Product_Name

['Redmi 9A ',
 'Redmi 9 ',
 'Samsung Galaxy M31 ',
 'Redmi 9A ',
 'Redmi 9A ',
 'Redmi 9 Prime ',
 'Samsung Galaxy M01 Core ',
 'Redmi Note 9 ',
 'Oppo A31 ',
 'Samsung Galaxy M12 ',
 'Samsung Galaxy M12 ',
 'Redmi Note 9 Pro Max ',
 'Samsung Galaxy M31s ',
 'Samsung Galaxy M12 ',
 'Samsung Galaxy M12 ',
 'Redmi 9A ']

In [96]:
df = pd.DataFrame({})
df['Product_name'] = Product_Name
df['Price'] = m_price
df['Image_URL'] = image
df['Average_rating'] = rating
df

Unnamed: 0,Product_name,Price,Image_URL,Average_rating
0,Redmi 9A,6799,https://m.media-amazon.com/images/I/71hDPUM7VP...,4.2
1,Redmi 9,8799,https://m.media-amazon.com/images/I/71A9Vo1Bat...,4.1
2,Samsung Galaxy M31,16499,https://m.media-amazon.com/images/I/71-Su4Wr0H...,4.2
3,Redmi 9A,6799,https://m.media-amazon.com/images/I/71hDPUM7VP...,4.2
4,Redmi 9A,7499,https://m.media-amazon.com/images/I/71hDPUM7VP...,4.2
5,Redmi 9 Prime,9499,https://m.media-amazon.com/images/I/71TwXw2LQR...,4.2
6,Samsung Galaxy M01 Core,6199,https://m.media-amazon.com/images/I/71AYb2AGHX...,3.5
7,Redmi Note 9,10999,https://m.media-amazon.com/images/I/716tvc8pVE...,4.2
8,Oppo A31,11990,https://m.media-amazon.com/images/I/71KCwNV6Mu...,4.2
9,Samsung Galaxy M12,13499,https://m.media-amazon.com/images/I/71yYaNztZ0...,4.1


### 8. Write a python program to extract information about the local weather from the National Weather Service website of USA, https://www.weather.gov/ for the city, San Francisco. You need to extract data about 7 day extended forecast display for the city. The data should include period, short description, temperature and description.

In [97]:
page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.777120000000025&lon=-122.41963999999996#.YFg8Da8zbIV")
page

<Response [200]>

In [98]:
page.content



In [99]:
soup = BeautifulSoup(page.content,'html.parser')
soup

<!DOCTYPE html>

<html class="no-js">
<head>
<!-- Meta -->
<meta content="width=device-width" name="viewport"/>
<link href="http://purl.org/dc/elements/1.1/" rel="schema.DC"/><title>National Weather Service</title><meta content="National Weather Service" name="DC.title"><meta content="NOAA National Weather Service National Weather Service" name="DC.description"/><meta content="US Department of Commerce, NOAA, National Weather Service" name="DC.creator"/><meta content="" name="DC.date.created" scheme="ISO8601"/><meta content="EN-US" name="DC.language" scheme="DCTERMS.RFC1766"/><meta content="weather, National Weather Service" name="DC.keywords"/><meta content="NOAA's National Weather Service" name="DC.publisher"/><meta content="National Weather Service" name="DC.contributor"/><meta content="http://www.weather.gov/disclaimer.php" name="DC.rights"/><meta content="General" name="rating"/><meta content="index,follow" name="robots"/>
<!-- Icons -->
<link href="./images/favicon.ico" rel="shor

In [100]:
period = soup.find_all('div',class_="col-sm-2 forecast-label")
period

temp_and_des = soup.find_all('div',class_="col-sm-10 forecast-text")
temp_and_des

[<div class="col-sm-10 forecast-text">Mostly clear, with a low around 46. West wind 5 to 7 mph. </div>,
 <div class="col-sm-10 forecast-text">Sunny, with a high near 62. Light northwest wind becoming west 16 to 21 mph in the afternoon. Winds could gust as high as 26 mph. </div>,
 <div class="col-sm-10 forecast-text">Mostly clear, with a low around 47. West northwest wind 8 to 15 mph, with gusts as high as 20 mph. </div>,
 <div class="col-sm-10 forecast-text">Sunny, with a high near 66. North wind 6 to 16 mph, with gusts as high as 24 mph. </div>,
 <div class="col-sm-10 forecast-text">Clear, with a low around 49. North wind 14 to 17 mph, with gusts as high as 36 mph. </div>,
 <div class="col-sm-10 forecast-text">Sunny, with a high near 67.</div>,
 <div class="col-sm-10 forecast-text">Mostly clear, with a low around 48.</div>,
 <div class="col-sm-10 forecast-text">Sunny, with a high near 63.</div>,
 <div class="col-sm-10 forecast-text">Clear, with a low around 47.</div>,
 <div class="col

In [101]:
Period = []
for i in period:
    Period.append(i.text)
print(Period)

description = []
for j in temp_and_des:
    description.append(j.text)
print(description)

['Overnight', 'Monday', 'Monday Night', 'Tuesday', 'Tuesday Night', 'Wednesday', 'Wednesday Night', 'Thursday', 'Thursday Night', 'Friday', 'Friday Night', 'Saturday', 'Saturday Night', 'Sunday']
['Mostly clear, with a low around 46. West wind 5 to 7 mph. ', 'Sunny, with a high near 62. Light northwest wind becoming west 16 to 21 mph in the afternoon. Winds could gust as high as 26 mph. ', 'Mostly clear, with a low around 47. West northwest wind 8 to 15 mph, with gusts as high as 20 mph. ', 'Sunny, with a high near 66. North wind 6 to 16 mph, with gusts as high as 24 mph. ', 'Clear, with a low around 49. North wind 14 to 17 mph, with gusts as high as 36 mph. ', 'Sunny, with a high near 67.', 'Mostly clear, with a low around 48.', 'Sunny, with a high near 63.', 'Clear, with a low around 47.', 'Sunny, with a high near 65.', 'Mostly clear, with a low around 48.', 'Sunny, with a high near 68.', 'Mostly clear, with a low around 48.', 'Sunny, with a high near 68.']


In [102]:
short_des = []
for k in description:
    short_des.append(k.split(",")[0])
print(short_des)

['Mostly clear', 'Sunny', 'Mostly clear', 'Sunny', 'Clear', 'Sunny', 'Mostly clear', 'Sunny', 'Clear', 'Sunny', 'Mostly clear', 'Sunny', 'Mostly clear', 'Sunny']


In [103]:
df = pd.DataFrame({})
df["Period from 21/3/2021 to 28/3/2021"] = Period
df["Short description"] = short_des
df["Temperature and description"] = description
df

Unnamed: 0,Period from 21/3/2021 to 28/3/2021,Short description,Temperature and description
0,Overnight,Mostly clear,"Mostly clear, with a low around 46. West wind ..."
1,Monday,Sunny,"Sunny, with a high near 62. Light northwest wi..."
2,Monday Night,Mostly clear,"Mostly clear, with a low around 47. West north..."
3,Tuesday,Sunny,"Sunny, with a high near 66. North wind 6 to 16..."
4,Tuesday Night,Clear,"Clear, with a low around 49. North wind 14 to ..."
5,Wednesday,Sunny,"Sunny, with a high near 67."
6,Wednesday Night,Mostly clear,"Mostly clear, with a low around 48."
7,Thursday,Sunny,"Sunny, with a high near 63."
8,Thursday Night,Clear,"Clear, with a low around 47."
9,Friday,Sunny,"Sunny, with a high near 65."
