# Web Scraping Project

# Scrape all the header tags from https://en.wikipedia.org/wiki/Main_Page

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}

In [3]:
req = requests.get('https://en.wikipedia.org/wiki/Main_Page', headers = headers)

In [4]:
print(req)

<Response [200]>


In [5]:
content = BeautifulSoup(req.content, 'html.parser')

In [6]:
print(content)

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Wikipedia, the free encyclopedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"34c99fd1-3db8-43ca-8562-e19adb0192ca","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Main_Page","wgTitle":"Main Page","wgCurRevisionId":1004593520,"wgRevisionId":1004593520,"wgArticleId":15580374,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Main_Page","wgRelevantArticleId":15580374,"wgIsProbablyEditable":!1,"wgRelevantPag

let us now extract all the header tags from this link

In [7]:
h1 = content.find('h1', attrs={'class':'firstHeading'})

In [8]:
Heading1 = [h1.text]
print(Heading1)

['Main Page']


In [9]:
h2 = content.find_all('h2', attrs={'class':'mp-h2'})

In [10]:
Heading2 = []
for i in h2:
    Heading2.append(i.text)
print(Heading2)

["From today's featured article", 'Did you know\xa0...', 'In the news', 'On this day', "Today's featured picture", 'Other areas of Wikipedia', "Wikipedia's sister projects", 'Wikipedia languages']


In [11]:
h3 = content.find_all('h3', attrs = {'class':'vector-menu-heading'})

In [12]:
heading3 = []
for i in h3:
    heading3.append(i.text)

In [13]:
Heading1

['Main Page']

In [14]:
Heading2

["From today's featured article",
 'Did you know\xa0...',
 'In the news',
 'On this day',
 "Today's featured picture",
 'Other areas of Wikipedia',
 "Wikipedia's sister projects",
 'Wikipedia languages']

In [15]:
heading3

['\nPersonal tools\n',
 '\nNamespaces\n',
 '\nVariants\n',
 '\nViews\n',
 '\nMore\n',
 '\nNavigation\n',
 '\nContribute\n',
 '\nTools\n',
 '\nPrint/export\n',
 '\nIn other projects\n',
 '\nLanguages\n']

In [16]:
Heading3 = []
for i in heading3:
    Heading3.append(i.strip())

In [17]:
data = {
    'heading1':Heading1,
    'Heading2':Heading2,
    'Heading3':Heading3
}

In [18]:
AllHeadings = pd.DataFrame.from_dict(data, orient='index')

In [19]:
AllHeadings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
heading1,Main Page,,,,,,,,,,
Heading2,From today's featured article,Did you know ...,In the news,On this day,Today's featured picture,Other areas of Wikipedia,Wikipedia's sister projects,Wikipedia languages,,,
Heading3,Personal tools,Namespaces,Variants,Views,More,Navigation,Contribute,Tools,Print/export,In other projects,Languages


In [20]:
Headers = AllHeadings.transpose()

In [21]:
Headers.to_csv('WikiHeaders.csv', encoding='utf-8', header='WikiHeaders')

# Question 2) IMDB’s Top rated 100 movies’ data (i.e. Name, IMDB rating, Year of release)

In [22]:
req = requests.get('https://www.imdb.com/chart/top/?sort=ir,desc&mode=simple&page=1', headers = headers)

In [23]:
req

<Response [200]>

In [24]:
content = BeautifulSoup(req.content, 'html.parser')

In [25]:
content


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<script type="text/javascript">var ue_t0=ue_t0||+new Date();</script>
<script type="text/javascript">
window.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;
if (window.ue_ihb === 1) {

var ue_csm = window,
    ue_hob = +new Date();
(function(d){var e=d.ue=d.ue||{},f=Date.now||function(){return+new Date};e.d=function(b){return f()-(b?0:d.ue_t0)};e.stub=function(b,a){if(!b[a]){var c=[];b[a]=function(){c.push([c.slice.call(arguments),e.d(),d.ue_id])};b[a].replay=function(b){for(var a;a=c.shift();)b(a[0],a[1],a[2])};b[a].isStub=1}};e.exec=function(b,a){return function(){try{return b.apply(this,arguments)}catch(c){ueLogError(c,{attribution:a||"undefined",logLevel:"WARN"})}}}})(ue_csm);


    var ue_err_chan = 'jserr';
(function(d,e){function h(f,b){if(!(a.ec>a.mxe)&&f){a.ter.push(f);b=b||{};var c=f.logLevel||b.logLevel;c&&c!==k&&c!==m&&c!==n&&c!==p||a.ec++;c&&c!=k||a.ecf++;b.pageURL

In [26]:
MovieName = content.find_all('td', attrs={'class':'titleColumn'})

In [27]:
Name = []
for i in MovieName:
    name = i.find('a')
    Name.append(name.text)

As we only want Top 100 Movies so we will use indexing

In [28]:
Name = Name[:100]

In [29]:
ratings = content.find_all('td', attrs={'class':'ratingColumn imdbRating'})

In [30]:
Ratings = []
for i in ratings:
    Ratings.append(i.text)

In [31]:
convertedratings = []
for i in Ratings:
    convertedratings.append(i.strip())

In [32]:
Ratings = convertedratings[:100]

In [33]:
yor = content.find_all('span', attrs={'class':'secondaryInfo'})

In [34]:
YearOfRelease = []
for i in yor:
    YearOfRelease.append(i.text)

In [35]:
YearOfRelease = YearOfRelease[:100]

In [36]:
top100 = {
    'Movie Name':Name,
    'Movie Rating':Ratings,
    'Movie Year Of Release:':YearOfRelease
}

In [37]:
Top100 = pd.DataFrame.from_dict(top100, orient='index')

In [38]:
Top100 = Top100.transpose()

In [39]:
Top100

Unnamed: 0,Movie Name,Movie Rating,Movie Year Of Release:
0,The Shawshank Redemption,9.2,(1994)
1,The Godfather,9.1,(1972)
2,The Godfather: Part II,9.0,(1974)
3,The Dark Knight,9.0,(2008)
4,12 Angry Men,8.9,(1957)
...,...,...,...
95,Citizen Kane,8.3,(1941)
96,Dangal,8.3,(2016)
97,Idi i smotri,8.2,(1985)
98,The Kid,8.2,(1921)


In [40]:
Top100.to_csv('Top 100 Movies IMDB.csv', encoding='utf-8', header='Top 100 Movies IMDB')

# Question 3) IMDB’s Top rated 100 Indian Movies’ data (i.e. Name, IMDB rating, Year of release)

In [41]:
req = requests.get('https://www.imdb.com/india/top-rated-indian-movies/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=8a7876cd-2844-4017-846a-2c0876945b7b&pf_rd_r=4V817WWHFS0JDX08RXG7&pf_rd_s=right-5&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_india_tr_rhs_1', headers=headers)

In [42]:
req

<Response [200]>

In [43]:
content = BeautifulSoup(req.content, 'html.parser')

In [44]:
content


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<script type="text/javascript">var ue_t0=ue_t0||+new Date();</script>
<script type="text/javascript">
window.ue_ihb = (window.ue_ihb || window.ueinit || 0) + 1;
if (window.ue_ihb === 1) {

var ue_csm = window,
    ue_hob = +new Date();
(function(d){var e=d.ue=d.ue||{},f=Date.now||function(){return+new Date};e.d=function(b){return f()-(b?0:d.ue_t0)};e.stub=function(b,a){if(!b[a]){var c=[];b[a]=function(){c.push([c.slice.call(arguments),e.d(),d.ue_id])};b[a].replay=function(b){for(var a;a=c.shift();)b(a[0],a[1],a[2])};b[a].isStub=1}};e.exec=function(b,a){return function(){try{return b.apply(this,arguments)}catch(c){ueLogError(c,{attribution:a||"undefined",logLevel:"WARN"})}}}})(ue_csm);


    var ue_err_chan = 'jserr';
(function(d,e){function h(f,b){if(!(a.ec>a.mxe)&&f){a.ter.push(f);b=b||{};var c=f.logLevel||b.logLevel;c&&c!==k&&c!==m&&c!==n&&c!==p||a.ec++;c&&c!=k||a.ecf++;b.pageURL

In [45]:
MovieName = content.find_all('td', attrs={'class':'titleColumn'})

In [46]:
MovieName

[<td class="titleColumn">
       1.
       <a href="/title/tt0048473/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=690bec67-3bd7-45a1-9ab4-4f274a72e602&amp;pf_rd_r=ESDQTFY3Z00G2YDQTA0C&amp;pf_rd_s=center-4&amp;pf_rd_t=60601&amp;pf_rd_i=india.top-rated-indian-movies&amp;ref_=fea_india_ss_toprated_tt_1" title="Satyajit Ray (dir.), Kanu Bannerjee, Karuna Bannerjee">Pather Panchali</a>
 <span class="secondaryInfo">(1955)</span>
 </td>,
 <td class="titleColumn">
       2.
       <a href="/title/tt0079221/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=690bec67-3bd7-45a1-9ab4-4f274a72e602&amp;pf_rd_r=ESDQTFY3Z00G2YDQTA0C&amp;pf_rd_s=center-4&amp;pf_rd_t=60601&amp;pf_rd_i=india.top-rated-indian-movies&amp;ref_=fea_india_ss_toprated_tt_2" title="Hrishikesh Mukherjee (dir.), Amol Palekar, Bindiya Goswami">Gol Maal</a>
 <span class="secondaryInfo">(1979)</span>
 </td>,
 <td class="titleColumn">
       3.
       <a href="/title/tt0093603/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=690bec67-3bd7-45a1-9ab4-4f274a72e602&amp;pf_r

In [47]:
Name = []
for i in MovieName:
    name = i.find('a')
    Name.append(name.text)

In [48]:
Name = Name[:100]

In [49]:
ratings = content.find_all('td', attrs={'class':'ratingColumn imdbRating'})

In [50]:
ratings

[<td class="ratingColumn imdbRating">
 <strong title="8.5 based on 24,268 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 18,195 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 16,862 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 17,473 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.5 based on 12,431 user ratings">8.5</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.4 based on 9,682 user ratings">8.4</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.4 based on 5,511 user ratings">8.4</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.4 based on 23,223 user ratings">8.4</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.4 based on 6,413 user ratings">8.4</strong>
 </td>,
 <td class="ratingColu

In [51]:
Ratings = []
for i in ratings:
    Ratings.append(i.text)

In [52]:
convertedratings = []
for i in Ratings:
    convertedratings.append(i.strip())

In [53]:
Ratings = convertedratings[:100]

In [54]:
yor = content.find_all('span', attrs={'class':'secondaryInfo'})

In [55]:
YearOfRelease = []
for i in yor:
    YearOfRelease.append(i.text)

In [56]:
YearOfRelease = YearOfRelease[:100]

In [57]:
top100 = {
    'Movie Name':Name,
    'Movie Rating':Ratings,
    'Movie Year Of Release:':YearOfRelease
}

In [58]:
Top100 = pd.DataFrame.from_dict(top100, orient='index')

In [59]:
Top100 = Top100.transpose()

In [60]:
Top100

Unnamed: 0,Movie Name,Movie Rating,Movie Year Of Release:
0,Pather Panchali,8.5,(1955)
1,Gol Maal,8.5,(1979)
2,Nayakan,8.5,(1987)
3,Anbe Sivam,8.5,(2003)
4,Apur Sansar,8.5,(1959)
...,...,...,...
95,The Legend of Bhagat Singh,8.0,(2002)
96,Barfi!,8.0,(2012)
97,Pink,8.0,(2016)
98,Bommarillu,8.0,(2006)


In [61]:
Top100.to_csv('Top 100 Indian Movies IMDB.csv', encoding='utf-8', header='Top 100 Indian Movies IMDB')

# Question4) Scrap book name, author name, genre and book review of any 5 books from ‘www.bookpage.com’

In [62]:
req = requests.get('https://bookpage.com/reviews', headers=headers)

In [63]:
req

<Response [200]>

In [64]:
content = BeautifulSoup(req.content, 'html.parser')

In [65]:
content

<!DOCTYPE html>

<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<title>Book Reviews | BookPage</title>
<meta content="Book recommendations of the best new books by genre and more." name="description"/>
<meta content="book reviews, books and literature, writing and writers" name="keywords"/>
<link href="https://bookpage.com/reviews" rel="canonical"/>
<meta content="BookPage.com" property="og:site_name"/>
<meta content="Book Reviews" property="og:title"/>
<meta content="Book recommendations of the best new books by genre and more." property="og:description"/>
<meta content="website" property="og:type"/>
<meta content="https://bookpage.com/reviews" property="og:url"/>
<meta content="//www.bookpage.com/default_image.jpg" property="og:image"/>
<meta content="summary" name="twitter:card"/>
<meta content="@bookpage" name="twitter:site"/>
<meta content="@bookpage" name="twitter:creator"/>
<meta content="index,follow" name="r

In [66]:
name = content.find_all('h4', attrs={'class':'italic'})

In [67]:
Name = []
for i in name:
    Name.append(i.text)

In [68]:
conv = []
for i in Name:
    conv.append(i.strip())

In [69]:
conv

['★ Secrets of Happiness',
 'Olympus, Texas',
 'From Little Tokyo, With Love',
 'Seed to Dust',
 'Before I Saw You',
 'Many Shapes of Clay',
 'The Girl Who Died',
 '★ Sorrowland',
 '★ The Secret to Superhuman Strength',
 'Hour of the Witch']

We have to scrape only 5 books

In [70]:
Name = conv[:5]

In [71]:
author = content.find_all('p',attrs={'class':'sans bold'})

In [72]:
Author = []
for i in author:
    Author.append(i.text)

In [73]:
AuthorName = []
for i in Author:
    AuthorName.append(i.strip())

In [74]:
Author = AuthorName[:5]

In [75]:
genre = content.find_all('p', attrs={'class':'genre-links hidden-phone'})

In [76]:
Genre = []
for i in genre:
    Genre.append(i.text)

In [77]:
genre = []
for i in Genre:
    genre.append(i.replace('\n',' '))

In [78]:
genre

[' Fiction  /  Family Drama ',
 ' Fiction  /  Family Drama ',
 ' YA  /  YA Fiction ',
 ' Nonfiction  /  Memoir  /  Nature ',
 ' Romance  /  Contemporary Romance ',
 " Children's  /  Children's Picture Book ",
 ' Mystery & Suspense  /  Mystery ',
 ' Fiction  /  Gothic Fiction ',
 ' Nonfiction  /  Memoir  /  Graphic Memoirs ',
 ' Mystery & Suspense  /  Suspense ']

In [79]:
Genre = genre[:5]

In [80]:
review = content.find_all('p',attrs = {'class':'excerpt'})

In [81]:
Review =[]
for i in review:
    Review.append(i.text)

In [82]:
review = []
for i in Review:
    review.append(i.strip())

In [83]:
Review = review[:5]

In [84]:
books = {
    'Book Name':Name,
    'Author Name':Author,
    'Book Genre':Genre,
    'Book Review':Review
}

In [85]:
Books = pd.DataFrame.from_dict(books, orient='index')

In [86]:
Books = Books.transpose()

In [87]:
Books.to_csv('Books.csv', encoding='utf-8',header='Books')

# Question 5) scrape cricket rankings from ‘www.icc-cricket.com’

### i) To Scrape Top 10ODI teams in men’s cricket along with the records for matches, points and rating.

In [88]:
req = requests.get('https://www.icc-cricket.com/rankings/mens/team-rankings/odi', headers=headers)

In [89]:
content = BeautifulSoup(req.content, 'html.parser')

In [90]:
team = content.find_all('span', attrs={'class':'u-hide-phablet'})

In [91]:
TeamName = []
for i in team:
    TeamName.append(i.text)

We have to find out top 10 ODI teams

In [92]:
TeamName = TeamName[:10]

In [93]:
matches = content.find_all('td', attrs={'class':'table-body__cell u-center-text'})

In [94]:
Matches = []
for i in matches:
    Matches.append(i.text)

In [95]:
Matches = Matches[0:40:2]

In [96]:
m = content.find_all('td', attrs={'class':'rankings-block__banner--matches'})

In [97]:
M = []
for i in m:
    M.append(i.text)

In [98]:
M

['17']

In [99]:
points = content.find_all('td', attrs={'class':'table-body__cell u-center-text'})

In [100]:
Points = []
for i in points:
    Points.append(i.text)

In [101]:
Points = Points[1::2]

In [102]:
p = content.find_all('td',attrs={'class':'rankings-block__banner--points'})

In [103]:
P = []
for i in p:
    P.append(i.text)

In [104]:
rating = content.find_all('td',attrs={'class':'table-body__cell u-text-right rating'})
Rating = []
for i in rating:
    Rating.append(i.text)

In [105]:
r = content.find_all('td', attrs={'class':'rankings-block__banner--rating u-text-right'})
R = []
for i in r:
    R.append(i.text.strip())

In [106]:
R

['121']

In [107]:
TeamName = TeamName[1:]

In [108]:
t = content.find_all('span',attrs={'class':'u-hide-phablet'})
t = t[0]

In [109]:
T = []
T.append(t.text)

In [110]:
l1 = {
    'Team Name':TeamName,
    'Matches':Matches,
    'Points': Points,
    'Rating':Rating
}
L1 = pd.DataFrame.from_dict(l1, orient='index')
L1 = L1.transpose()

In [111]:
l2 = {
    'Team Name':T,
    'Matches':M,
    'Points':P,
    'Rating':R
}
L2 = pd.DataFrame.from_dict(l2, orient='index')
L2 = L2.transpose()

In [112]:
l3 = [L2, L1]
TOP10 = pd.concat(l3)

In [113]:
TOP10.index = np.arange(1, len(TOP10) + 1)

In [114]:
TOP10 = TOP10[:10]

In [115]:
TOP10.to_csv('Top10 ODI Teams.csv', encoding='utf-8', header='Top 10 ODI Teams' )

In [116]:
TOP10

Unnamed: 0,Team Name,Matches,Points,Rating
1,New Zealand,17,2054,121
2,Australia,25,2945,118
3,India,29,3344,115
4,England,27,3100,115
5,South Africa,20,2137,107
6,Pakistan,24,2323,97
7,Bangladesh,24,2157,90
8,West Indies,27,2222,82
9,Sri Lanka,21,1652,79
10,Afghanistan,17,1054,62


### ii) Top 10 ODI Batsmen in men along with the records of their team and rating.

In [117]:
req = requests.get('https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting', headers=headers)
content = BeautifulSoup(req.content, 'html.parser')

In [118]:
name = content.find_all('td', attrs={'class':'table-body__cell rankings-table__name name'})

In [119]:
Name = []
for i in name:
    Name.append(i.text.strip())

In [120]:
team = content.find_all('span', attrs={'class':'table-body__logo-text'})
Team = []
for i in team:
    Team.append(i.text)

In [121]:
rating = content.find_all('td', attrs={'class':'table-body__cell rating'})
Rating = []
for i in rating:
    Rating.append(i.text)

In [122]:
career = content.find_all('td', attrs={'class':'table-body__cell u-text-right u-hide-phablet'})
Career = []
for i in career:
    Career.append(i.text.strip())

In [123]:
n = content.find_all('div',attrs={'class':'rankings-block__banner--name-large'})
N = []
for i in n:
    N.append(i.text)

In [124]:
t = content.find_all('div', attrs={'rankings-block__banner--nationality'})
T = []
for i in t:
    T.append(i.text.strip())

In [125]:
r = content.find_all('div', attrs={'class':'rankings-block__banner--rating'})
R = []
for i in r:
    R.append(i.text)

In [126]:
c = content.find_all('span', attrs={'class':'rankings-block__career-best-text'})
C = []
for i in c:
    C.append(i.text.strip())

In [127]:
l1 = {
    'Player Name':Name,
    'Team Name':Team,
    'Rating':Rating,
    'Career Best Score':Career
}
L1 = pd.DataFrame.from_dict(l1, orient='index')
L1 = L1.transpose()

In [128]:
l2 = {
    'Player Name':N,
    'Team Name':T,
    'Rating':R,
    'Career Best Score':C
}
L2 = pd.DataFrame.from_dict(l2, orient='index')
L2 = L2.transpose()

In [129]:
l3 = [L2, L1]
Top10Bastmen = pd.concat(l3)
Top10Bastmen = Top10Bastmen[:10]

In [130]:
Top10Bastmen.index = np.arange(1, len(TOP10) + 1)

In [131]:
Top10Bastmen.to_csv('Top 10 ODI Batsmen.csv', encoding='utf-8', header='Mens Cricket')
Top10Bastmen

Unnamed: 0,Player Name,Team Name,Rating,Career Best Score
1,Babar Azam,PAK,865,"865 v South Africa, 07/04/2021"
2,Virat Kohli,IND,857,"911 v England, 12/07/2018"
3,Rohit Sharma,IND,825,"885 v Sri Lanka, 06/07/2019"
4,Ross Taylor,NZ,801,"841 v Bangladesh, 05/06/2019"
5,Aaron Finch,AUS,791,"798 v England, 25/06/2019"
6,Jonny Bairstow,ENG,785,"796 v India, 26/03/2021"
7,Fakhar Zaman,PAK,778,"778 v South Africa, 07/04/2021"
8,Francois du Plessis,SA,778,"820 v Australia, 06/07/2019"
9,David Warner,AUS,773,"880 v Pakistan, 26/01/2017"
10,Shai Hope,WI,773,"808 v Bangladesh, 17/05/2019"


### iii) Top 10 ODI bowlers along with the records of their team and rating.

In [132]:
req = requests.get('https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling', headers=headers)
content = BeautifulSoup(req.content, 'html.parser')

In [133]:
name = content.find_all('td', attrs={'class':'table-body__cell rankings-table__name name'})

In [134]:
Name = []
for i in name:
    Name.append(i.text.strip())

In [135]:
team = content.find_all('span', attrs={'class':'table-body__logo-text'})
Team = []
for i in team:
    Team.append(i.text)

In [136]:
rating = content.find_all('td', attrs={'class':'table-body__cell rating'})
Rating = []
for i in rating:
    Rating.append(i.text)

In [137]:
career = content.find_all('td', attrs={'class':'table-body__cell u-text-right u-hide-phablet'})
Career = []
for i in career:
    Career.append(i.text.strip())

In [138]:
n = content.find_all('div',attrs={'class':'rankings-block__banner--name-large'})
N = []
for i in n:
    N.append(i.text)

In [139]:
t = content.find_all('div', attrs={'rankings-block__banner--nationality'})
T = []
for i in t:
    T.append(i.text.strip())

In [140]:
r = content.find_all('div', attrs={'class':'rankings-block__banner--rating'})
R = []
for i in r:
    R.append(i.text)

In [141]:
c = content.find_all('span', attrs={'class':'rankings-block__career-best-text'})
C = []
for i in c:
    C.append(i.text.strip())

In [142]:
l1 = {
    'Player Name':Name,
    'Team Name':Team,
    'Rating':Rating,
    'Career Best Score':Career
}
L1 = pd.DataFrame.from_dict(l1, orient='index')
L1 = L1.transpose()

In [143]:
l2 = {
    'Player Name':N,
    'Team Name':T,
    'Rating':R,
    'Career Best Score':C
}
L2 = pd.DataFrame.from_dict(l2, orient='index')
L2 = L2.transpose()

In [144]:
l3 = [L2, L1]
Top10Bowler = pd.concat(l3)
Top10Bowler = Top10Bowler[:10]

In [145]:
Top10Bowler.index = np.arange(1, len(TOP10) + 1)

In [146]:
Top10Bowler.to_csv('Top 10 ODI Bowler.csv', encoding='utf-8', header='ODI Bowler')
Top10Bowler

Unnamed: 0,Player Name,Team Name,Rating,Career Best Score
1,Trent Boult,NZ,737,"770 v West Indies, 22/06/2019"
2,Mujeeb Ur Rahman,AFG,708,"712 v Ireland, 24/01/2021"
3,Matt Henry,NZ,691,"691 v Bangladesh, 26/03/2021"
4,Jasprit Bumrah,IND,690,"841 v West Indies, 01/11/2018"
5,Mehedi Hasan,BAN,668,"694 v West Indies, 25/01/2021"
6,Kagiso Rabada,SA,666,"724 v England, 29/05/2017"
7,Chris Woakes,ENG,665,"676 v New Zealand, 14/07/2019"
8,Josh Hazlewood,AUS,660,"733 v England, 26/01/2018"
9,Pat Cummins,AUS,646,"729 v Pakistan, 12/06/2019"
10,Mohammad Amir,PAK,638,"663 v Sri Lanka, 02/10/2019"


# Q6) Write a python program to scrape cricket rankings from ‘www.icc-cricket.com’. 

### i) Top 10 ODI teams in women’s cricket along with the records for matches, points and rating.

In [147]:
req = requests.get('https://www.icc-cricket.com/rankings/womens/team-rankings/odi', headers=headers)
content = BeautifulSoup(req.content, 'html.parser')

In [148]:
team = content.find_all('span',attrs={'u-hide-phablet'})

In [149]:
Team = []
for i in team:
    Team.append(i.text)

In [150]:
Team = Team[1:]

In [151]:
t = content.find_all('span',attrs={'u-hide-phablet'})
T = []
for i in t:
    T.append(i.text)

In [152]:
T = T[0]

In [153]:
matches = content.find_all('td', attrs={'class':'table-body__cell u-center-text'})
Matches = []
for i in matches:
    Matches.append(i.text)

In [154]:
Matches = Matches[0:18:2]

In [155]:
matches = content.find_all('td', attrs={'class':'table-body__cell u-center-text'})
Matches = []
for i in matches:
    Matches.append(i.text)

In [156]:
Points = []
Points = Matches[1::2]

In [157]:
Points

['2,828', '1,993', '2,226', '1,947', '1,025', '1,101', '306', '519', '25']

In [158]:
rating = content.find_all('td', attrs={'class':'table-body__cell u-text-right rating'})
Rating = []
for i in rating:
    Rating.append(i.text)

In [159]:
Rating

['118', '117', '111', '93', '85', '73', '61', '47', '13']

In [160]:
l1 = {
    'Team Name':Team,
    'Matches':Matches,
    'Points':Points,
    'Rating':Rating
}

In [161]:
l1 = pd.DataFrame.from_dict(l1, orient='index')

In [162]:
l1 = l1.transpose()

In [163]:
m = content.find_all('td', attrs={'class','rankings-block__banner--matches'})
M=[]
for i in m:
    M.append(i.text)

In [164]:
M = M[0]

In [165]:
p = content.find_all('td',attrs={'class':'rankings-block__banner--points'})
P = []
for i in p:
    P.append(i.text)

In [166]:
P = P[0]

In [167]:
r = content.find_all('td', attrs={'class':'rankings-block__banner--rating u-text-right'})
R = []
for i in r:
    R.append(i.text.strip())

In [168]:
R = R[0]

In [169]:
l2 = {
    'Team Name':T,
    'Matches':M,
    'Points':P,
    'Rating':R
}

In [170]:
l2 = pd.DataFrame.from_dict(l2, orient='index')

In [171]:
l2 = l2.transpose()

In [172]:
l3 = [l2, l1]
Top10WomensTeam = pd.concat(l3)
Top10WomensTeam = Top10WomensTeam[:10]

In [173]:
Top10WomensTeam.index = np.arange(1, len(TOP10) + 1)

In [174]:
Top10WomensTeam.to_csv('Top 10 Women Teams.csv', encoding='utf-8', header='Top 10 Womens ODI Team')

In [175]:
Top10WomensTeam

Unnamed: 0,Team Name,Matches,Points,Rating
1,Australia,18,2955,164
2,South Africa,24,2828,118
3,England,2828,1993,117
4,India,17,2226,111
5,New Zealand,1993,1947,93
6,West Indies,20,1025,85
7,Pakistan,2226,1101,73
8,Bangladesh,21,306,61
9,Sri Lanka,1947,519,47
10,Ireland,12,25,13


### ii) Top 10 women’s ODI players along with the records of their team and rating.

In [176]:
req = requests.get('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting', headers=headers)
content = BeautifulSoup(req.content, 'html.parser')

In [177]:
name = content.find_all('td', attrs={'class':'table-body__cell rankings-table__name name'})

In [178]:
Name = []
for i in name:
    Name.append(i.text.strip())

In [179]:
team = content.find_all('span', attrs={'class':'table-body__logo-text'})
Team = []
for i in team:
    Team.append(i.text)

In [180]:
rating = content.find_all('td', attrs={'class':'table-body__cell rating'})
Rating = []
for i in rating:
    Rating.append(i.text)

In [181]:
career = content.find_all('td', attrs={'class':'table-body__cell u-text-right u-hide-phablet'})
Career = []
for i in career:
    Career.append(i.text.strip())

In [182]:
req = requests.get('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting', headers=headers)
content = BeautifulSoup(req.content, 'html.parser')

In [183]:
n = content.find_all('div', attrs={'class':'rankings-block__banner--name-large'})
N = []
for i in n:
    N.append(i.text)

In [184]:
N

['Tammy Beaumont']

In [185]:
t = content.find_all('div', attrs={'class':'rankings-block__banner--nationality'})
T = []
for i in t:
    T.append(i.text.strip())

In [186]:
T

['ENG']

In [187]:
r = content.find_all('div', attrs={'class':'rankings-block__banner--rating'})
R = []
for i in r:
    R.append(i.text)

In [188]:
R

['765']

In [189]:
c = content.find_all('span', attrs={'class':'rankings-block__career-best-text'})
C = []
for i in c:
    C.append(i.text.strip())

In [190]:
l1 = {
    'Player Name': Name,
    'Team Name': Team,
    'Rating': Rating,
    'Career Best':Career
}
L1 = pd.DataFrame.from_dict(l1, orient='index')
L1 = L1.transpose()

In [191]:
l2 = {
    'Player Name': N,
    'Team Name': T,
    'Rating': R,
    'Career Best':C
}
L2 = pd.DataFrame.from_dict(l2, orient='index')
L2 = L2.transpose()

In [192]:
l3 = [L2, L1]
Top10Top10WomensPlayer = pd.concat(l3)
Top10Top10WomensPlayer = Top10Top10WomensPlayer[:10]

In [193]:
Top10WomensPlayer = pd.concat(l3)
Top10WomensPlayer = Top10WomensPlayer[:10]

In [194]:
Top10WomensPlayer.index = np.arange(1, len(TOP10) + 1)

In [195]:
Top10WomensPlayer.to_csv('Top10WomenPlayer.csv', encoding='utf-8', header='Top10WomensPlayer')

In [196]:
Top10WomensPlayer

Unnamed: 0,Player Name,Team Name,Rating,Career Best
1,Tammy Beaumont,ENG,765,"765 v New Zealand, 28/02/2021"
2,Lizelle Lee,SA,758,"773 v India, 14/03/2021"
3,Alyssa Healy,AUS,756,"756 v New Zealand, 10/04/2021"
4,Stafanie Taylor,WI,746,"765 v India, 02/03/2012"
5,Meg Lanning,AUS,723,"834 v New Zealand, 24/02/2016"
6,Amy Satterthwaite,NZ,715,"756 v Australia, 02/03/2017"
7,Smriti Mandhana,IND,710,"797 v England, 28/02/2019"
8,Mithali Raj,IND,709,"839 v Australia, 24/12/2004"
9,Natalie Sciver,ENG,685,"712 v India, 25/02/2019"
10,Laura Wolvaardt,SA,683,"725 v India, 07/03/2021"


### iii) Top 10 women’s ODI all-rounder along with the records of their team and rating.

In [197]:
req = requests.get('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/all-rounder', headers=headers)
content = BeautifulSoup(req.content, 'html.parser')

In [198]:
name = content.find_all('td', attrs={'class':'table-body__cell rankings-table__name name'})
Name = []
for i in name:
    Name.append(i.text.strip())

In [199]:
team = content.find_all('span', attrs={'class':'table-body__logo-text'})
Team = []
for i in team:
    Team.append(i.text)

In [200]:
rating = content.find_all('td', attrs={'class':'table-body__cell rating'})
Rating = []
for i in rating:
    Rating.append(i.text)

In [201]:
career = content.find_all('td', attrs={'class':'table-body__cell u-text-right u-hide-phablet'})
Career = []
for i in career:
    Career.append(i.text.strip())

In [202]:
n = content.find_all('div', attrs={'class':'rankings-block__banner--name-large'})
N = []
for i in n:
    N.append(i.text)

In [203]:
t = content.find_all('div', attrs={'class':'rankings-block__banner--nationality'})
T = []
for i in t:
    T.append(i.text.strip())

In [204]:
r = content.find_all('div', attrs={'class':'rankings-block__banner--rating'})
R = []
for i in r:
    R.append(i.text)

In [205]:
c = content.find_all('span', attrs={'class':'rankings-block__career-best-text'})
C = []
for i in c:
    C.append(i.text.strip())

In [206]:
l1 = {
    'Player Name': Name,
    'Team Name': Team,
    'Rating': Rating,
    'Career Best':Career
}
L1 = pd.DataFrame.from_dict(l1, orient='index')
L1 = L1.transpose()

In [207]:
l2 = {
    'Player Name': N,
    'Team Name': T,
    'Rating': R,
    'Career Best':C
}
L2 = pd.DataFrame.from_dict(l2, orient='index')
L2 = L2.transpose()

In [208]:
l3 = [L2, L1]
Top10Top10WomensAllRounder = pd.concat(l3)
Top10Top10WomensAllRounder = Top10Top10WomensAllRounder[:10]

In [209]:
Top10Top10WomensAllRounder.index = np.arange(1, len(TOP10) + 1)

In [210]:
Top10Top10WomensAllRounder.to_csv('Top10Top10WomensAllRounder.csv', encoding='utf-8', header='Top10Top10WomensAllRounder')

In [211]:
Top10Top10WomensAllRounder

Unnamed: 0,Player Name,Team Name,Rating,Career Best
1,Marizanne Kapp,SA,418,"418 v India, 17/03/2021"
2,Ellyse Perry,AUS,418,"548 v West Indies, 11/09/2019"
3,Stafanie Taylor,WI,410,"559 v New Zealand, 10/10/2013"
4,Natalie Sciver,ENG,349,"349 v New Zealand, 28/02/2021"
5,Deepti Sharma,IND,343,"397 v South Africa, 09/10/2019"
6,Jess Jonassen,AUS,307,"308 v West Indies, 11/09/2019"
7,Ashleigh Gardner,AUS,252,"256 v New Zealand, 04/04/2021"
8,Dane van Niekerk,SA,243,"421 v Sri Lanka, 11/02/2019"
9,Sophie Devine,NZ,242,"305 v Australia, 05/10/2020"
10,Amelia Kerr,NZ,236,"247 v Australia, 07/04/2021"


# Question 7) Scrape details of all the mobile phones under Rs. 20,000 listed on Amazon.in. The scraped data should include Product Name, Price, Image URL and Average Rating.

In [212]:
Name= []
Price = []
Image = []
Ratings = []

In [213]:
for i in np.arange(1,26):
    req = requests.get('https://www.amazon.in/s?i=electronics&bbn=1389401031&rh=n%3A1389401031%2Cp_36%3A100000-2000000%2Cp_89%3ARedmi%7CSamsung%7Crealme&dc&page={}&qid=1620061309&rnid=3837712031&ref=sr_pg_{}'.format(i, i), headers=headers)
    content = BeautifulSoup(req.content, 'html.parser')
    name = content.find_all('span', attrs='a-size-base-plus a-color-base a-text-normal')
    for i in name:
        Name.append(i.text)

In [214]:
for i in np.arange(1,26):
    req = requests.get('https://www.amazon.in/s?i=electronics&bbn=1389401031&rh=n%3A1389401031%2Cp_36%3A100000-2000000%2Cp_89%3ARedmi%7CSamsung%7Crealme&dc&page={}&qid=1620061309&rnid=3837712031&ref=sr_pg_{}'.format(i, i), headers=headers)
    content = BeautifulSoup(req.content, 'html.parser')
    price = content.find_all('span', attrs={'class':'a-offscreen'})
    for i in price:
        Price.append(i.text)

In [215]:
print(len(Name))
print(len(Price))

600
809


In [216]:
for i in np.arange(1,26):
    req = requests.get('https://www.amazon.in/s?i=electronics&bbn=1389401031&rh=n%3A1389401031%2Cp_36%3A100000-2000000%2Cp_89%3ARedmi%7CSamsung%7Crealme&dc&page={}&qid=1620061309&rnid=3837712031&ref=sr_pg_{}'.format(i, i), headers=headers)
    content = BeautifulSoup(req.content, 'html.parser')
    for i in content.find_all('img'):
        Image.append(i['src'])

In [217]:
Image

['//fls-eu.amazon.in/1/batch/1/OP/A21TJRUUN4KGV:262-6489788-0484831:GFZ7PEAY6F7D7H4H3EM7$uedata=s:%2Frd%2Fuedata%3Fstaticb%26id%3DGFZ7PEAY6F7D7H4H3EM7:0',
 'https://images-eu.ssl-images-amazon.com/images/G/31/gno/sprites/nav-sprite-global-1x-hm-dsk-reorg._CB405936311_.png',
 'https://images-eu.ssl-images-amazon.com/images/G/31/img18/Electronics/Megamenu/Megamenu_Electronics_top._CB485947327_.png',
 'https://m.media-amazon.com/images/I/71A9Vo1BatL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/71-Su4Wr0HL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/71OxJeyywSL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/71IqJQM2stL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/716nHhG9SWL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/71sxlhYhKWL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/71yYaNztZ0L._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/71sxlhYhKWL._AC_UL320_.jpg',
 'https://m.media-amazon.com/images/I/71Kmfs0T0BL._AC_UL320_.jp

In [218]:
len(Image)

725

In [219]:
for i in np.arange(1,26):
    req = requests.get('https://www.amazon.in/s?i=electronics&bbn=1389401031&rh=n%3A1389401031%2Cp_36%3A100000-2000000%2Cp_89%3ARedmi%7CSamsung%7Crealme&dc&page={}&qid=1620061309&rnid=3837712031&ref=sr_pg_{}'.format(i, i), headers=headers)
    content = BeautifulSoup(req.content, 'html.parser')
    ratings = content.find_all('span', attrs={'class':'a-icon-alt'})
    for i in ratings:
        Ratings.append(i.text)

In [220]:
len(Ratings)

607

In [221]:
data = {
    'Mobile Name':Name,
    'Price':Price,
    'Image Url':Image,
    'Ratings':Ratings
}

In [222]:
Mobile = pd.DataFrame.from_dict(data, orient='index')

In [223]:
mobile = Mobile.transpose()

In [224]:
mobile

Unnamed: 0,Mobile Name,Price,Image Url,Ratings
0,"Redmi 9 (Sky Blue, 4GB RAM, 64GB Storage)| 500...","₹8,499",//fls-eu.amazon.in/1/batch/1/OP/A21TJRUUN4KGV:...,4.2 out of 5 stars
1,"Samsung Galaxy M31 (Ocean Blue, 6GB RAM, 128GB...","₹10,999",https://images-eu.ssl-images-amazon.com/images...,4.3 out of 5 stars
2,"Samsung Galaxy M31 (Space Black, 6GB RAM, 128G...","₹13,999",https://images-eu.ssl-images-amazon.com/images...,4.3 out of 5 stars
3,"Redmi Note 10 (Shadow Black, 4GB RAM, 64GB Sto...","₹19,999",https://m.media-amazon.com/images/I/71A9Vo1Bat...,4.1 out of 5 stars
4,"Redmi 9 (Carbon Black, 4GB RAM, 64GB Storage) ...","₹13,999",https://m.media-amazon.com/images/I/71-Su4Wr0H...,4.2 out of 5 stars
...,...,...,...,...
804,,"₹3,499",,
805,,"₹2,990",,
806,,"₹3,990",,
807,,"₹4,940",,


In [225]:
mobile = mobile.dropna()

In [226]:
mobile.to_csv('Mobile Under 20000.csv', header='Mobile Under 20000', encoding='utf-8')

# Question 8) Scrape extract information about the local weather from the National Weather Service website of USA, https://www.weather.gov/ for the city, San Francisco. You need to extract data about 7 day extended forecast display for the city. The data should include period, short description, temperature and description.

In [227]:
req = requests.get('https://www.nws.noaa.gov/wtf/MapClick.php?CityName=San+Francisco&state=CA&site=MTR&lat=37.7989&lon=-122.398&lg=ep', headers=headers)

In [228]:
req

<Response [200]>

In [229]:
content = BeautifulSoup(req.content, 'html.parser')

In [230]:
short = content.find_all('td')

In [231]:
Short=[]
for i in content.find_all('img'):
        Short.append(i['alt'])

In [232]:
Short = Short[7:16]

In [233]:
period = content.find_all('td')

In [234]:
Period = []
for i in content.find_all('b'):
    Period.append(i.text)

In [235]:
Period = Period[5:14]

In [236]:
temp = content.find_all('font')

In [237]:
Temperature = []
for i in temp:
    Temperature.append(i.text)

In [238]:
desc = content.find_all('td')

In [239]:
Desc = []
for i in desc:
    Desc.append(i.text)

In [240]:
Desc = Desc[24:25]

In [241]:
Desc

['\n\nThis Afternoon: Sunny, with a high near 59. West southwest wind between 11 and 13 mph. Tonight: Partly cloudy, with a low around 51. West southwest wind between 5 and 9 mph. Thursday: Mostly sunny, with a high near 57. Light wind becoming west between 15 and 18 mph. Winds could gust as high as 24 mph.Thursday Night: Mostly clear, with a low around 52. West wind between 15 and 18 mph, with gusts as high as 24 mph. Friday: Sunny, with a high near 59. Calm wind becoming west between 11 and 14 mph. Winds could gust as high as 18 mph.Friday Night: Mostly clear, with a low around 52.  Saturday: Sunny, with a high near 65.  Saturday Night: Mostly clear, with a low around 54.  Sunday: Sunny, with a high near 64.  Sunday Night: Clear, with a low around 54.  Monday: Sunny, with a high near 66.  Monday Night: Mostly clear, with a low around 53.  Tuesday: Sunny, with a high near 63.  ']

In [242]:
import re

In [243]:
conv = []
for i in Desc:
    conv = re.split(': ', i)

In [244]:
conv

['\n\nThis Afternoon',
 'Sunny, with a high near 59. West southwest wind between 11 and 13 mph. Tonight',
 'Partly cloudy, with a low around 51. West southwest wind between 5 and 9 mph. Thursday',
 'Mostly sunny, with a high near 57. Light wind becoming west between 15 and 18 mph. Winds could gust as high as 24 mph.Thursday Night',
 'Mostly clear, with a low around 52. West wind between 15 and 18 mph, with gusts as high as 24 mph. Friday',
 'Sunny, with a high near 59. Calm wind becoming west between 11 and 14 mph. Winds could gust as high as 18 mph.Friday Night',
 'Mostly clear, with a low around 52.  Saturday',
 'Sunny, with a high near 65.  Saturday Night',
 'Mostly clear, with a low around 54.  Sunday',
 'Sunny, with a high near 64.  Sunday Night',
 'Clear, with a low around 54.  Monday',
 'Sunny, with a high near 66.  Monday Night',
 'Mostly clear, with a low around 53.  Tuesday',
 'Sunny, with a high near 63.  ']

In [245]:
len(conv)

14

In [246]:
Description = []
for i in conv:
    Description.append(re.split(',', i))

In [247]:
Description = Description[1:10]

In [248]:
Description

[['Sunny',
  ' with a high near 59. West southwest wind between 11 and 13 mph. Tonight'],
 ['Partly cloudy',
  ' with a low around 51. West southwest wind between 5 and 9 mph. Thursday'],
 ['Mostly sunny',
  ' with a high near 57. Light wind becoming west between 15 and 18 mph. Winds could gust as high as 24 mph.Thursday Night'],
 ['Mostly clear',
  ' with a low around 52. West wind between 15 and 18 mph',
  ' with gusts as high as 24 mph. Friday'],
 ['Sunny',
  ' with a high near 59. Calm wind becoming west between 11 and 14 mph. Winds could gust as high as 18 mph.Friday Night'],
 ['Mostly clear', ' with a low around 52.  Saturday'],
 ['Sunny', ' with a high near 65.  Saturday Night'],
 ['Mostly clear', ' with a low around 54.  Sunday'],
 ['Sunny', ' with a high near 64.  Sunday Night']]

In [249]:
data = {
    'Period':Period,
    'Short Description':Short,
    'Temperature':Temperature,
    'Long Description':Description
}

In [250]:
Forecast = pd.DataFrame.from_dict(data, orient='index')
Forecast = Forecast.transpose()

In [251]:
Forecast = Forecast.dropna()

In [252]:
Forecast.to_csv('Forecast.csv', encoding='utf-8', header='Weather Forecast')

In [253]:
Forecast

Unnamed: 0,Period,Short Description,Temperature,Long Description
0,ThisAfternoon,Sunny,59 °F,"[Sunny, with a high near 59. West southwest w..."
1,Tonight,Partly Cloudy,51 °F,"[Partly cloudy, with a low around 51. West so..."
2,Thursday,Mostly Sunny,57 °F,"[Mostly sunny, with a high near 57. Light win..."
3,ThursdayNight,Mostly Clear,52 °F,"[Mostly clear, with a low around 52. West win..."
4,Friday,Sunny,59 °F,"[Sunny, with a high near 59. Calm wind becomi..."
5,FridayNight,Mostly Clear,52 °F,"[Mostly clear, with a low around 52. Saturday]"
6,Saturday,Sunny,65 °F,"[Sunny, with a high near 65. Saturday Night]"
7,SaturdayNight,Mostly Clear,54 °F,"[Mostly clear, with a low around 54. Sunday]"
8,Sunday,Sunny,64 °F,"[Sunny, with a high near 64. Sunday Night]"
