# Importing the required modules

In [143]:
from bs4 import BeautifulSoup
import requests
import re
from time import sleep
from random import randint
import pandas as pd
from requests import get

# IMDB top 250 data (Single Page Scraping)

## Scraping the movie names and ratings using BeautifulSoup

In [144]:
# Download the IMDb Top 250 chart page and parse it with the lxml parser.
url = 'http://www.imdb.com/chart/top'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

In [145]:
# Table cells matched by the "td.titleColumn" CSS selector.
movies = soup.select('td.titleColumn')

# Each rating is taken from the "data-value" attribute of a matched
# <span name="ir">; .get() yields None when the attribute is absent.
rating_spans = soup.select('td.posterColumn span[name=ir]')
ratings = [span.attrs.get('data-value') for span in rating_spans]

## Extracting the name and rating and adding them to `lst` (a list)

In [146]:
# Build one {"movie_title", "rating"} record per title cell.
#
# After whitespace is collapsed, a cell's text is expected to look like
# "<rank>. <title> (<year>)". Only the leading rank and the trailing
# four-digit year are stripped, so:
#   * periods that belong to the title itself are preserved, and
#   * ranks of any width (1, 10, 100, ...) are removed completely.
title_pattern = re.compile(r'^\d+\.?\s*(.*?)\s*\(\d{4}\)$')

lst = []

for index, cell in enumerate(movies):
    movie = ' '.join(cell.get_text().split())
    match = title_pattern.match(movie)
    # Fall back to the whitespace-normalised text when the cell does not
    # follow the expected layout, rather than slicing it blindly.
    movie_title = match.group(1) if match else movie
    lst.append({"movie_title": movie_title,
                "rating": ratings[index],
                })

## Adding the items to a DataFrame

In [147]:
# Build the frame in a single constructor call from the scraped records
# instead of appending one row at a time (each append reallocates the frame).
# Ratings arrive as strings and are stored as floats rounded to 2 decimals.
df = pd.DataFrame(
    {
        "Movie": [movie['movie_title'] for movie in lst],
        "Rating": [round(float(movie['rating']), 2) for movie in lst],
    },
    columns=["Movie", "Rating"],
)
df

Unnamed: 0,Movie,Rating
0,The Shawshank Redemption,9.22
1,The Godfather,9.15
2,The Godfather: Part II,8.98
3,The Dark Knight,8.97
4,12 Angry Men,8.94
...,...,...
245,Three Colors: Red,8.02
246,Neon Genesis Evangelion: The End of Evangelion,8.02
247,Drishyam,8.02
248,Sunrise,8.02


# Players and Ranking from Cricbuzz - Multiple Page Web Scraping

In [148]:
# Ranking pages to scrape, in order: batting, bowling, all-rounder.
url = [
    'https://www.cricbuzz.com/cricket-stats/icc-rankings/men/batting',
    'https://www.cricbuzz.com/cricket-stats/icc-rankings/men/bowling',
    'https://www.cricbuzz.com/cricket-stats/icc-rankings/men/all-rounder',
]
# Only the batting page (url[0]) is downloaded and parsed here.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url[0], timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')



## Adding all the batsmen to a list (Test, ODI, T20)

In [149]:
# Collect the <div> elements of the batting page matched by this class string.
batsman_div_class = 'cb-col cb-col-50 cb-lst-itm-sm text-left'
batsman_containers = soup.find_all('div', class_=batsman_div_class)

In [150]:
# Text of the first <a> tag inside every batsman container, in page order.
batsmens = [container.a.get_text() for container in batsman_containers]
len(batsmens)
# Only this last expression is displayed as the cell output.
batsmens[198]

'Dawid Malan'

Adding the batsmen to the DataFrame according to format (Test, ODI and T20)

In [151]:
# NOTE(review): this empty frame is never read in the visible notebook; kept unchanged.
df = pd.DataFrame(columns=['Test', 'ODI', 'T20'])

# Split the flat, page-ordered list of names into one list per format.
# The offsets assume 98 Test names followed by 100 ODI and 100 T20 names.
# NOTE(review): these counts are hard-coded; if the page lists a different
# number of players per format the columns will be misaligned - confirm.
batsman_test_end, batsman_odi_end, batsman_t20_end = 98, 198, 298

if len(batsmens) < batsman_t20_end:
    raise IndexError(
        f"expected at least {batsman_t20_end} batsmen, found {len(batsmens)}"
    )

test_batsman = batsmens[:batsman_test_end]
odi_batsman = batsmens[batsman_test_end:batsman_odi_end]
t20_batsman = batsmens[batsman_odi_end:batsman_t20_end]

# The Test list is shorter than the other two; pad it with None so all three
# lists have the same number of rows when they are combined into one frame.
batsman_rows = max(len(odi_batsman), len(t20_batsman))
test_batsman += [None] * (batsman_rows - len(test_batsman))

len(test_batsman)

100

In [152]:
# Combine the three per-format lists row by row (zip stops at the shortest
# list) and load them into a frame with one column per format.
batsman_rows_by_rank = list(zip(test_batsman, odi_batsman, t20_batsman))
Batsman = pd.DataFrame(batsman_rows_by_rank, columns=['Test', "ODI", 'T20'])
Batsman.head(10)

Unnamed: 0,Test,ODI,T20
0,Steven Smith,Babar Azam,Dawid Malan
1,Kane Williamson,Virat Kohli,Aaron Finch
2,Marnus Labuschagne,Rohit Sharma,Babar Azam
3,Virat Kohli,Ross Taylor,Devon Conway
4,Joe Root,Aaron Finch,Virat Kohli
5,Rishabh Pant,Jonny Bairstow,Rassie van der Dussen
6,Rohit Sharma,Fakhar Zaman,KL Rahul
7,Henry Nicholls,Faf du Plessis,Glenn Maxwell
8,David Warner,David Warner,Martin Guptill
9,Quinton de Kock,Shai Hope,Mohammad Rizwan


## Adding the Bowlers and All-rounders (together) [from multiple page]

In [153]:
# Bowlers: download and parse the bowling rankings page (url[1]).
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops on an HTTP error instead of parsing an error page.
response1 = requests.get(url[1], timeout=30)
response1.raise_for_status()
soup1 = BeautifulSoup(response1.text, 'html.parser')

# All-rounders: download and parse the all-rounder rankings page (url[2]).
response2 = requests.get(url[2], timeout=30)
response2.raise_for_status()
soup2 = BeautifulSoup(response2.text, 'html.parser')

In [156]:
# <div> elements holding the player entries on the bowling (soup1) and
# all-rounder (soup2) pages; note the two pages are matched by different classes.
bowler_containers = soup1.find_all('div', class_='cb-col cb-col-67 cb-rank-plyr')
all_rounder_containers = soup2.find_all('div', class_='cb-col cb-col-50 cb-lst-itm-sm text-left')

# Player name = text of the first <a> tag inside each container, in page order.
bowlers = [container.a.get_text() for container in bowler_containers]
all_rounders = [container.a.get_text() for container in all_rounder_containers]

len(all_rounders)     # there are only 10 all rounders each in each format

30

Adding the bowlers and all-rounders, split by format, into pandas DataFrames

For Bowlers

In [161]:
# NOTE(review): this empty frame is never read in the visible notebook; kept unchanged.
df = pd.DataFrame(columns=['Test', 'ODI', 'T20'])

# Split the flat, page-ordered list of bowler names into one list per format.
# The offsets assume 99 Test names followed by 100 ODI and 100 T20 names.
# NOTE(review): these counts are hard-coded; if the page lists a different
# number of players per format the columns will be misaligned - confirm.
bowler_test_end, bowler_odi_end, bowler_t20_end = 99, 199, 299

if len(bowlers) < bowler_t20_end:
    raise IndexError(
        f"expected at least {bowler_t20_end} bowlers, found {len(bowlers)}"
    )

test_bowler = bowlers[:bowler_test_end]
odi_bowler = bowlers[bowler_test_end:bowler_odi_end]
t20_bowler = bowlers[bowler_odi_end:bowler_t20_end]

# The Test list is one entry shorter than the other two; pad it with None so
# all three lists have the same number of rows when combined into one frame.
bowler_rows = max(len(odi_bowler), len(t20_bowler))
test_bowler += [None] * (bowler_rows - len(test_bowler))

len(test_bowler)

100

In [162]:
# One row per ranking position, one column per format; zip pairs the three
# lists positionally and stops at the shortest of them.
format_columns = ['Test', "ODI", 'T20']
bowler_records = list(zip(test_bowler, odi_bowler, t20_bowler))
Bowler = pd.DataFrame(bowler_records, columns=format_columns)
Bowler.head(10)

Unnamed: 0,Test,ODI,T20
0,Pat Cummins,Trent Boult,Tabraiz Shamsi
1,Ravichandran Ashwin,Mehidy Hasan,Rashid Khan
2,Tim Southee,Mujeeb Ur Rahman,Ashton Agar
3,Josh Hazlewood,Matt Henry,Adil Rashid
4,Neil Wagner,Jasprit Bumrah,Mujeeb Ur Rahman
5,Kagiso Rabada,Kagiso Rabada,Tim Southee
6,Stuart Broad,Chris Woakes,Adam Zampa
7,James Anderson,Josh Hazlewood,Ish Sodhi
8,Mitchell Starc,Pat Cummins,Lakshan Sandakan
9,Jason Holder,Mustafizur Rahman,Wanindu Hasaranga


For All-rounders

In [163]:
# NOTE(review): this empty frame is never read in the visible notebook; kept unchanged.
df = pd.DataFrame(columns=['Test', 'ODI', 'T20'])

# Split the flat, page-ordered list of all-rounder names into one list per
# format: 10 Test names, then 10 ODI names, then 10 T20 names. All three
# lists have the same length, so no None padding is needed here.
all_rounder_test_end, all_rounder_odi_end, all_rounder_t20_end = 10, 20, 30

if len(all_rounders) < all_rounder_t20_end:
    raise IndexError(
        f"expected at least {all_rounder_t20_end} all-rounders, found {len(all_rounders)}"
    )

test_all_rounder = all_rounders[:all_rounder_test_end]
odi_all_rounder = all_rounders[all_rounder_test_end:all_rounder_odi_end]
t20_all_rounder = all_rounders[all_rounder_odi_end:all_rounder_t20_end]

len(test_all_rounder)

10

In [164]:
# Pair the Test/ODI/T20 all-rounder lists by ranking position (zip stops at
# the shortest list) and build the frame from the resulting row tuples.
all_rounder_records = [
    row for row in zip(test_all_rounder, odi_all_rounder, t20_all_rounder)
]
All_Rounder = pd.DataFrame(all_rounder_records, columns=['Test', "ODI", 'T20'])
All_Rounder.head(10)

Unnamed: 0,Test,ODI,T20
0,Ravindra Jadeja,Shakib Al Hasan,Mohammad Nabi
1,Jason Holder,Ben Stokes,Shakib Al Hasan
2,Ben Stokes,Mohammad Nabi,Glenn Maxwell
3,Ravichandran Ashwin,Chris Woakes,Richie Berrington
4,Shakib Al Hasan,Rashid Khan,Gareth Delany
5,Kyle Jamieson,Mitchell Santner,Khawar Ali
6,Mitchell Starc,Imad Wasim,Sean Williams
7,Pat Cummins,Colin de Grandhomme,Collins Obuya
8,Colin de Grandhomme,Ravindra Jadeja,Rohan Mustafa
9,Chris Woakes,Sean Williams,Zeeshan Maqsood
