In [461]:
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import json
from furl import furl
from collections import defaultdict
import pandas as pd
import copy
import time

In [462]:
# Match-results page of every PSL season, keyed by the season's year (as a string).
# These can be replaced with the match-results URLs of another series and the
# script should still work.
psl_urls = {
    "2016": "https://www.espncricinfo.com/series/pakistan-super-league-2015-16-923069/match-results",
    "2017": "https://www.espncricinfo.com/series/psl-2016-17-1075974/match-results",
    "2018": "https://www.espncricinfo.com/series/psl-2017-18-1128817/match-results",
    "2019": "https://www.espncricinfo.com/series/psl-2018-19-1168814/match-results",
    "2020": "https://www.espncricinfo.com/series/psl-2019-20-2020-21-1211602/match-results",
}

In [463]:
# Extracting all Data
# Scrapes every season listed in psl_urls and gathers the returned rows into
# the single flat list all_rows, timing the whole run with wall-clock time.
# NOTE(review): get_psl_data_for_year is defined in a later cell of this
# notebook, so on a fresh kernel that cell (and the helpers it calls) must be
# run before this one.
start = time.time()
all_rows = []
for psl_year, url in psl_urls.items():
    # Each call returns a list of row dicts for one season.
    psl_data = get_psl_data_for_year(psl_year, url)
    all_rows.extend(psl_data)
end = time.time()

# Elapsed time is the difference of two time.time() readings, i.e. seconds.
print (f"Total Time Taken : {end-start}")
    

https://www.espncricinfo.com/series/pakistan-super-league-2015-16-923069/islamabad-united-vs-quetta-gladiators-final-959221/live-cricket-score
https://www.espncricinfo.com/series/pakistan-super-league-2015-16-923069/islamabad-united-vs-peshawar-zalmi-3rd-qualifying-final-959219/live-cricket-score
https://www.espncricinfo.com/series/pakistan-super-league-2015-16-923069/islamabad-united-vs-karachi-kings-2nd-qualifying-final-959217/live-cricket-score
https://www.espncricinfo.com/series/pakistan-super-league-2015-16-923069/peshawar-zalmi-vs-quetta-gladiators-1st-qualifying-final-959215/live-cricket-score
https://www.espncricinfo.com/series/pakistan-super-league-2015-16-923069/islamabad-united-vs-lahore-qalandars-20th-match-959213/live-cricket-score
https://www.espncricinfo.com/series/pakistan-super-league-2015-16-923069/karachi-kings-vs-peshawar-zalmi-19th-match-959211/live-cricket-score
https://www.espncricinfo.com/series/pakistan-super-league-2015-16-923069/lahore-qalandars-vs-quetta-gla

https://www.espncricinfo.com/series/psl-2017-18-1128817/multan-sultans-vs-karachi-kings-22nd-match-1128839/live-cricket-score
https://www.espncricinfo.com/series/psl-2017-18-1128817/peshawar-zalmi-vs-islamabad-united-21st-match-1128838/live-cricket-score
https://www.espncricinfo.com/series/psl-2017-18-1128817/multan-sultans-vs-lahore-qalandars-20th-match-1128837/live-cricket-score
https://www.espncricinfo.com/series/psl-2017-18-1128817/karachi-kings-vs-quetta-gladiators-19th-match-1128836/live-cricket-score
https://www.espncricinfo.com/series/psl-2017-18-1128817/islamabad-united-vs-lahore-qalandars-18th-match-1128835/live-cricket-score
https://www.espncricinfo.com/series/psl-2017-18-1128817/multan-sultans-vs-quetta-gladiators-17th-match-1128834/live-cricket-score
https://www.espncricinfo.com/series/psl-2017-18-1128817/peshawar-zalmi-vs-multan-sultans-16th-match-1128833/live-cricket-score
https://www.espncricinfo.com/series/psl-2017-18-1128817/islamabad-united-vs-karachi-kings-15th-matc

https://www.espncricinfo.com/series/psl-2019-20-2020-21-1211602/multan-sultans-vs-peshawar-zalmi-27th-match-1211668/live-cricket-score
https://www.espncricinfo.com/series/psl-2019-20-2020-21-1211602/karachi-kings-vs-lahore-qalandars-26th-match-1211667/live-cricket-score
https://www.espncricinfo.com/series/psl-2019-20-2020-21-1211602/multan-sultans-vs-quetta-gladiators-25th-match-1211666/live-cricket-score
https://www.espncricinfo.com/series/psl-2019-20-2020-21-1211602/lahore-qalandars-vs-peshawar-zalmi-24th-match-1211665/live-cricket-score
https://www.espncricinfo.com/series/psl-2019-20-2020-21-1211602/lahore-qalandars-vs-karachi-kings-23rd-match-1211664/live-cricket-score
https://www.espncricinfo.com/series/psl-2019-20-2020-21-1211602/islamabad-united-vs-multan-sultans-22nd-match-1211663/live-cricket-score
https://www.espncricinfo.com/series/psl-2019-20-2020-21-1211602/lahore-qalandars-vs-quetta-gladiators-21st-match-1211662/live-cricket-score
https://www.espncricinfo.com/series/psl-2

In [464]:
# Converting to Dataframe
# all_rows is a list of dicts, so each dict becomes one DataFrame row and its
# keys become the columns.
df = pd.DataFrame(all_rows)

In [465]:
# Saving Dataframe
# Writes psl.csv to the current working directory; index=False keeps the
# DataFrame's row index out of the file.
df.to_csv("psl.csv",index=False)

In [441]:
# For a given year of PSL, get all data
def get_psl_data_for_year(psl_year, url):
    """Scrape one season and return its rows.

    Resolves the match URLs listed on the results page `url`, downloads the
    data of every match, writes that raw data to ``psl_<psl_year>.json`` in the
    working directory, prints how many matches were dumped, and finally
    converts the match data into the list of row dicts that is returned.
    """
    season_matches = get_all_matches_data_from_url_list(get_all_url_for_psl_url(url))

    dump_path = f"psl_{psl_year}.json"
    with open(dump_path, "w") as dump_file:
        json.dump(season_matches, dump_file)

    print(f" JSON DUMP DATA LEN : {len(season_matches)}")
    return get_row_list_from_all_match_data(season_matches, psl_year)
    
    

In [442]:
# Given a PSL result page, get URL for all matches
def get_all_url_for_psl_url(url):
    """Return the match URLs linked from a series match-results page.

    The page at `url` is downloaded and the element
    ``div.league-scores-container`` > ``div.no-gutters`` is located. For every
    direct child tag of that element, the first ``<a>`` link is taken and its
    host is rewritten to ``www.espncricinfo.com``.

    Args:
        url: address of a match-results page.

    Returns:
        list[str]: one URL per child that contains a link, in page order.

    Raises:
        AttributeError: if the expected container elements are not in the page
            (``find`` returns ``None``).
    """
    content = requests.get(url).text
    soup = BeautifulSoup(content)
    main_div = soup.find("div", {"class": "league-scores-container"}).find("div", {"class": "no-gutters"})
    url_list = []
    # Only direct child *tags* are inspected: iterating main_div itself also
    # yields text nodes, on which .find("a") is str.find and returns an int.
    for child in main_div.find_all(True, recursive=False):
        anchor = child.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # A child without a link carries no match URL to collect.
            continue
        # `urlparse` is the name the notebook imports; `urllib` itself is never
        # bound, so `urllib.parse.urlparse` raises NameError on a fresh kernel.
        parsed = urlparse(href)
        # Force the public host, and default the scheme so that a relative or
        # scheme-less href still produces an absolute https URL.
        replaced = parsed._replace(scheme=parsed.scheme or "https", netloc="www.espncricinfo.com")
        url_list.append(replaced.geturl())
    return url_list
    

In [443]:
# Given a list of match URLs, get the data of every match
def get_all_matches_data_from_url_list(url_list):
    """Collect team names, result and per-innings data for each match URL.

    Every URL is printed as it is processed. The series id and match id are the
    trailing ``-``-separated tokens of the 5th and 6th ``/``-separated parts of
    the URL; they are used to build the comments API address for innings 1
    and 2.

    Returns:
        list[dict]: one dict per URL with the keys ``team_1``, ``team_2``,
        ``result`` and ``data`` (a mapping of innings number to the value
        returned by get_data_for_url).
    """
    collected = []
    for match_url in url_list:
        print(match_url)
        parts = match_url.split("/")
        series_id = parts[4].rsplit("-", 1)[-1]
        match_id = parts[5].rsplit("-", 1)[-1]
        innings_data = defaultdict(int)
        # The header is scraped before any API call is made for the match.
        header = get_winner_name(match_url)
        for inning_number in (1, 2):
            score_url = f"https://hs-consumer-api.espncricinfo.com/v1/pages/match/comments?seriesId={series_id}&matchId={match_id}&inningNumber={inning_number}&commentType=ALL"
            innings_data[inning_number] = get_data_for_url(score_url)
        team1, team2, result = header
        collected.append({
            "team_1": team1,
            "team_2": team2,
            "result": result,
            "data": innings_data,
        })
    return collected
    

In [444]:
# Get the winner name from the header
def get_winner_name(url):
    """Scrape both team names and the result label from a match page.

    Reads the ``div.match-info-MATCH`` header of the page at `url`, takes the
    two ``a.name-link`` entries inside ``div.teams`` as the team names and
    classifies the ``div.status-text`` content.

    Returns:
        tuple: ``(team1, team2, result)`` where ``result`` is ``"abondoned"``,
        ``"tied"`` or ``"no_result"`` when the status text contains
        "abandoned", "tied" or "No result" (checked in that order), and
        otherwise the first space-separated word of the status text.
    """
    page = BeautifulSoup(requests.get(url).text)
    header = page.find("div", {"class": "match-info-MATCH"})
    status = header.find("div", {"class": "status-text"})
    team_links = header.find("div", {"class": "teams"})
    # Unpacking requires exactly two team links in the header.
    first, second = team_links.find_all("a", {"class": "name-link"})
    team1, team2 = first.text, second.text

    status_text = status.text
    # (substring looked for, label returned). The first label is spelled
    # "abondoned" (sic) and is kept verbatim because it is emitted as data.
    special_results = (
        ("abandoned", "abondoned"),
        ("tied", "tied"),
        ("No result", "no_result"),
    )
    for marker, label in special_results:
        if marker in status_text:
            return team1, team2, label
    return team1, team2, status_text.split(" ")[0]

In [445]:
# Given a URL for an API call, get all of its Data
def get_data_for_url(url):
    """Download every page of ball comments behind a comments-API URL.

    The first request goes to `url` as given. While a response carries a truthy
    ``nextInningOver`` value, the next page is requested from
    ``{url}&fromInningOver=<that value>``.

    Args:
        url: comments API address of one innings. It is expected not to
            contain a ``fromInningOver`` parameter already, since one is
            appended for every page after the first.

    Returns:
        defaultdict(list): maps each comment's ``overNumber`` to the list of
        ball dicts (keys ``ball``, ``runs``, ``is_four``, ``is_six``,
        ``is_wicket``, ``wicket``, ``wicket_text``) in the order received.
    """
    overs = defaultdict(list)
    page_url = url
    while True:
        # Parse the body once and reuse it for both the comments and the cursor.
        json_data = requests.get(page_url).json()

        # A payload without "comments" contributes no balls instead of failing
        # on iteration over None.
        for comment in json_data.get("comments") or []:
            wicket_text = comment.get("dismissalText")
            if wicket_text:
                wicket_text = wicket_text.get("short")
            overs[comment.get("overNumber")].append({
                "ball": comment.get("ballNumber"),
                "runs": comment.get("totalRuns"),
                "is_four": comment.get("isFour"),
                "is_six": comment.get("isSix"),
                "is_wicket": comment.get("isWicket"),
                "wicket": comment.get("dismissalType"),
                "wicket_text": wicket_text,
            })

        next_over = json_data.get("nextInningOver")
        if not next_over:
            break
        # Rebuild the page address from the untouched base URL each time rather
        # than substring-replacing the previous cursor inside the URL.
        page_url = f"{url}&fromInningOver={next_over}"

    return overs
    

In [458]:
# Given all Data, Generate the rows for the CSV from extracted data
def get_row_list_from_all_match_data(all_match_data, psl_year, max_overs=20):
    """Flatten per-match data into one row dict per ball.

    Args:
        all_match_data: list of match dicts with the keys ``team_1``,
            ``team_2``, ``result`` and ``data``; ``data`` maps an innings
            identifier to a mapping of over number (int) to a list of ball
            dicts that have at least the keys ``runs`` and ``is_wicket``.
        psl_year: value stored in every row's ``psl_year`` column.
        max_overs: highest over number that is exported; overs 1..max_overs
            are read. Defaults to 20.

    Returns:
        list[dict]: a copy of every ball dict extended with ``psl_year``,
        ``match_number``, ``over``, ``inning``, ``team_1``, ``team_2``,
        ``result`` and the running ``total_runs`` / ``wickets`` of the innings.
        The input dicts are not modified.
    """
    all_rows = []
    # The list is walked back to front, so match_number 1 is its last entry.
    for match_number, match in enumerate(reversed(all_match_data), start=1):
        for inning, overs in match["data"].items():
            # Running totals restart with every innings.
            runs = 0
            wickets = 0
            for over in range(1, max_overs + 1):
                # Membership is tested before indexing so that a defaultdict
                # is not grown with empty overs.
                if over not in overs:
                    continue
                # Each over's list is read back to front
                # (presumably it is stored newest ball first - TODO confirm).
                for ball_result in reversed(overs[over]):
                    runs += ball_result["runs"]
                    if ball_result["is_wicket"]:
                        wickets += 1
                    row = copy.deepcopy(ball_result)
                    # Insertion order here fixes the column order of the rows.
                    row.update({
                        "psl_year": psl_year,
                        "match_number": match_number,
                        "over": over,
                        "inning": inning,
                        "team_1": match["team_1"],
                        "team_2": match["team_2"],
                        "result": match["result"],
                        "total_runs": runs,
                        "wickets": wickets,
                    })
                    all_rows.append(row)
    return all_rows

In [468]:
# Preview the first 20 rows of df.
# NOTE(review): the recorded output has an "Unnamed: 0" column, so df was
# presumably re-read from a CSV that had been saved with its index - confirm.
df.head(20)

Unnamed: 0,ball,runs,is_four,is_six,is_wicket,wicket,wicket_text,psl_year,match_number,over,inning,team_1,team_2,result,total_runs,wickets
0,1,0,False,False,False,,,2016,1,1,1,Islamabad United,Quetta Gladiators,Gladiators,0,0
1,2,0,False,False,False,,,2016,1,1,1,Islamabad United,Quetta Gladiators,Gladiators,0,0
2,3,0,False,False,False,,,2016,1,1,1,Islamabad United,Quetta Gladiators,Gladiators,0,0
3,4,0,False,False,False,,,2016,1,1,1,Islamabad United,Quetta Gladiators,Gladiators,0,0
4,5,0,False,False,False,,,2016,1,1,1,Islamabad United,Quetta Gladiators,Gladiators,0,0
5,6,0,False,False,False,,,2016,1,1,1,Islamabad United,Quetta Gladiators,Gladiators,0,0
6,1,0,False,False,False,,,2016,1,2,1,Islamabad United,Quetta Gladiators,Gladiators,0,0
7,2,1,False,False,False,,,2016,1,2,1,Islamabad United,Quetta Gladiators,Gladiators,1,0
8,3,2,False,False,False,,,2016,1,2,1,Islamabad United,Quetta Gladiators,Gladiators,3,0
9,4,0,False,False,False,,,2016,1,2,1,Islamabad United,Quetta Gladiators,Gladiators,3,0


In [475]:
# Rebinds df to the contents of psl_formated.csv, replacing the previous frame.
# NOTE(review): no visible cell writes psl_formated.csv (only psl.csv is
# saved), so the file presumably comes from outside this notebook - confirm.
df = pd.read_csv("./psl_formated.csv")

In [476]:
# Preview the first rows of the re-loaded frame (head() defaults to 5 rows).
df.head()

Unnamed: 0,psl_year,match_number,team_1,team_2,inning,over,ball,runs,total_runs,wickets,is_four,is_six,is_wicket,wicket,wicket_text,result
0,2016,1,Islamabad United,Quetta Gladiators,1,1,1,0,0,0,False,False,False,,,Gladiators
1,2016,1,Islamabad United,Quetta Gladiators,1,1,2,0,0,0,False,False,False,,,Gladiators
2,2016,1,Islamabad United,Quetta Gladiators,1,1,3,0,0,0,False,False,False,,,Gladiators
3,2016,1,Islamabad United,Quetta Gladiators,1,1,4,0,0,0,False,False,False,,,Gladiators
4,2016,1,Islamabad United,Quetta Gladiators,1,1,5,0,0,0,False,False,False,,,Gladiators


In [474]:
# (number of rows, number of columns) of df.
df.shape

(33962, 16)