In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from tqdm import tqdm

import numpy as np
from fuzzywuzzy import fuzz, process

from collections import defaultdict

# This notebook is used to:
## 1) Scrape Cricinfo for a given year's tournament schedule (e.g. the IPL 2021 schedule)
## 2) Scrape squad details for a given season (e.g. the IPL 2021 squad lists)
## 3) Create a team-player mapping table and dump it to CSV

In [None]:
# Configuration shared by the cells below.
# Directory, relative to the working directory, that the input CSVs are read
# from and the generated CSVs are written to.
clean_data_path = os.path.join(os.pardir, "clean_data")
# Tournament label. NOTE(review): no use of this name is visible in this
# section of the notebook - confirm whether it is still needed.
tournament_name = "IPL"

In [None]:
# Load the team table and build a team_name -> team_id lookup.
team_csv_path = os.path.join(clean_data_path, "team.csv")
df_team = pd.read_csv(team_csv_path)
# Discard columns whose header starts with "Unnamed" (the name pandas gives
# to header-less columns when reading a CSV).
team_unnamed_mask = df_team.columns.str.contains('^Unnamed')
df_team = df_team.loc[:, ~team_unnamed_mask]
team_id_map = dict(zip(df_team["team_name"], df_team["team_id"]))

In [None]:
# Load the player table and build three lookups onto player_id, one per
# spelling of a player's name stored in the table.
player_csv_path = os.path.join(clean_data_path, "player.csv")
df_player = pd.read_csv(player_csv_path)
# Discard columns whose header starts with "Unnamed" (the name pandas gives
# to header-less columns when reading a CSV).
player_unnamed_mask = df_player.columns.str.contains('^Unnamed')
df_player = df_player.loc[:, ~player_unnamed_mask]
player_dispname_id_map = dict(zip(df_player["player_display_name"], df_player["player_id"]))
player_name_id_map = dict(zip(df_player["player_name"], df_player["player_id"]))
player_fullname_id_map = dict(zip(df_player["player_full_name"], df_player["player_id"]))

In [None]:
# Register "Dan Christian" as an alias that resolves to the same player_id as
# "Daniel Christian". Raises KeyError if "Daniel Christian" is not in the map.
# NOTE(review): presumably the scraped squad page uses the short form - confirm.
canonical_name, alias_name = "Daniel Christian", "Dan Christian"
player_name_id_map[alias_name] = player_name_id_map[canonical_name]

### Utility functions.

In [None]:
def get_from_obj(obj, key, default=""):
    """Return ``obj[key]``, falling back to ``default`` if the lookup fails.

    Args:
        obj: Any object supporting ``obj[key]`` (dict, list, ...). Objects
            that do not support subscripting (e.g. ``None``) also yield
            ``default`` rather than raising.
        key: Key or index to look up.
        default: Value returned when the lookup raises. Defaults to the empty
            string, which is what this helper has always returned.

    Returns:
        ``obj[key]`` when the lookup succeeds, otherwise ``default``.
    """
    try:
        return obj[key]
    except Exception:
        # Deliberately best-effort: any failure (missing key, bad index,
        # non-subscriptable object) means "no value available".
        return default

In [None]:
# Scrape the IPL 2021 fixture list from ESPNcricinfo and write it to
# schedule.csv, one row per match.
schedule = {}

URL = 'https://www.espncricinfo.com/series/ipl-2021-1249214/match-schedule-fixtures'
# The timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be parsed as if it were the fixtures page.
page = requests.get(URL, timeout=30)
page.raise_for_status()
page_content = BeautifulSoup(page.content, 'html.parser')
matches = page_content.find_all('div', class_='match-info match-info-FIXTURES')
if not matches:
    # Without this guard an empty result would overwrite schedule.csv with an
    # empty file further down.
    raise RuntimeError(f"No fixtures found at {URL}; the page layout may have changed")

# Candidate names for fuzzy matching, built once instead of once per lookup.
known_team_names = list(team_id_map.keys())
for match_no, match in enumerate(tqdm(matches), start=1):
    status = match.find('div', class_='status')
    match_time = status.find('span').text
    # The venue is taken as the second comma-separated field of the description.
    venue = match.find('div', class_='description').text.split(",")[1]
    teams = match.find_all('p', class_='name')
    # The scraped team names need not match team.csv exactly, so each is
    # resolved to the closest known team name before looking up its id.
    team_1_name = process.extractOne(teams[0].text, known_team_names)[0]
    team_2_name = process.extractOne(teams[1].text, known_team_names)[0]
    schedule[match_no] = {
        "match_no" : match_no,
        "time" : match_time,
        # NOTE(review): this removes every space, including those inside
        # multi-word venue names - confirm that is intended rather than strip().
        "venue" : venue.replace(' ', ''),
        "team_1" : team_id_map[team_1_name],
        "team_2" : team_id_map[team_2_name],
        "match_display_name" : f"{teams[0].text} vs {teams[1].text}"
    }
df_ipl_schedule_csv = pd.DataFrame.from_dict(schedule, "index")
df_ipl_schedule_csv.to_csv(os.path.join(clean_data_path, "schedule.csv"), index=False)

In [None]:
# Squad page URL for each team, keyed by team abbreviation. Every URL has the
# same shape and differs only in the numeric squad-page id.
_squad_url_template = "https://www.espncricinfo.com/ci/content/squad/{}.html"
_squad_page_ids = [
    ("csk", 1252150),
    ("dc", 1252198),
    ("kkr", 1252188),
    ("mi", 1252149),
    ("pk", 1252194),
    ("rr", 1252201),
    ("rcb", 1252176),
    ("srh", 1252199),
]
team_urls = {
    abbreviation: _squad_url_template.format(page_id)
    for abbreviation, page_id in _squad_page_ids
}

In [None]:
# Scrape every team's squad page, then each listed player's profile page, and
# collect one record per non-coach squad member into df_ipl_squads_csv.
ipl_squads_csv = {}
player_id = 1  # sequential id local to this scrape, not the player.csv id
# A single Session reuses the HTTP connection across the many page fetches.
with requests.Session() as session:
    for team in tqdm((team_urls), position=0, leave=True):
        team_url = team_urls[team]
        # Timeout + status check: fail fast instead of hanging or parsing an
        # error page as if it were a squad page.
        team_page = session.get(team_url, timeout=30)
        team_page.raise_for_status()
        team_page_content = BeautifulSoup(team_page.content, 'html.parser')
        main = team_page_content.find_all('div', class_='squads_main')
        # Take the heading text before " / " and drop its last six characters.
        # NOTE(review): presumably a trailing " Squad" suffix - confirm.
        team_name = main[0].find_all('h1')[0].text.split(" / ")[0][:-6]
        players = team_page_content.find_all('div', class_='large-13')
        for player in players:
            player_header = player.find('a')
            # Last path segment of the profile link minus its final five
            # characters (NOTE(review): presumably the ".html" extension).
            espn_player_id = player_header['href'].split("/")[-1][:-5]
            player_name = player_header.text.strip()
            player_display_name = "" # This needs to be updated later with fuzzy matching

            # Entries tagged 'coach' are skipped; a missing <h3> is treated
            # the same as a missing tag rather than raising AttributeError.
            role_heading = player.find('h3')
            player_type = role_heading.find('span') if role_heading is not None else None
            if player_type is not None and player_type.text == 'coach':
                continue

            player_url = 'https://www.espncricinfo.com/ci/content/player/' + espn_player_id +'.html'
            player_page = session.get(player_url, timeout=30)
            player_page.raise_for_status()
            player_page_content = BeautifulSoup(player_page.content, 'html.parser')
            player_infos = player_page_content.find('div', class_='player_overview-grid').find_all('div')
            player_info_map = {}
            for info in player_infos:
                label, value = info.find('p'), info.find('h5')
                # find_all('div') also returns nested divs; ignore any that do
                # not hold a label/value pair.
                if label is None or value is None:
                    continue
                player_info_map[label.text] = value.text
            ipl_squads_csv[player_id] = {
                "player_id" : player_id,
                "team_name" : team_name,
                "player_name" : player_name,
                "player_display_name" : player_display_name,
                # Looked up like the other optional fields so one profile
                # without a "Full Name" entry cannot abort the whole scrape.
                "player_full_name" : get_from_obj(player_info_map, "Full Name"),
                "batting_style" : get_from_obj(player_info_map, "Batting Style"),
                "bowling_style" : get_from_obj(player_info_map, "Bowling Style"),
                # NOTE(review): "playling_role" looks like a typo for
                # "playing_role"; kept because it is the output column name.
                "playling_role" : get_from_obj(player_info_map, "Playing Role")
            }
            player_id += 1
df_ipl_squads_csv = pd.DataFrame.from_dict(ipl_squads_csv, "index")
#df_ipl_squads_csv.to_csv(os.path.join(clean_data_path, "ipl_squads.csv"), index=False)

## Adding the players from the IPL 2021 squads to our main table
### Once every player is mapped, the cell below should print nothing (it prints one `add_player(...)` line for each unmapped player)

In [None]:
# Group the scraped squad members by team id, using the existing player ids.
# For every squad member whose name is not in player_name_id_map, print an
# add_player(...) line describing the missing player instead.
team_player_ipl = defaultdict(list)
not_mapped = 0  # number of squad members without an entry in player_name_id_map
for _, row in df_ipl_squads_csv.iterrows():
    # Resolved outside the try block: an unknown team name raises KeyError
    # directly here rather than from inside the exception handler.
    team_key = team_id_map[row['team_name']]
    try:
        player_id = player_name_id_map[row["player_name"]]
    except KeyError:
        # Only a missing player name is expected; anything else propagates.
        player_name = row['player_name']
        player_full_name = row['player_full_name']
        batting_style = row['batting_style']
        bowling_style = row['bowling_style']
        # NOTE(review): hard-coded for every unmapped player; presumably the
        # id of one country in a country table - confirm.
        country_id = '2'
        team_id = str(team_key)
        print(f'add_player("", "{player_name}", "{player_full_name}", "{batting_style}", "{bowling_style}", "", "{country_id}", "{team_id}")')
        not_mapped += 1
        continue
    team_player_ipl[team_key].append(player_id)

In [None]:
# Build one row per team with a column for every season from 2007 to 2021 and
# write the table to squad.csv. Only the "2021" column is filled, with the
# team's player ids joined by commas; 2007-2020 are written as empty strings.
# NOTE(review): this overwrites squad.csv, so any earlier-season values in an
# existing file are replaced by blanks - confirm that is intended.
blank_seasons = {str(year): "" for year in range(2007, 2021)}
team_squad_ipl = {}
for row_no, (team, squad_player_ids) in enumerate(team_player_ipl.items()):
    team_squad_ipl[row_no] = {
        "team_id" : str(team),
        **blank_seasons,
        "2021" : ",".join(map(str, squad_player_ids)),
    }
df_team_squad_ipl = pd.DataFrame.from_dict(team_squad_ipl, "index")
squad_csv_path = os.path.join(clean_data_path, "squad.csv")
df_team_squad_ipl.to_csv(squad_csv_path, index=False)