In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# List of URLs to scrape
urls = [
    "https://www.espncricinfo.com/records/tournament/batting-most-runs-career/indian-premier-league-2024-15940",
    "https://www.espncricinfo.com/records/tournament/batting-most-runs-career/indian-premier-league-2023-15129",
    "https://www.espncricinfo.com/records/tournament/batting-most-runs-career/indian-premier-league-2022-14452",
    "https://www.espncricinfo.com/records/tournament/batting-most-runs-career/indian-premier-league-2021-13840"
]

# Function to scrape data from a URL and parse the table
def scrape_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Locate the table
    table = soup.select_one("table")  # Adjust the selector as needed
    if not table:
        return None  # Return None if no table is found

    rows = table.find_all("tr")
    table_data = []
    for row in rows:
        cols = row.find_all("td")
        cols = [col.get_text(strip=True) for col in cols]
        table_data.append(cols)

    return table_data

# Consolidate data from all URLs
all_data = []
for url in urls:
    table_data = scrape_data(url)
    if table_data:
        all_data.extend(table_data)

# Create a DataFrame and dynamically handle columns
df = pd.DataFrame(all_data)
df.columns = df.iloc[0]  # Use the first row as column headers
df = df[1:]  # Drop the header row from the data

# Save the data to a CSV file
#df.to_csv("ipl_most_runs.csv", index=False)

print("Data saved to ipl_most_runs.csv")


Data saved to ipl_most_runs.csv


In [3]:
df

Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s
1,V Kohli (RCB),2024-2024,15,15,3,741,113*,61.75,479,154.69,1,5,-,62,38
2,RD Gaikwad (CSK),2024-2024,14,14,3,583,108*,53.00,413,141.16,1,4,2,58,18
3,R Parag (RR),2024-2024,16,14,3,573,84*,52.09,384,149.21,-,4,-,40,33
4,TM Head (SRH),2024-2024,15,15,1,567,102,40.50,296,191.55,1,4,3,64,32
5,SV Samson (RR),2024-2024,16,15,4,531,86,48.27,346,153.46,-,5,1,48,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,KL Nagarkoti (KKR),2021-2021,1,1,-,-,-,0.00,2,0.00,-,-,1,0,0
600,Anuj Rawat (RR),2021-2021,2,1,-,-,-,0.00,1,0.00,-,-,1,0,0
601,K Yadav (RR),2021-2021,1,1,1,-,0*,0.00,4,0.00,-,-,-,0,0
602,JDS Neesham (MI),2021-2021,3,2,-,-,-,0.00,2,0.00,-,-,2,0,0


In [5]:
df['Span'] = df['Span'].str.split('-').str[0]

In [7]:
# Separate 'Player' and 'Team' into two columns
df[['Player Name', 'Team']] = df['Player'].str.extract(r'^(.*?)\s\((.*?)\)$')

# Drop the original 'Player' column if needed
df = df.drop(columns=['Player'])


In [9]:
df

Unnamed: 0,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Player Name,Team
1,2024,15,15,3,741,113*,61.75,479,154.69,1,5,-,62,38,V Kohli,RCB
2,2024,14,14,3,583,108*,53.00,413,141.16,1,4,2,58,18,RD Gaikwad,CSK
3,2024,16,14,3,573,84*,52.09,384,149.21,-,4,-,40,33,R Parag,RR
4,2024,15,15,1,567,102,40.50,296,191.55,1,4,3,64,32,TM Head,SRH
5,2024,16,15,4,531,86,48.27,346,153.46,-,5,1,48,24,SV Samson,RR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,2021,1,1,-,-,-,0.00,2,0.00,-,-,1,0,0,KL Nagarkoti,KKR
600,2021,2,1,-,-,-,0.00,1,0.00,-,-,1,0,0,Anuj Rawat,RR
601,2021,1,1,1,-,0*,0.00,4,0.00,-,-,-,0,0,K Yadav,RR
602,2021,3,2,-,-,-,0.00,2,0.00,-,-,2,0,0,JDS Neesham,MI


In [11]:
df['HS'] = df['HS'].str.replace('*', '', regex=False)

In [13]:
df

Unnamed: 0,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Player Name,Team
1,2024,15,15,3,741,113,61.75,479,154.69,1,5,-,62,38,V Kohli,RCB
2,2024,14,14,3,583,108,53.00,413,141.16,1,4,2,58,18,RD Gaikwad,CSK
3,2024,16,14,3,573,84,52.09,384,149.21,-,4,-,40,33,R Parag,RR
4,2024,15,15,1,567,102,40.50,296,191.55,1,4,3,64,32,TM Head,SRH
5,2024,16,15,4,531,86,48.27,346,153.46,-,5,1,48,24,SV Samson,RR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,2021,1,1,-,-,-,0.00,2,0.00,-,-,1,0,0,KL Nagarkoti,KKR
600,2021,2,1,-,-,-,0.00,1,0.00,-,-,1,0,0,Anuj Rawat,RR
601,2021,1,1,1,-,0,0.00,4,0.00,-,-,-,0,0,K Yadav,RR
602,2021,3,2,-,-,-,0.00,2,0.00,-,-,2,0,0,JDS Neesham,MI


In [15]:
df.dtypes

0
Span           object
Mat            object
Inns           object
NO             object
Runs           object
HS             object
Ave            object
BF             object
SR             object
100            object
50             object
0              object
4s             object
6s             object
Player Name    object
Team           object
dtype: object

In [17]:
df = df[["Player Name","Team","Span","Mat","Inns","NO","Runs","HS","Ave","SR","100","50","0","4s","6s"]]

In [19]:
df

Unnamed: 0,Player Name,Team,Span,Mat,Inns,NO,Runs,HS,Ave,SR,100,50,0,4s,6s
1,V Kohli,RCB,2024,15,15,3,741,113,61.75,154.69,1,5,-,62,38
2,RD Gaikwad,CSK,2024,14,14,3,583,108,53.00,141.16,1,4,2,58,18
3,R Parag,RR,2024,16,14,3,573,84,52.09,149.21,-,4,-,40,33
4,TM Head,SRH,2024,15,15,1,567,102,40.50,191.55,1,4,3,64,32
5,SV Samson,RR,2024,16,15,4,531,86,48.27,153.46,-,5,1,48,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,KL Nagarkoti,KKR,2021,1,1,-,-,-,0.00,0.00,-,-,1,0,0
600,Anuj Rawat,RR,2021,2,1,-,-,-,0.00,0.00,-,-,1,0,0
601,K Yadav,RR,2021,1,1,1,-,0,0.00,0.00,-,-,-,0,0
602,JDS Neesham,MI,2021,3,2,-,-,-,0.00,0.00,-,-,2,0,0


In [21]:
# Loop through columns excluding the first two (i.e., "Player" and "Team")
for col in df.columns[2:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')


In [23]:
df.dtypes

0
Player Name     object
Team            object
Span           float64
Mat            float64
Inns           float64
NO             float64
Runs           float64
HS             float64
Ave            float64
SR             float64
100            float64
50             float64
0              float64
4s             float64
6s             float64
dtype: object

In [25]:
# Loop through columns excluding the first two (i.e., "Player" and "Team")
for col in df.columns[2:8]:
    df[col] = df[col].fillna(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [27]:
df.dtypes

0
Player Name     object
Team            object
Span             int32
Mat              int32
Inns             int32
NO               int32
Runs             int32
HS               int32
Ave            float64
SR             float64
100            float64
50             float64
0              float64
4s             float64
6s             float64
dtype: object

In [29]:
# Loop through columns excluding the first two (i.e., "Player" and "Team")
for col in df.columns[10:]:
    df[col] = df[col].fillna(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [31]:
df.dtypes

0
Player Name     object
Team            object
Span             int32
Mat              int32
Inns             int32
NO               int32
Runs             int32
HS               int32
Ave            float64
SR             float64
100              int32
50               int32
0                int32
4s               int32
6s               int32
dtype: object

In [33]:
df

Unnamed: 0,Player Name,Team,Span,Mat,Inns,NO,Runs,HS,Ave,SR,100,50,0,4s,6s
1,V Kohli,RCB,2024,15,15,3,741,113,61.75,154.69,1,5,0,62,38
2,RD Gaikwad,CSK,2024,14,14,3,583,108,53.00,141.16,1,4,2,58,18
3,R Parag,RR,2024,16,14,3,573,84,52.09,149.21,0,4,0,40,33
4,TM Head,SRH,2024,15,15,1,567,102,40.50,191.55,1,4,3,64,32
5,SV Samson,RR,2024,16,15,4,531,86,48.27,153.46,0,5,1,48,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,KL Nagarkoti,KKR,2021,1,1,0,0,0,0.00,0.00,0,0,1,0,0
600,Anuj Rawat,RR,2021,2,1,0,0,0,0.00,0.00,0,0,1,0,0
601,K Yadav,RR,2021,1,1,1,0,0,0.00,0.00,0,0,0,0,0
602,JDS Neesham,MI,2021,3,2,0,0,0,0.00,0.00,0,0,2,0,0


In [35]:
df

Unnamed: 0,Player Name,Team,Span,Mat,Inns,NO,Runs,HS,Ave,SR,100,50,0,4s,6s
1,V Kohli,RCB,2024,15,15,3,741,113,61.75,154.69,1,5,0,62,38
2,RD Gaikwad,CSK,2024,14,14,3,583,108,53.00,141.16,1,4,2,58,18
3,R Parag,RR,2024,16,14,3,573,84,52.09,149.21,0,4,0,40,33
4,TM Head,SRH,2024,15,15,1,567,102,40.50,191.55,1,4,3,64,32
5,SV Samson,RR,2024,16,15,4,531,86,48.27,153.46,0,5,1,48,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,KL Nagarkoti,KKR,2021,1,1,0,0,0,0.00,0.00,0,0,1,0,0
600,Anuj Rawat,RR,2021,2,1,0,0,0,0.00,0.00,0,0,1,0,0
601,K Yadav,RR,2021,1,1,1,0,0,0.00,0.00,0,0,0,0,0
602,JDS Neesham,MI,2021,3,2,0,0,0,0.00,0.00,0,0,2,0,0


In [37]:
df = df.rename(columns={'Span': 'Year'})

In [39]:
df.dtypes

0
Player Name     object
Team            object
Year             int32
Mat              int32
Inns             int32
NO               int32
Runs             int32
HS               int32
Ave            float64
SR             float64
100              int32
50               int32
0                int32
4s               int32
6s               int32
dtype: object

In [41]:
df.to_csv("last_4years_batsmen.csv")