# Joe Root: full match-wise Test batting data from Cricinfo

This dataset contains every Test innings Joe Root has played so far in his career, with the runs he scored in each. It is the master dataset: new innings get added here as he plays them.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import numpy as np
import os

In [2]:
# Project root = parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the project — TODO confirm)
root_dir = Path.cwd().parent

In [3]:
# Show the resolved project root so the data path built from it can be checked
root_dir

PosixPath('/Users/saral/Documents/cricket/cricket_gardens/Articles/Joe Root 16000 runs/will_root_get_to_16000')

In [4]:
# Location of the raw innings table, relative to the project root
raw_csv_path = root_dir / "data/stats/Joe_root_all.csv"
df = pd.read_csv(raw_csv_path)


In [5]:
# (rows, columns) of the raw table as loaded
df.shape

(295, 14)

In [6]:
# Column names as loaded; the 'Unnamed: …' columns had no header in the CSV
df.columns

Index(['Runs', 'Mins', 'BF', '4s', '6s', 'SR', 'Pos', 'Dismissal', 'Inns',
       'Unnamed: 9', 'Opposition', 'Ground', 'Start Date', 'Unnamed: 13'],
      dtype='object')

In [508]:
# Ground -> host country lookup.
# Written per host country for readability, then flattened to one entry per ground.
ground_country = {
    venue: host
    for host, venues in {
        'India': [
            'Nagpur', 'Rajkot', 'Visakhapatnam', 'Mohali', 'Wankhede',
            'Chennai', 'Ahmedabad', 'Hyderabad', 'Ranchi', 'Dharamsala',
        ],
        'New Zealand': [
            'Dunedin', 'Wellington', 'Auckland', 'Christchurch',
            'Mount Maunganui', 'Hamilton',
        ],
        'England': [
            "Lord's", 'Leeds', 'Nottingham', 'Manchester', 'Chester-le-Street',
            'The Oval', 'Southampton', 'Cardiff', 'Birmingham',
        ],
        'Australia': [
            'Brisbane', 'Adelaide', 'W.A.C.A', 'Melbourne', 'Sydney', 'Hobart',
        ],
        'West Indies': [
            'North Sound', "St George's", 'Bridgetown', 'Gros Islet',
        ],
        'UAE': ['Abu Dhabi', 'Dubai (DICS)', 'Sharjah'],
        'South Africa': [
            'Durban', 'Cape Town', 'Johannesburg', 'Centurion', 'Gqeberha',
        ],
        'Bangladesh': ['Chattogram', 'Mirpur'],
        'Sri Lanka': ['Galle', 'Pallekele', 'Colombo (SSC)'],
        'Pakistan': ['Rawalpindi', 'Multan', 'Karachi'],
    }.items()
    for venue in venues
}



Here I am creating my own database and cleaning it for future use. I can then reuse these same functions later to clean the runs table of any other cricketer on cricinfo.com.

Checklist of things to do - 
1. First check the runs column to see if any value is empty
1. Remove the `*` (not-out marker) from the values in the runs column so that it can become an integer column (done)

In [512]:
class PlayerDataCleaner:
    """
    Clean a Cricinfo innings-by-innings batting table for one player.

    The working table lives in ``self.df``. Two helper namespaces act on it:

    * ``cleaner.checks`` - read-only diagnostics.
    * ``cleaner.clean``  - cleaning steps; each step stores its result back in
      ``cleaner.df`` and also returns it.

    The frame handed to the constructor is copied, so the caller's original
    table is left untouched by every cleaning step.
    """

    def __init__(self, dataframe):
        # Work on a private copy: without it the first steps would modify the
        # caller's frame while later ones (after filtering) would not.
        self.df = dataframe.copy()
        # Bind the nested helper classes to this instance. After these two lines
        # ``self.checks`` / ``self.clean`` are instances, not the classes.
        self.checks = self.checks(self)
        self.clean = self.clean(self)

    # Checks
    class checks:
        """Read-only diagnostics on the cleaner's current table."""

        def __init__(self, outer):
            self.outer = outer

        def nan_in_runs(self):
            """
            Return the rows whose 'Runs' value is missing.

            Run this before converting 'Runs' to numbers, so that blank scores
            are spotted instead of being silently coerced.
            """
            return self.outer.df[self.outer.df['Runs'].isna()]

    # Cleaning
    class clean:
        """Cleaning steps; each updates ``outer.df`` and returns the updated frame."""

        # Dismissal labels (lower-cased) that count as a not-out innings.
        # 'retired notout' is the spelling present in the data; the spaced
        # variant is accepted as well.
        _NOT_OUT_LABELS = ('not out', 'retired notout', 'retired not out')

        def __init__(self, outer):
            self.outer = outer

        def remove_no_from_runs(self):
            """
            Strip the not-out marker '*' (and surrounding whitespace) from 'Runs'.

            Values are left as strings; missing scores stay missing rather than
            becoming the literal text 'nan' / 'None'.
            """
            df = self.outer.df
            raw_runs = df['Runs']
            stripped = (
                raw_runs.astype(str)
                .str.replace("*", "", regex=False)
                .str.strip()
            )
            # astype(str) stringifies missing values, so mask them back out.
            df['Runs'] = stripped.where(raw_runs.notna())
            self.outer.df = df
            return df

        def convert_cols_to_ints(self, columns):
            """
            Convert the given columns to numeric dtype.

            columns : list of column names

            Values that cannot be parsed become NaN. The resulting dtype is
            whatever ``pd.to_numeric`` infers, so a column containing NaN or
            decimals (e.g. 'SR') ends up as float, not int.
            """
            df = self.outer.df
            for col in columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            self.outer.df = df
            return df

        def remove_dnb_innings(self):
            """
            Drop innings whose 'Dismissal' is 'DNB' or '-'.

            The original row index is kept (no reset).
            """
            df = self.outer.df
            # .copy() makes the filtered frame independent, so later steps can
            # add columns without chained-assignment warnings.
            self.outer.df = df[~df['Dismissal'].isin(['DNB', '-'])].copy()
            return self.outer.df

        def create_100_column(self):
            """Add '100s': 1 where Runs >= 100, else 0. Needs numeric 'Runs'."""
            df = self.outer.df
            df['100s'] = np.where(df['Runs'] >= 100, 1, 0)
            self.outer.df = df
            return df

        def create_50_column(self):
            """Add '50s': 1 where 50 <= Runs < 100, else 0. Needs numeric 'Runs'."""
            df = self.outer.df
            df['50s'] = np.where((df['Runs'] >= 50) & (df['Runs'] < 100), 1, 0)
            self.outer.df = df
            return df

        def start_date_todate(self):
            """Parse 'Start Date' into datetimes; unparseable values become NaT."""
            df = self.outer.df
            df['Start Date'] = pd.to_datetime(df['Start Date'], errors='coerce')
            self.outer.df = df
            return df

        def venue_country(self, mapping=None):
            """
            Add 'Venue_Country' by looking up each 'Ground' in a ground -> country dict.

            mapping : dict, optional
                Lookup to use. Defaults to the notebook-level ``ground_country``.

            Grounds missing from the mapping get NaN.
            """
            if mapping is None:
                mapping = ground_country
            df = self.outer.df
            df['Venue_Country'] = df['Ground'].map(mapping)
            self.outer.df = df
            return df

        def remove_v_character(self):
            """
            Remove the leading 'v ' from the 'Opposition' column.

            Only the prefix itself is removed. ``str.lstrip('v ')`` strips a
            *set of characters* and would also eat the first letters of a name
            that starts with 'v'.
            """
            df = self.outer.df
            df['Opposition'] = (
                df['Opposition'].astype(str)
                .str.replace(r'^\s*v\s+', '', regex=True)
                .str.strip()
            )
            self.outer.df = df
            return df

        def add_home_away(self, home_country):
            """
            Label each innings as 'Home', 'Away', or 'Neutral'.

            * 'Home'    - venue country equals ``home_country``
            * 'Away'    - otherwise, venue country equals the opposition
            * 'Neutral' - everything else, including rows with no venue country

            Requires 'Venue_Country' and a cleaned 'Opposition' column.
            """
            df = self.outer.df
            venue = df['Venue_Country']
            # Missing venues compare as False / NA; force a plain boolean mask.
            is_home = np.asarray(venue.eq(home_country).fillna(False), dtype=bool)
            is_away = np.asarray(venue.eq(df['Opposition']).fillna(False), dtype=bool)
            # np.select takes the first matching condition, so Home wins over Away.
            df['Home_Away'] = np.select([is_home, is_away], ['Home', 'Away'], default='Neutral')
            self.outer.df = df
            return df

        def test_number(self, column_name):
            """
            Rename ``column_name`` to 'Test number' and drop the 'Test # ' prefix
            from its values. The values are kept as strings.
            """
            df = self.outer.df
            df = df.rename(columns={column_name: 'Test number'})
            # Remove only the literal prefix (lstrip would strip a character set).
            df['Test number'] = (
                df['Test number'].astype(str)
                .str.replace(r'^\s*Test\s*#\s*', '', regex=True)
                .str.strip()
            )
            self.outer.df = df
            return df

        def add_not_out_column(self):
            """
            Create a 'NO' column: 1 if 'Dismissal' is a not-out label
            ('not out', 'retired notout', 'retired not out'), else 0.
            Matching ignores case and surrounding whitespace.
            """
            df = self.outer.df
            labels = df['Dismissal'].astype(str).str.strip().str.lower()
            df['NO'] = labels.isin(self._NOT_OUT_LABELS).astype(int)
            self.outer.df = df
            return df
        # def strike_rate(self):
        #     df = self.outer.df
        #     df['SR'] = np.floor((df['Runs'] / df['BF']) * 100 * 100) / 100  # 2 decimals, rounded down
        #     self.outer.df = df
        #     return df











In [513]:
# Wrap the raw table: cleaning steps are then called as cleaner.clean.<step>()
# and diagnostics as cleaner.checks.<check>()
cleaner = PlayerDataCleaner(df)

In [514]:
# checks: look at the raw table before cleaning
# (note the '*' not-out marker in Runs, the 'v ' prefix in Opposition
#  and the 'Test # ' prefix in the last column)
df

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Unnamed: 9,Opposition,Ground,Start Date,Unnamed: 13
0,73,289,229,4,0,31.87,6,caught,1,,v India,Nagpur,13-Dec-12,Test # 2066
1,20*,64,56,1,1,35.71,6,not out,3,,v India,Nagpur,13-Dec-12,Test # 2066
2,4,17,11,1,0,36.36,6,caught,1,,v New Zealand,Dunedin,06-Mar-13,Test # 2077
3,0,4,2,0,0,0,7,run out,3,,v New Zealand,Dunedin,06-Mar-13,Test # 2077
4,10,24,20,1,0,50,6,caught,1,,v New Zealand,Wellington,14-Mar-13,Test # 2080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,104,331,199,10,0,52.26,4,bowled,1,,v India,Lord's,10-Jul-25,Test # 2594
291,40,140,96,1,0,41.66,4,bowled,3,,v India,Lord's,10-Jul-25,Test # 2594
292,150,349,248,14,0,60.48,4,stumped,2,,v India,Manchester,23-Jul-25,Test # 2596
293,29,59,45,6,0,64.44,4,lbw,2,,v India,The Oval,31-Jul-25,Test # 2598


In [520]:
# cleaning
# Steps are listed as (method name, positional arguments) and run in this order.
# Each call returns the cleaner's updated frame, which is rebound to `df`.
cleaning_steps = [
    ('remove_no_from_runs', ()),
    ('convert_cols_to_ints', (['Runs', '4s', '6s', 'BF', 'SR'],)),
    ('remove_dnb_innings', ()),
    ('create_100_column', ()),
    ('create_50_column', ()),
    ('start_date_todate', ()),
    ('venue_country', ()),
    ('remove_v_character', ()),
    ('add_home_away', ('England',)),
    ('test_number', ('Unnamed: 13',)),
    ('add_not_out_column', ()),
    # ('strike_rate', ()),
]
for step_name, step_args in cleaning_steps:
    df = getattr(cleaner.clean, step_name)(*step_args)
print(df.shape)


(288, 19)


In [521]:
# Sanity check on the labels: number of innings per Home / Away / Neutral bucket
df['Home_Away'].value_counts()

Home_Away
Home       147
Away       135
Neutral      6
Name: count, dtype: int64

In [522]:
# Inspect the cleaned table, including the columns added by the cleaning steps
df

Unnamed: 0,Runs,Mins,BF,4s,6s,SR,Pos,Dismissal,Inns,Unnamed: 9,Opposition,Ground,Start Date,Test number,100s,50s,Venue_Country,Home_Away,NO
0,73.0,289,229.0,4.0,0.0,31.87,6,caught,1,,India,Nagpur,2012-12-13,2066,0,1,India,Away,0
1,20.0,64,56.0,1.0,1.0,35.71,6,not out,3,,India,Nagpur,2012-12-13,2066,0,0,India,Away,1
2,4.0,17,11.0,1.0,0.0,36.36,6,caught,1,,New Zealand,Dunedin,2013-03-06,2077,0,0,New Zealand,Away,0
3,0.0,4,2.0,0.0,0.0,0.00,7,run out,3,,New Zealand,Dunedin,2013-03-06,2077,0,0,New Zealand,Away,0
4,10.0,24,20.0,1.0,0.0,50.00,6,caught,1,,New Zealand,Wellington,2013-03-14,2080,0,0,New Zealand,Away,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,104.0,331,199.0,10.0,0.0,52.26,4,bowled,1,,India,Lord's,2025-07-10,2594,1,0,England,Home,0
291,40.0,140,96.0,1.0,0.0,41.66,4,bowled,3,,India,Lord's,2025-07-10,2594,0,0,England,Home,0
292,150.0,349,248.0,14.0,0.0,60.48,4,stumped,2,,India,Manchester,2025-07-23,2596,1,0,England,Home,0
293,29.0,59,45.0,6.0,0.0,64.44,4,lbw,2,,India,The Oval,2025-07-31,2598,0,0,England,Home,0


In [535]:
# Career summary per opposition.
# Avg and SR are truncated (not rounded) to two decimals. The truncation is done
# on runs*100 / dismissals (resp. runs*10000 / balls) so that exact results such
# as 41/5 = 8.2 are not pushed down to 8.19 by floating-point error.
# A zero denominator (never dismissed / no balls faced) gives NaN instead of inf.
(
    df.groupby('Opposition')
    .agg(
        Matches=('Test number', 'nunique'),
        Innings=('Runs', 'count'),
        Total_Runs=('Runs', 'sum'),
        BF=('BF', 'sum'),
        Highest=('Runs', 'max'),
        Not_out=('NO', 'sum'),
        Hundreds=('100s', 'sum'),
        Fifties=('50s', 'sum'),
        Fours=('4s', 'sum'),
        Sixes=('6s', 'sum'),
    )
    .reset_index()
    .assign(
        SR=lambda t: np.floor(
            t['Total_Runs'] * 10_000 / t['BF'].replace(0, np.nan)
        ) / 100,
        Avg=lambda t: np.floor(
            t['Total_Runs'] * 100 / (t['Innings'] - t['Not_out']).replace(0, np.nan)
        ) / 100,
    )
    # Explicit column order: SR sits next to BF, Avg comes last
    .loc[:, [
        'Opposition', 'Matches', 'Innings', 'Total_Runs', 'BF', 'SR', 'Highest',
        'Not_out', 'Hundreds', 'Fifties', 'Fours', 'Sixes', 'Avg',
    ]]
)















Unnamed: 0,Opposition,Matches,Innings,Total_Runs,BF,SR,Highest,Not_out,Hundreds,Fifties,Fours,Sixes,Avg
0,Australia,34,65,2428.0,4825.0,50.32,180.0,5,4,18,270.0,12.0,40.46
1,Bangladesh,2,4,98.0,177.0,55.36,56.0,0,0,1,9.0,0.0,24.5
2,India,35,64,3383.0,6115.0,55.32,218.0,7,13,12,360.0,8.0,59.35
3,Ireland,2,3,89.0,130.0,68.46,56.0,0,0,1,7.0,1.0,29.66
4,New Zealand,21,40,1925.0,3329.0,57.82,226.0,4,6,9,213.0,11.0,53.47
5,Pakistan,18,31,1487.0,2410.0,61.7,262.0,3,2,7,147.0,0.0,53.1
6,South Africa,15,27,1210.0,1923.0,62.92,190.0,1,2,9,155.0,4.0,46.53
7,Sri Lanka,13,24,1376.0,2145.0,64.14,228.0,2,6,2,125.0,3.0,62.54
8,West Indies,17,29,1513.0,2458.0,61.55,182.0,2,6,7,162.0,6.0,56.03
9,Zimbabwe,1,1,34.0,44.0,77.27,34.0,0,0,0,3.0,0.0,34.0


In [495]:
# Number of innings per dismissal type across all oppositions
# (the not-out labels here are 'not out' and 'retired notout')
df['Dismissal'].value_counts()

Dismissal
caught            164
lbw                51
bowled             39
not out            23
run out             8
stumped             2
retired notout      1
Name: count, dtype: int64

In [533]:
# Career summary per host country.
# Avg and SR are truncated (not rounded) to two decimals. The truncation is done
# on runs*100 / dismissals (resp. runs*10000 / balls) so that exact results such
# as 41/5 = 8.2 are not pushed down to 8.19 by floating-point error.
# A zero denominator (never dismissed / no balls faced) gives NaN instead of inf.
(
    df.groupby('Venue_Country')
    .agg(
        Matches=('Test number', 'nunique'),
        Innings=('Runs', 'count'),
        Total_Runs=('Runs', 'sum'),
        BF=('BF', 'sum'),
        Highest=('Runs', 'max'),
        Not_out=('NO', 'sum'),
        Hundreds=('100s', 'sum'),
        Fifties=('50s', 'sum'),
        Fours=('4s', 'sum'),
        Sixes=('6s', 'sum'),
    )
    .reset_index()
    .assign(
        SR=lambda t: np.floor(
            t['Total_Runs'] * 10_000 / t['BF'].replace(0, np.nan)
        ) / 100,
        Avg=lambda t: np.floor(
            t['Total_Runs'] * 100 / (t['Innings'] - t['Not_out']).replace(0, np.nan)
        ) / 100,
    )
    # Explicit column order: SR sits next to BF, Avg comes last
    .loc[:, [
        'Venue_Country', 'Matches', 'Innings', 'Total_Runs', 'BF', 'SR', 'Highest',
        'Not_out', 'Hundreds', 'Fifties', 'Fours', 'Sixes', 'Avg',
    ]]
)

Unnamed: 0,Venue_Country,Matches,Innings,Total_Runs,BF,SR,Highest,Not_out,Hundreds,Fifties,Fours,Sixes,Avg
0,Australia,14,27,892.0,2015.0,44.26,89.0,2,0,9,91.0,0.0,35.68
1,Bangladesh,2,4,98.0,177.0,55.36,56.0,0,0,1,9.0,0.0,24.5
2,England,84,147,7329.0,12302.0,59.57,254.0,15,24,33,842.0,20.0,55.52
3,India,15,30,1272.0,2511.0,50.65,218.0,2,3,6,124.0,5.0,45.42
4,New Zealand,12,22,1006.0,1813.0,55.48,226.0,2,3,5,101.0,9.0,50.3
5,Pakistan,6,10,477.0,668.0,71.4,262.0,0,1,1,32.0,0.0,47.7
6,South Africa,8,15,703.0,1156.0,60.81,110.0,1,1,6,88.0,3.0,50.21
7,Sri Lanka,5,10,655.0,971.0,67.45,228.0,0,3,0,58.0,3.0,65.5
8,UAE,3,6,287.0,527.0,54.45,88.0,1,0,3,26.0,0.0,57.4
9,West Indies,9,17,824.0,1416.0,58.19,182.0,1,4,2,80.0,5.0,51.5


In [63]:
# Step 1: work on a text view of Runs (the column may be numeric or mixed) with
# the '*' marker and surrounding whitespace removed
runs_text = df['Runs'].astype(str).str.replace(r"\*", "", regex=True).str.strip()
# Step 2: keep only the first integer-like run of characters
# (copes with values such as '248*' or '248*†')
runs_digits = runs_text.str.extract(r'(-?\d+)')[0]
# Step 3: store as nullable integers, so rows without a score show up as <NA>
df['Runs'] = pd.to_numeric(runs_digits, errors='coerce').astype('Int64')


In [64]:
# Drop did-not-bat innings. Both markers that
# PlayerDataCleaner.clean.remove_dnb_innings filters on ('DNB' and '-') are
# removed, so this cell and the cleaner class agree.
# .copy() makes the result an independent frame that later cells can modify safely.
df = df[~df['Dismissal'].isin(['DNB', '-'])].copy()



In [65]:
# (rows, columns) after dropping the did-not-bat innings
df.shape

(288, 14)

In [92]:
dismissals = df['Dismissal']

# innings batted
total_innings = df.shape[0]
# innings that ended 'not out'
total_not_out = int(dismissals.eq('not out').sum())
# innings that ended 'retired notout'
total_retired = int(dismissals.eq('retired notout').sum())
# every innings in which he was not dismissed
total_notout_innings = total_not_out + total_retired
# career runs
total_runs = df['Runs'].sum()


In [93]:
# Number of innings recorded as 'retired notout'
total_retired

1

In [94]:
# Two adjacent f-strings are joined by Python into one message
print(
    f"Joe Root has played {total_innings} innings,"
    f" with {total_notout_innings} not out innings and total run scored of {total_runs} runs."
)

Joe Root has played 288 innings, with 24 not out innings and total run scored of 13543 runs.


In [95]:
# Batting average = runs scored / number of dismissals
# (not-out innings are taken out of the denominator)
career_average = total_runs / (total_innings - total_notout_innings)

In [97]:
# Report the average rounded to three decimals
print(f"Joe Root's career average is {round(career_average,3)}")

Joe Root's career average is 51.299
