## Downloading steps
- url: https://cricsheet.org/downloads/
- Go to Test matches -> click on New (All section)
- Save the zip file in the same folder of this notebook
- Extract the zip file keeping the same name

## Imports

In [8]:
import pandas as pd
import glob

## Check for any errors

In [9]:
ls1 = glob.glob("tests_male_csv2\*.csv")
ls2 = []

for i in ls1:
    # ignore info files
    if "_info" not in i:
        try:
            ls2.append(pd.read_csv(i))
        except:
            print('error', i)

In [10]:
len(ls2)

690

## Combine Match by Match CSV files

In [11]:
#importing multiple files in a directory
error_files = []
l = [pd.read_csv(filename) if "_info" not in filename else error_files.append(filename)  for filename in glob.glob("tests_male_csv2\*.csv")]
df = pd.concat(l, axis = 0)

## Sneak peak

In [12]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.1,South Africa,Australia,SC Cook,D Elgar,...,0,,,,,,,,,
1,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.2,South Africa,Australia,SC Cook,D Elgar,...,0,,,,,,,,,
2,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.3,South Africa,Australia,SC Cook,D Elgar,...,0,,,,,,,,,
3,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.4,South Africa,Australia,SC Cook,D Elgar,...,0,,,,,,caught,SC Cook,,
4,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.5,South Africa,Australia,HM Amla,D Elgar,...,0,,,,,,,,,


In [13]:
df.season.unique()

array(['2016/17', 2016, 2017, '2017/18', 2018, '2018/19', 2019, '2019/20',
       2020, '2020/21', 2021, '2021/22', 2005, '2005/06', 2006, '2006/07',
       2007, '2007/08', 2008, '2008/09', 2009, '2009/10', 2010, '2010/11',
       2011, '2011/12', 2012, '2012/13', 2013, '2013/14', '2003/04', 2004,
       '2004/05', 2014, '2014/15', 2015, '2015/16'], dtype=object)

In [14]:
df.batting_team.unique()

array(['South Africa', 'Australia', 'Pakistan', 'Bangladesh',
       'New Zealand', 'India', 'West Indies', 'Zimbabwe', 'England',
       'Sri Lanka', 'Ireland', 'Afghanistan', 'ICC World XI'], dtype=object)

In [15]:
#matches by country
len(df[df.batting_team == 'India'].match_id.unique())

174

## Check for duplicates

In [16]:
len(df)-len(df.drop_duplicates())

1

## Correcting duplicates

In [17]:
df = df.drop_duplicates(keep = 'first')

In [18]:
len(df)-len(df.drop_duplicates())

0

## Sort by dates

In [19]:
df['start_date'] = pd.to_datetime(df['start_date'])

In [20]:
df = df.sort_values('start_date', ascending = True)

## Check after sorting

In [21]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
2151,64071,2003/04,2004-03-08,Galle International Stadium,4,0.6,Sri Lanka,Australia,MS Atapattu,KC Sangakkara,...,0,,,,,,,,,
1609,64071,2003/04,2004-03-08,Galle International Stadium,3,62.6,Australia,Sri Lanka,DR Martyn,ML Hayden,...,0,,,,,,,,,
1610,64071,2003/04,2004-03-08,Galle International Stadium,3,63.1,Australia,Sri Lanka,ML Hayden,DR Martyn,...,0,,,,,,,,,
1611,64071,2003/04,2004-03-08,Galle International Stadium,3,63.2,Australia,Sri Lanka,DR Martyn,ML Hayden,...,0,,,,,,,,,
1612,64071,2003/04,2004-03-08,Galle International Stadium,3,63.3,Australia,Sri Lanka,DR Martyn,ML Hayden,...,0,,,,,,,,,


In [22]:
df.tail()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
543,1277080,2021/22,2022-01-03,"The Wanderers Stadium, Johannesburg",2,26.2,South Africa,India,KD Petersen,D Elgar,...,0,,,,,,,,,
542,1277080,2021/22,2022-01-03,"The Wanderers Stadium, Johannesburg",2,26.1,South Africa,India,KD Petersen,D Elgar,...,0,,,,,,,,,
541,1277080,2021/22,2022-01-03,"The Wanderers Stadium, Johannesburg",2,25.6,South Africa,India,D Elgar,KD Petersen,...,0,,,,,,,,,
551,1277080,2021/22,2022-01-03,"The Wanderers Stadium, Johannesburg",2,27.4,South Africa,India,KD Petersen,D Elgar,...,0,,,,,,,,,
1648,1277080,2021/22,2022-01-03,"The Wanderers Stadium, Johannesburg",4,67.3,South Africa,India,D Elgar,T Bavuma,...,0,,,,,,,,,


In [23]:
len(df)

1356657

## Export to single CSV

In [None]:
df.to_csv('TESTS_ball_by_ball.csv', index = False)