## Downloading steps
- url: https://cricsheet.org/downloads/
- Go to Test matches -> click on New (All section)
- Save the zip file in the same folder of this notebook
- Extract the zip file keeping the same name

## Imports

In [1]:
import pandas as pd
import glob

## Check for any errors

In [2]:
ls1 = glob.glob("tests_csv2\*.csv")
ls2 = []

for i in ls1:
    # ignore info files
    if "_info" not in i:
        try:
            ls2.append(pd.read_csv(i))
        except:
            print('error', i)

## Combine Match by Match CSV files

In [3]:
#importing multiple files in a directory
error_files = []
l = [pd.read_csv(filename) if "_info" not in filename else error_files.append(filename)  for filename in glob.glob("tests_csv2\*.csv")]
df = pd.concat(l, axis = 0)

## Sneak peak

In [4]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.1,South Africa,Australia,SC Cook,D Elgar,...,0,,,,,,,,,
1,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.2,South Africa,Australia,SC Cook,D Elgar,...,0,,,,,,,,,
2,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.3,South Africa,Australia,SC Cook,D Elgar,...,0,,,,,,,,,
3,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.4,South Africa,Australia,SC Cook,D Elgar,...,0,,,,,,caught,SC Cook,,
4,1000851,2016/17,2016-11-03,Western Australia Cricket Association Ground,1,0.5,South Africa,Australia,HM Amla,D Elgar,...,0,,,,,,,,,


In [5]:
df.season.unique()

array(['2016/17', 2016, 2017, '2017/18', 2018, '2018/19', 2019, '2019/20',
       2020, '2020/21', 2021, 2005, '2005/06', 2006, '2006/07', 2007,
       '2007/08', 2008, '2008/09', 2009, '2009/10', 2010, '2010/11', 2011,
       '2011/12', 2012, '2012/13', 2013, '2013/14', '2003/04', 2004,
       '2004/05', 2014, '2014/15', 2015, '2015/16'], dtype=object)

In [6]:
df.batting_team.unique()

array(['South Africa', 'Australia', 'Pakistan', 'Bangladesh',
       'New Zealand', 'India', 'West Indies', 'Zimbabwe', 'England',
       'Sri Lanka', 'Ireland', 'Afghanistan', 'ICC World XI'],
      dtype=object)

In [7]:
#matches by country
len(df[df.batting_team == 'India'].match_id.unique())

171

## Check for duplicates

In [8]:
len(df)-len(df.drop_duplicates())

1

## Correcting duplicates

In [9]:
df = df.drop_duplicates(keep = 'first')

In [10]:
len(df)-len(df.drop_duplicates())

0

## Sort by dates

In [11]:
df['start_date'] = pd.to_datetime(df['start_date'])

In [12]:
df = df.sort_values('start_date', ascending = True)

## Check after sorting

In [13]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
2411,64071,2003/04,2004-03-08,Galle International Stadium,4,43.6,Sri Lanka,Australia,UDU Chandana,WPUJC Vaas,...,0,,,,,,caught,UDU Chandana,,
827,64071,2003/04,2004-03-08,Galle International Stadium,2,69.3,Sri Lanka,Australia,TM Dilshan,HP Tillakaratne,...,0,,,,,,,,,
826,64071,2003/04,2004-03-08,Galle International Stadium,2,69.2,Sri Lanka,Australia,TM Dilshan,HP Tillakaratne,...,0,,,,,,,,,
825,64071,2003/04,2004-03-08,Galle International Stadium,2,69.1,Sri Lanka,Australia,TM Dilshan,HP Tillakaratne,...,0,,,,,,,,,
824,64071,2003/04,2004-03-08,Galle International Stadium,2,68.6,Sri Lanka,Australia,HP Tillakaratne,TM Dilshan,...,0,,,,,,,,,


In [14]:
df.tail()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
546,1239545,2021,2021-08-25,"Headingley, Leeds",2,48.6,England,India,H Hameed,RJ Burns,...,0,,,,,,,,,
545,1239545,2021,2021-08-25,"Headingley, Leeds",2,48.5,England,India,H Hameed,RJ Burns,...,0,,,,,,,,,
544,1239545,2021,2021-08-25,"Headingley, Leeds",2,48.4,England,India,RJ Burns,H Hameed,...,0,,,,,,,,,
554,1239545,2021,2021-08-25,"Headingley, Leeds",2,50.2,England,India,H Hameed,DJ Malan,...,0,,,,,,,,,
1653,1239545,2021,2021-08-25,"Headingley, Leeds",3,99.2,India,England,Mohammed Siraj,JJ Bumrah,...,0,,,,,,,,,


In [15]:
len(df)

1343973

## Export to single CSV

In [16]:
df.to_csv('TESTS_ball_by_ball.csv', index = False)