## Downloading steps
- url: https://cricsheet.org/downloads/
- Go to Test matches -> click on New (All section)
- Save the zip file in the same folder of this notebook
- Extract the zip file keeping the same name

## Imports

In [1]:
import pandas as pd
import glob

## Check for any errors

In [2]:
ls1 = glob.glob("odis_male_csv2\*.csv")
ls2 = []

for i in ls1:
    # ignore info files
    if "_info" not in i:
        try:
            ls2.append(pd.read_csv(i))
        except:
            print('error', i)

In [3]:
len(ls2)

1959

## Combine Match by Match CSV files

In [4]:
#importing multiple files in a directory
error_files = []
l = [pd.read_csv(filename) if "_info" not in filename else error_files.append(filename)  for filename in glob.glob("odis_male_csv2\*.csv")]
df = pd.concat(l, axis = 0)

## Sneak peak

In [5]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1000887,2016/17,2017-01-13,"Brisbane Cricket Ground, Woolloongabba",1,0.1,Australia,Pakistan,DA Warner,TM Head,...,0,,,,,,,,,
1,1000887,2016/17,2017-01-13,"Brisbane Cricket Ground, Woolloongabba",1,0.2,Australia,Pakistan,DA Warner,TM Head,...,0,,,,,,,,,
2,1000887,2016/17,2017-01-13,"Brisbane Cricket Ground, Woolloongabba",1,0.3,Australia,Pakistan,DA Warner,TM Head,...,0,,,,,,,,,
3,1000887,2016/17,2017-01-13,"Brisbane Cricket Ground, Woolloongabba",1,0.4,Australia,Pakistan,DA Warner,TM Head,...,0,,,,,,,,,
4,1000887,2016/17,2017-01-13,"Brisbane Cricket Ground, Woolloongabba",1,0.5,Australia,Pakistan,DA Warner,TM Head,...,1,1.0,,,,,,,,


In [6]:
df.season.unique()

array(['2016/17', 2016, 2017, '2017/18', 2018, '2018/19', 2019, '2019/20',
       2020, '2020/21', 2021, '2021/22', 2005, '2005/06', 2006, '2006/07',
       2007, '2007/08', 2008, '2008/09', 2009, '2009/10', 2010, '2010/11',
       2011, '2011/12', 2012, '2012/13', 2013, '2013/14', '2003/04', 2004,
       '2004/05', '2014/15', 2014, 2015, '2015/16'], dtype=object)

In [7]:
df.batting_team.unique()

array(['Australia', 'Pakistan', 'New Zealand', 'Afghanistan', 'Scotland',
       'Hong Kong', 'Zimbabwe', 'India', 'Bangladesh', 'South Africa',
       'England', 'Sri Lanka', 'Papua New Guinea', 'West Indies',
       'Ireland', 'United Arab Emirates', 'Nepal',
       'United States of America', 'Namibia', 'Oman', 'Netherlands',
       'Africa XI', 'Asia XI', 'ICC World XI', 'Bermuda', 'Canada', 'Kenya'], dtype=object)

In [8]:
#matches by country
len(df[df.batting_team == 'India'].match_id.unique())

421

## Check for duplicates

In [9]:
len(df)-len(df.drop_duplicates())

9

## Correcting duplicates

In [10]:
df = df.drop_duplicates(keep = 'first')

In [11]:
len(df)-len(df.drop_duplicates())

0

## Sort by dates

In [12]:
df['start_date'] = pd.to_datetime(df['start_date'])

In [13]:
df = df.sort_values('start_date', ascending = True)

## Check after sorting

In [14]:
df.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
279,64857,2003/04,2004-01-03,Eden Park,1,46.1,Pakistan,New Zealand,Abdul Razzaq,Moin Khan,...,0,,,,,,,,,
29,64857,2003/04,2004-01-03,Eden Park,1,4.6,Pakistan,New Zealand,Yasir Hameed,Imran Farhat,...,0,,,,,,,,,
30,64857,2003/04,2004-01-03,Eden Park,1,5.1,Pakistan,New Zealand,Imran Farhat,Yasir Hameed,...,0,,,,,,,,,
31,64857,2003/04,2004-01-03,Eden Park,1,5.2,Pakistan,New Zealand,Imran Farhat,Yasir Hameed,...,0,,,,,,,,,
32,64857,2003/04,2004-01-03,Eden Park,1,5.3,Pakistan,New Zealand,Imran Farhat,Yasir Hameed,...,0,,,,,,,,,


In [15]:
df.tail()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
192,1295183,2021/22,2022-01-25,"West End Park International Cricket Stadium, Doha",1,31.1,Afghanistan,Netherlands,Rahmat Shah,Hashmatullah Shahidi,...,0,,,,,,,,,
193,1295183,2021/22,2022-01-25,"West End Park International Cricket Stadium, Doha",1,31.2,Afghanistan,Netherlands,Hashmatullah Shahidi,Rahmat Shah,...,0,,,,,,,,,
194,1295183,2021/22,2022-01-25,"West End Park International Cricket Stadium, Doha",1,31.3,Afghanistan,Netherlands,Rahmat Shah,Hashmatullah Shahidi,...,0,,,,,,,,,
188,1295183,2021/22,2022-01-25,"West End Park International Cricket Stadium, Doha",1,30.3,Afghanistan,Netherlands,Hashmatullah Shahidi,Rahmat Shah,...,0,,,,,,,,,
572,1295183,2021/22,2022-01-25,"West End Park International Cricket Stadium, Doha",2,42.4,Netherlands,Afghanistan,VJ Kingma,BD Glover,...,0,,,,,,lbw,VJ Kingma,,


In [16]:
len(df)

1043681

## Export to single CSV

In [17]:
df.to_csv('ODIs_ball_by_ball.csv', index = False)