## Imports

In [1]:
import glob
import pandas as pd

In [2]:
# Load every ball-by-ball CSV in psl_csv2/, skipping the companion "_info" files.
# A forward slash is used as the separator because it is valid on Windows, Linux
# and macOS alike; a backslash separator only matches on Windows.
# glob returns files in an OS-dependent order, so sorted() keeps the load order
# (and therefore the row order of the combined frame) deterministic.
l = [
    pd.read_csv(filename)
    for filename in sorted(glob.glob("psl_csv2/*.csv"))
    if "_info" not in filename
]
# Stack the per-file frames row-wise into a single DataFrame.
df = pd.concat(l, axis=0)

## Check for duplicates & errors

In [3]:
# List the distinct season labels to spot inconsistent encodings.
df["season"].unique()

array(['2016/17', '2017/18', '2018/19', '2019/20', '2020/21', 2021,
       '2021/22', '2015/16', '2021'], dtype=object)

In [4]:
# Number of rows that are exact duplicates of an earlier row.
df.shape[0] - df.drop_duplicates().shape[0]

45381

## Correcting the errors

In [5]:
# Normalise `season` to a single integer year: a split season such as '2021/22'
# becomes the year in which it ends (2022), and the string '2021' becomes 2021.
# The mapping is applied to the `season` column only, so values in every other
# column are left untouched.
season_to_year = {
    '2015/16': 2016,
    '2016/17': 2017,
    '2017/18': 2018,
    '2018/19': 2019,
    '2019/20': 2020,
    '2020/21': 2021,
    '2021': 2021,
    '2021/22': 2022,
}
# Values that are not keys of the mapping (e.g. seasons already stored as
# integers) pass through unchanged. The explicit cast makes the integer dtype
# independent of pandas' implicit downcasting and raises if an unmapped,
# non-numeric season label is still present.
df['season'] = (
    df['season']
    .map(lambda season: season_to_year.get(season, season))
    .astype('int64')
)

In [6]:
# Confirm that every season label is now an integer year.
df["season"].unique()

array([2017, 2018, 2019, 2020, 2021, 2022, 2016], dtype=int64)

In [7]:
# Keep only the first occurrence of each fully identical row, then confirm
# that no duplicates remain (the expression below should evaluate to 0).
df = df.drop_duplicates()
df.shape[0] - df.drop_duplicates().shape[0]

0

## Sort by datetime

In [8]:
# Parse `start_date` into datetimes, then order the rows chronologically:
# by date, then match, then innings, then ball number (all ascending).
sort_keys = ['start_date', 'match_id', 'innings', 'ball']
df = (
    df
    .assign(start_date=pd.to_datetime(df['start_date']))
    .sort_values(by=sort_keys)
)

In [9]:
# Replace the index with a fresh 0..n-1 range (discarding the old labels)
# and write the combined frame to disk without an index column.
df = df.reset_index(drop=True)
df.to_csv('PSL_ball_by_ball.csv', index=False)

In [10]:
# Total number of rows in the cleaned data set.
df.shape[0]

50097

In [11]:
# Sneak peek at the first five rows.
df.head(n=5)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,959175,2016,2016-02-04,Dubai International Cricket Stadium,1,0.1,Islamabad United,Quetta Gladiators,SR Watson,Sharjeel Khan,...,0,,,,,,,,,
1,959175,2016,2016-02-04,Dubai International Cricket Stadium,1,0.2,Islamabad United,Quetta Gladiators,SR Watson,Sharjeel Khan,...,0,,,,,,,,,
2,959175,2016,2016-02-04,Dubai International Cricket Stadium,1,0.3,Islamabad United,Quetta Gladiators,SR Watson,Sharjeel Khan,...,0,,,,,,,,,
3,959175,2016,2016-02-04,Dubai International Cricket Stadium,1,0.4,Islamabad United,Quetta Gladiators,SR Watson,Sharjeel Khan,...,0,,,,,,,,,
4,959175,2016,2016-02-04,Dubai International Cricket Stadium,1,0.5,Islamabad United,Quetta Gladiators,SR Watson,Sharjeel Khan,...,0,,,,,,,,,
