## Imports

In [1]:
import pandas as pd
import numpy as np
import glob
import io

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

## Matches.csv

In [2]:
# list of all matches
matches_list = [filename for filename in glob.glob("ipl_csv\*.csv")]

# total matches
matches_count = len(matches_list)

In [3]:
all_col_names = []

# match id, cols count, cols
raw_df = pd.DataFrame(index=np.arange(matches_count), columns=['m_id', 'cols_count', 'cols'])

for i in range(matches_count):
    raw_df['m_id'][i] = matches_list[i].split('\\')[1].split('.')[0]
    
    # 22, 21, 20, 19, 18
    match_path = matches_list[i]
    try:
        temp_df = pd.read_csv(match_path, nrows = 21, skiprows=1, header=None)
    except:
        try:
            temp_df = pd.read_csv(match_path, nrows = 20, skiprows=1, header=None)
        except:
            try:
                temp_df = pd.read_csv(match_path, nrows = 19, skiprows=1, header=None)
            except:
                try:
                    temp_df = pd.read_csv(match_path, nrows = 18, skiprows=1, header=None)
                except:
                    try:
                        temp_df = pd.read_csv(match_path, nrows = 17, skiprows=1, header=None)
                    except:
                        print(f"Error: {matches_list[i]}")
                        
    # transformation to get all the columns without losing duplicates
    temp_df.columns = ['col0', 'col1', 'col2']
    mask = temp_df['col1'].duplicated(keep=False)
    temp_df.loc[mask, 'col1'] += temp_df.groupby('col1').cumcount().add(1).astype(str)
    cols_list = list(temp_df.col1)
    
    # add column count
    raw_df['cols_count'][i] = len(cols_list)
    
    # capture all column names
    all_col_names.append(cols_list)
    
    # convert list to string
    raw_df['cols'][i] = str(cols_list)

flat_list = [item for sublist in all_col_names for item in sublist]
all_unique_col_names = list(set(flat_list))

# an empty dataframe with all unique col names
df = pd.DataFrame(columns=all_unique_col_names)

In [4]:
for i in range(matches_count):
    # 22, 21, 20, 19, 18
    match_path = matches_list[i]
    try:
        temp_df = pd.read_csv(match_path, nrows = 21, skiprows=1, header=None)
    except:
        try:
            temp_df = pd.read_csv(match_path, nrows = 20, skiprows=1, header=None)
        except:
            try:
                temp_df = pd.read_csv(match_path, nrows = 19, skiprows=1, header=None)
            except:
                try:
                    temp_df = pd.read_csv(match_path, nrows = 18, skiprows=1, header=None)
                except:
                    try:
                        temp_df = pd.read_csv(match_path, nrows = 17, skiprows=1, header=None)
                    except:
                        print(f"Error: {matches_list[i]}")
                        
    # transformation to get all the columns without losing duplicates
    temp_df.columns = ['col0', 'col1', 'col2']
    mask = temp_df['col1'].duplicated(keep=False)
    temp_df.loc[mask, 'col1'] += temp_df.groupby('col1').cumcount().add(1).astype(str)
    
    tempdf_keys = list(temp_df.col1)
    tempdf_values = list(temp_df.col2)                    
    
    temp_dict = dict(zip(tempdf_keys, tempdf_values))
    temp_df = pd.DataFrame([temp_dict])
    
    df = pd.concat([df, temp_df], axis = 0)

In [5]:
df.head()

Unnamed: 0,outcome,event,date,gender,reserve_umpire,team1,toss_winner,umpire2,match_referee,season,team2,winner_runs,balls_per_over,player_of_match,date1,neutralvenue,venue,method,toss_decision,city,tv_umpire,date2,winner,umpire1,match_number,eliminator,winner_wickets
0,,Indian Premier League,2017/04/05,male,N Pandit,Sunrisers Hyderabad,Royal Challengers Bangalore,NJ Llong,J Srinath,2017,Royal Challengers Bangalore,35.0,6,Yuvraj Singh,,,"Rajiv Gandhi International Stadium, Uppal",,field,Hyderabad,A Deshmukh,,Sunrisers Hyderabad,AY Dandekar,1,,
0,,Indian Premier League,2017/04/06,male,Navdeep Singh,Rising Pune Supergiant,Rising Pune Supergiant,S Ravi,M Nayyar,2017,Mumbai Indians,,6,SPD Smith,,,Maharashtra Cricket Association Stadium,,field,Pune,VK Sharma,,Rising Pune Supergiant,A Nand Kishore,2,,7.0
0,,Indian Premier League,2017/04/07,male,K Srinivasan,Gujarat Lions,Kolkata Knight Riders,CK Nandan,V Narayan Kutty,2017,Kolkata Knight Riders,,6,CA Lynn,,,Saurashtra Cricket Association Stadium,,field,Rajkot,YC Barde,,Kolkata Knight Riders,Nitin Menon,3,,10.0
0,,Indian Premier League,2017/04/08,male,R Pandit,Kings XI Punjab,Kings XI Punjab,C Shamshuddin,Chinmay Sharma,2017,Rising Pune Supergiant,,6,GJ Maxwell,,,Holkar Cricket Stadium,,field,Indore,KN Ananthapadmanabhan,,Kings XI Punjab,AK Chaudhary,4,,6.0
0,,Indian Premier League,2017/04/08,male,Navdeep Singh,Royal Challengers Bangalore,Royal Challengers Bangalore,VK Sharma,J Srinath,2017,Delhi Daredevils,15.0,6,KM Jadhav,,,M.Chinnaswamy Stadium,,bat,Bengaluru,A Nand Kishore,,Royal Challengers Bangalore,S Ravi,5,,


In [6]:
# if date is missing -> add date from date1
df['date'] = df.apply(lambda x: x['date1'] if pd.isnull(x['date']) else x['date'], axis = 1)

In [7]:
# format date from string to datetime object
df['date'] = df['date'].astype('datetime64[ns]')

In [8]:
# matchId list
matchId_list = list(raw_df.m_id)

In [9]:
df['matchId'] = matchId_list

In [10]:
df.head()

Unnamed: 0,outcome,event,date,gender,reserve_umpire,team1,toss_winner,umpire2,match_referee,season,team2,winner_runs,balls_per_over,player_of_match,date1,neutralvenue,venue,method,toss_decision,city,tv_umpire,date2,winner,umpire1,match_number,eliminator,winner_wickets,matchId
0,,Indian Premier League,2017-04-05,male,N Pandit,Sunrisers Hyderabad,Royal Challengers Bangalore,NJ Llong,J Srinath,2017,Royal Challengers Bangalore,35.0,6,Yuvraj Singh,,,"Rajiv Gandhi International Stadium, Uppal",,field,Hyderabad,A Deshmukh,,Sunrisers Hyderabad,AY Dandekar,1,,,1082591
0,,Indian Premier League,2017-04-06,male,Navdeep Singh,Rising Pune Supergiant,Rising Pune Supergiant,S Ravi,M Nayyar,2017,Mumbai Indians,,6,SPD Smith,,,Maharashtra Cricket Association Stadium,,field,Pune,VK Sharma,,Rising Pune Supergiant,A Nand Kishore,2,,7.0,1082592
0,,Indian Premier League,2017-04-07,male,K Srinivasan,Gujarat Lions,Kolkata Knight Riders,CK Nandan,V Narayan Kutty,2017,Kolkata Knight Riders,,6,CA Lynn,,,Saurashtra Cricket Association Stadium,,field,Rajkot,YC Barde,,Kolkata Knight Riders,Nitin Menon,3,,10.0,1082593
0,,Indian Premier League,2017-04-08,male,R Pandit,Kings XI Punjab,Kings XI Punjab,C Shamshuddin,Chinmay Sharma,2017,Rising Pune Supergiant,,6,GJ Maxwell,,,Holkar Cricket Stadium,,field,Indore,KN Ananthapadmanabhan,,Kings XI Punjab,AK Chaudhary,4,,6.0,1082594
0,,Indian Premier League,2017-04-08,male,Navdeep Singh,Royal Challengers Bangalore,Royal Challengers Bangalore,VK Sharma,J Srinath,2017,Delhi Daredevils,15.0,6,KM Jadhav,,,M.Chinnaswamy Stadium,,bat,Bengaluru,A Nand Kishore,,Royal Challengers Bangalore,S Ravi,5,,,1082595


In [11]:
# export file
df.to_csv('matches_updated_mens_ipl.csv', index = False)

## Deliveries.csv

In [12]:
def populate_bowling_team(match_id, batting_team, matches_df):
    
    team1 = matches_df[matches_df.matchId == match_id]['team1'].item()
    team2 = matches_df[matches_df.matchId == match_id]['team2'].item()
    
    if batting_team == team1:
        return team2
    else:
        return team1

In [14]:
# use any of the following column names
deliveries_cols = ['ball', 'inning', 'over_ball', 'batting_team', 'batsman', 'non_striker', 'bowler', 'batsman_runs',
               'extras', 'isWide', 'isNoBall', 'Byes', 'LegByes', 'Penalty', 'dismissal_kind', 'player_dismissed']

all_deliveries_df = pd.DataFrame(columns=deliveries_cols)

for i in range(matches_count):
    # fetch match id
    match_id = matches_list[i].split('\\')[1].split('.')[0]
    
    # 22, 21, 20, 19, 18
    match_path = matches_list[i]
    
    # iterate through each match
    temp_df = pd.read_csv(match_path, usecols=['version'])
    temp_df['info_ball'] = temp_df.index
    no_of_rows_to_skip = temp_df.info_ball.value_counts()['info'] + 1
    
    try:
        deliveries_temp_df = pd.read_csv(match_path, skiprows=no_of_rows_to_skip, header=None)
    except:
        print(f"Error: {matches_list[i]}")
        
    deliveries_temp_df.columns = deliveries_cols
    deliveries_temp_df['matchId'] = match_id
    all_deliveries_df = pd.concat([all_deliveries_df, deliveries_temp_df], axis = 0)

In [15]:
# populate over_no & ball_no
all_deliveries_df['over'] = all_deliveries_df['over_ball'].apply(lambda x: str(x).split('.')[0])
all_deliveries_df['ball'] = all_deliveries_df['over_ball'].apply(lambda x: str(x).split('.')[1])

# fixing the order of all columns
all_deliveries_df =all_deliveries_df[['matchId', 'inning', 'over_ball', 'over', 'ball', 'batting_team', 'batsman', 'non_striker',
       'bowler', 'batsman_runs', 'extras', 'isWide', 'isNoBall', 'Byes',
       'LegByes', 'Penalty', 'dismissal_kind', 'player_dismissed']]

In [16]:
# create a df with only matchId, batting_team, bowling_team
new_df = all_deliveries_df[['matchId', 'batting_team']]

In [17]:
# remove duplicates
new_df = new_df.drop_duplicates()

In [18]:
# populate bowling team using the above function
new_df['bowling_team'] = new_df.apply(lambda x: populate_bowling_team(x['matchId'], x['batting_team'], df), axis = 1)

In [19]:
new_df.head()

Unnamed: 0,matchId,batting_team,bowling_team
0,1082591,Sunrisers Hyderabad,Royal Challengers Bangalore
125,1082591,Royal Challengers Bangalore,Sunrisers Hyderabad
0,1082592,Mumbai Indians,Rising Pune Supergiant
125,1082592,Rising Pune Supergiant,Mumbai Indians
0,1082593,Gujarat Lions,Kolkata Knight Riders


In [20]:
# merge new_df only bowling team
all_deliveries_df = all_deliveries_df.merge(new_df, on = ['matchId','batting_team'], how = 'left')

# rearrage column names
all_deliveries_df = all_deliveries_df[['matchId', 'inning', 'over_ball', 'over', 'ball', 
                                       'batting_team', 'bowling_team' , 
                                       'batsman', 'non_striker', 'bowler', 
                                       'batsman_runs', 'extras', 'isWide',
                                       'isNoBall', 'Byes', 'LegByes', 'Penalty', 'dismissal_kind',
                                       'player_dismissed']]

In [21]:
all_deliveries_df.head()

Unnamed: 0,matchId,inning,over_ball,over,ball,batting_team,bowling_team,batsman,non_striker,bowler,batsman_runs,extras,isWide,isNoBall,Byes,LegByes,Penalty,dismissal_kind,player_dismissed
0,1082591,1,0.1,0,1,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,0,0,,,,,,,
1,1082591,1,0.2,0,2,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,0,0,,,,,,,
2,1082591,1,0.3,0,3,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,4,0,,,,,,,
3,1082591,1,0.4,0,4,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,0,0,,,,,,,
4,1082591,1,0.5,0,5,Sunrisers Hyderabad,Royal Challengers Bangalore,DA Warner,S Dhawan,TS Mills,0,2,2.0,,,,,,


In [22]:
# export file
all_deliveries_df.to_csv('deliveries_updated_mens_ipl.csv', index = False)