## Dependencies

In [None]:
import glob
import os
from datetime import datetime
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import yaml

## Function Name - Write_To_CSV

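This function writes the two output dataframes to CSV files: the ball-by-ball data to `Delivery_Info.csv` and the match-level data to `Match_Info.csv`, both under the given output path.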

In [None]:
def Write_To_CSV(Delivery_DF, Match_DF, path):
    """Write the delivery-level and match-level frames to CSV files.

    The index of each frame is written as the first CSV column under the
    header ``Match_ID``.

    Parameters
    ----------
    Delivery_DF : pandas.DataFrame
        Ball-by-ball rows. A ``Match_ID`` column, if present, is dropped so
        that it does not duplicate the index.
    Match_DF : pandas.DataFrame
        Match-level rows.
    path : str
        Output directory. ``Delivery_Info.csv`` and ``Match_Info.csv`` are
        created inside it; the directory is created when it does not exist.
        An empty string writes to the current working directory.

    Returns
    -------
    None
    """
    # errors='ignore': the frame may carry the match ID only in its index,
    # in which case an unconditional drop raises KeyError.
    # rename_axis returns a new frame, so the caller's objects are not mutated.
    deliveries = (
        Delivery_DF
        .drop(columns=['Match_ID'], errors='ignore')
        .rename_axis('Match_ID')
    )
    matches = Match_DF.rename_axis('Match_ID')

    if path:
        os.makedirs(path, exist_ok=True)

    # os.path.join inserts the separator when `path` has no trailing slash;
    # plain string concatenation would write next to the directory instead.
    deliveries.to_csv(os.path.join(path, 'Delivery_Info.csv'))
    matches.to_csv(os.path.join(path, 'Match_Info.csv'))

## Function Name - Yaml_Reader

This function finds all the YAML files in the input directory, reads them one at a time, and parses each file into pandas dataframes. After all files have been parsed, it returns two dataframes: one with the ball-by-ball delivery data and one with the match-level information.

In [None]:
def Yaml_Reader(dir_path):
    """Parse every ``*.yaml`` file in a directory into two DataFrames.

    Files are processed in sorted path order and numbered from 1; that number
    is passed to ``Data_Extractor`` and ``Match_Info`` as the match ID. A file
    that fails YAML parsing is reported on stdout and skipped without
    consuming an ID.

    Parameters
    ----------
    dir_path : str
        Directory containing the YAML files (with or without a trailing
        separator).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The concatenated ``Data_Extractor`` results and the concatenated
        ``Match_Info`` results. Both are empty frames when no file was parsed.
    """
    delivery_frames = []
    match_frames = []

    # sorted(): glob returns paths in arbitrary order, which would make the
    # ID assigned to a given file differ between runs or machines.
    path_list = sorted(glob.glob(os.path.join(dir_path, "*.yaml")))
    ID = 1
    for path in path_list:
        with open(path, 'r') as file_object:
            try:
                print (ID,'------>',path)
                data = yaml.safe_load(file_object)
            except yaml.YAMLError as exc:
                print (exc)
                continue

        # The two extractors are called directly: the previous single-worker
        # thread pool awaited each result immediately (so it ran serially)
        # and was never closed.
        delivery_frames.append(Data_Extractor(data, ID))
        match_frames.append(Match_Info(data, ID))
        ID = ID + 1

    # Concatenate once at the end instead of re-copying the accumulated frame
    # on every iteration; pd.concat rejects an empty list, hence the guards.
    CSV_Data_Deliveries = pd.concat(delivery_frames) if delivery_frames else pd.DataFrame()
    CSV_Match_Info = pd.concat(match_frames) if match_frames else pd.DataFrame()

    return CSV_Data_Deliveries, CSV_Match_Info

## Function Name - Match_Info

This function takes two inputs and returns a pandas dataframe: a dictionary containing the parsed match data, and the ID to assign to the match. The returned dataframe holds a single row of match-level information indexed by that ID.

In [None]:
def Match_Info(data, ID):
    """Build a one-row DataFrame of match-level information.

    Parameters
    ----------
    data : dict
        Parsed match file. Reads ``data['info']`` and, inside it, the keys
        ``dates``, ``teams``, ``toss``, ``outcome`` and ``venue`` (required)
        plus ``player_of_match`` and ``umpires`` (optional).
    ID : int
        Match identifier, used as the index label of the returned row.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``ID``. Fields that are absent from the
        input are ``None``.

    Raises
    ------
    KeyError
        If one of the required keys is missing.
    """
    info = data['info']
    outcome = info['outcome']

    # The first date may already be a date object or still be an ISO string.
    date_data = info['dates'][0]
    if isinstance(date_data, str):
        date_data = datetime.strptime(date_data, '%Y-%m-%d').date()

    # The margin sits under 'by' when present; otherwise look in the outcome
    # itself, where 'runs'/'wickets' are simply absent.
    winning_data = outcome['by'] if 'by' in outcome else outcome

    # A list with several names cannot be broadcast into a single-row frame,
    # so join it into one string (a one-name list yields that name).
    man_of_the_match = info.get('player_of_match')
    if isinstance(man_of_the_match, (list, tuple)):
        man_of_the_match = ', '.join(str(p) for p in man_of_the_match) or None

    # Pad to three slots so that 0, 1 or 2 umpires never index out of range.
    umpires = list(info.get('umpires') or [])
    umpire1, umpire2, umpire3 = (umpires + [None, None, None])[:3]

    Match_Data = {'Season' : date_data.year,
                  'Date' : date_data.strftime("%m/%d/%y"),
                  'Team1' : info['teams'][0],
                  'Team2' : info['teams'][1],
                  'Toss_Winner' : info['toss']['winner'],
                  'Toss_Decision' : info['toss']['decision'],
                  'Result' : outcome.get('result', 'Normal'),
                  'DL_Method' : 1 if 'method' in outcome else 0,
                  'Winner' : outcome.get('winner'),
                  'Win_By_Runs' : winning_data.get('runs'),
                  'Win_By_Wicket' : winning_data.get('wickets'),
                  'Man_Of_The_Match' : man_of_the_match,
                  'Venue' : info['venue'],
                  'Umpire1' : umpire1,
                  'Umpire2' : umpire2,
                  'Umpire3' : umpire3,
                 }

    return pd.DataFrame(Match_Data, index=[ID])

## Function Name - Data_Extractor

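This function takes a dictionary containing the parsed match data and the ID to assign to the match. It then parses the ball-by-ball data and returns a pandas dataframe with one row per delivery, each indexed by that ID.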

In [None]:
def Data_Extractor(data, ID):
    """Flatten the ball-by-ball data of one match into a DataFrame.

    Parameters
    ----------
    data : dict
        Parsed match file. Reads ``data['info']['teams']`` and
        ``data['innings']``; each innings entry is a one-key mapping whose
        value holds ``team`` and ``deliveries``, and each delivery is a
        one-key mapping from an ``over.ball`` label to the delivery details.
    ID : int
        Match identifier, used as the index label of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per delivery, all indexed by ``ID``; an empty frame when the
        match contains no deliveries.
    """
    innings_list = data['innings']
    teams = list(data['info']['teams'])
    # NOTE(review): the flag is set on every delivery of a four-innings match,
    # not only on the extra innings. Original behaviour kept; confirm intent.
    super_over = 1 if len(innings_list) == 4 else 0

    records = []
    for innings_index, innings in enumerate(innings_list, start=1):
        innings_name = next(iter(innings))
        innings_body = innings[innings_name]
        batting_team = innings_body['team']
        # The bowling side is whichever listed team is not batting.
        bowling_team = next((team for team in teams if team != batting_team), None)

        for delivery_entry in innings_body['deliveries']:
            over_data = next(iter(delivery_entry))
            delivery = delivery_entry[over_data]

            # Split the label textually. Float arithmetic is unsafe here:
            # 1.2 - 1 evaluates to 0.19999999999999996, which would report
            # ball 1 instead of ball 2.
            # NOTE(review): a label such as 0.10 loaded as a float is
            # indistinguishable from 0.1; that cannot be recovered here.
            over_text, _, ball_text = str(over_data).partition('.')
            over = int(over_text) + 1
            ball = int(ball_text) if ball_text else 0

            runs = delivery['runs']
            extras = delivery.get('extras') or {}
            # NOTE(review): the source YAML appears to name these extras
            # 'byes', 'legbyes', 'noballs' and 'penalty' - confirm against
            # the data. The original '*_runs' spellings are still honoured.
            bye_runs = extras.get('bye_runs', extras.get('byes', 0))
            legbye_runs = extras.get('legbye_runs', extras.get('legbyes', 0))
            noball_runs = extras.get('noball_runs', extras.get('noballs', 0))
            penalty_runs = extras.get('penalty_runs', extras.get('penalty', 0))

            wicket_data = delivery.get('wicket')
            if wicket_data is not None:
                player_dismissed = wicket_data['player_out']
                dismissal_type = wicket_data['kind']
                # Record only the first fielder, whether one or several are listed.
                fielders = wicket_data.get('fielders') or []
                fielder = fielders[0] if fielders else None
            else:
                player_dismissed = None
                dismissal_type = None
                fielder = None

            records.append({'Innings' : innings_index,
                            'Batting_Team' : batting_team,
                            'Bowling_Team' : bowling_team,
                            'Over' : over,
                            'Ball' : ball,
                            'Batsman' : delivery['batsman'],
                            'Non_Striker' : delivery['non_striker'],
                            'Bowler' : delivery['bowler'],
                            'Is_Super_Over' : super_over,
                            'Wide_Runs' : extras.get('wides', 0),
                            'Bye_Runs' : bye_runs,
                            'Legbye_Runs' : legbye_runs,
                            'Noball_Runs' : noball_runs,
                            'Penalty_Runs' : penalty_runs,
                            'Batsman_Runs' : runs['batsman'],
                            'Extras' : runs['extras'],
                            'Total_Runs' : runs['total'],
                            'Player_Dismissed' : player_dismissed,
                            'Dismissal_Type' : dismissal_type,
                            'Fielder' : fielder
                           })

    if not records:
        return pd.DataFrame()

    # Build the frame once; growing it with pd.concat per delivery re-copies
    # every earlier row on each iteration.
    return pd.DataFrame(records, index=[ID] * len(records))
        

In [1]:
# Directory containing the YAML files to parse (edit for your machine).
INPUT_DIR = '/Users/swayam/Documents/Projects/ipl/'
# Directory where Delivery_Info.csv and Match_Info.csv are saved. Keep the
# trailing separator so the files land inside this directory rather than
# next to it with the directory name glued onto the file name.
OUTPUT_DIR = '/Users/swayam/Documents/Projects/Data_Mining/Data/'

CSV_Delivery_Info, CSV_Match_Info = Yaml_Reader(INPUT_DIR)
Write_To_CSV(CSV_Delivery_Info, CSV_Match_Info, OUTPUT_DIR)

NameError: name 'Yaml_Reader' is not defined

In [None]:
# Display the ball-by-ball frame returned by Yaml_Reader.
CSV_Delivery_Info

In [None]:
# Summary statistics for the ball-by-ball frame.
CSV_Delivery_Info.describe()

In [None]:
# Summary statistics for the match-level frame.
CSV_Match_Info.describe()