## Step 1. Importing Libraries

In [None]:
import numpy as np 
import pandas as pd
import glob 

## Step 2. Combining all csv files into one except "all_matches.csv"

- `glob` finds files whose paths match a given pattern or name.
- `glob.glob("My_Folder/*.txt")` -> finds all files inside "My_Folder" whose names end with ".txt".

In [None]:
# Load the ball-by-ball data from a single CSV file in the working directory.
df = pd.read_csv("all_matches.csv")
# Preview the first two rows.
df.head(2)

In [None]:
# Keep only the deliveries whose 'ball' value is below 6.0.
# .copy() gives final_df its own data: later cells rename teams and add
# columns on final_df, and doing that on a filtered slice of df is what
# triggers pandas' SettingWithCopyWarning.
final_df = df.loc[df['ball'] < 6.0].copy()
final_df.head(5)

## Step 3. Data Preprocessing

#### Step 3.(a). Checking the data type of each column

In [None]:
# Show the dtype pandas assigned to every column of final_df.
final_df.dtypes

#### Step 3.(b). Checking whether there are any null (NaN) values

In [None]:
# Count the missing (NaN) entries in each column.
final_df.isnull().sum()

#### Step 3.(c). Checking team names and replacing old names with the latest ones (only if both names of a team are present in the dataset).

- **Rising Pune Supergiant** is the same team as **Rising Pune Supergiants**.
- **Deccan Chargers** is now named **Sunrisers Hyderabad**.
- **Delhi Daredevils** is now named **Delhi Capitals**.

In [None]:
# List the distinct values found in the batting_team column.
final_df['batting_team'].unique()

In [None]:
# Map each superseded team name onto the name used in the rest of the notebook.
# The mapping is applied to every column of the frame, so batting_team and
# bowling_team are renamed together.
team_renames = {
    "Rising Pune Supergiant": "Rising Pune Supergiants",
    "Deccan Chargers": "Sunrisers Hyderabad",
    "Delhi Daredevils": "Delhi Capitals",
}

# replace() returns a new frame; rebinding final_df avoids mutating a slice
# in place, so there is no pandas warning left to silence.
final_df = final_df.replace(team_renames)

#### Step 3.(d). Adding a column **'tot_score_on_that_ball'** that contains the total runs scored on that ball (runs off the bat + extras, if any)

In [None]:
# tot_score_on_that_ball: runs off the bat plus extras for the delivery.
# wickets: 1 when player_dismissed is filled in for the delivery, 0 otherwise.
# assign() returns a new frame, so no column is written into a slice of df
# and re-running the cell gives the same result.
final_df = final_df.assign(
    tot_score_on_that_ball=final_df['runs_off_bat'] + final_df['extras'],
    wickets=final_df['player_dismissed'].notna().astype(int),
)
final_df.head(3)

#### Step 3.(e). Finding the 6-over score of every team in every match

In [None]:
def mergeDf(df_tuple):
    """Join a team's per-match score, extras and match details into one frame.

    Parameters
    ----------
    df_tuple : tuple
        Three DataFrames, in this order: the match-level rows (must contain
        'match_id', 'venue', 'batting_team', 'bowling_team' and 'innings'),
        the per-match score frame and the per-match extras frame.

    Returns
    -------
    pandas.DataFrame
        Score columns, then extras columns, then the five match-detail columns.
    """
    matches, scores, extras = df_tuple
    match_detail_columns = ['match_id', 'venue', 'batting_team', 'bowling_team', 'innings']

    # No join key is passed, so pandas joins on the columns the frames share.
    scores_with_extras = scores.merge(extras)
    return scores_with_extras.merge(matches[match_detail_columns])


In [None]:
def creatingDf(batting_team_name):
    """Build the per-match summary frame for one batting team.

    Reads the notebook-level ``final_df`` and passes three frames to
    ``mergeDf``: one row of match details per match, the per-match sum of
    'tot_score_on_that_ball' and the per-match sum of 'extras'.

    Parameters
    ----------
    batting_team_name : str
        Value compared against ``final_df['batting_team']``.

    Returns
    -------
    pandas.DataFrame
        The result of ``mergeDf`` for the three frames described above.
    """
    # deliveries on which this team was batting
    bat_team_balls_df = final_df.loc[final_df['batting_team'] == batting_team_name]

    # NOTE(review): rows are grouped by match_id alone; if a team can bat in
    # more than one innings of the same match, those runs are summed together
    # - confirm against the data.
    balls_by_match = bat_team_balls_df.groupby('match_id')

    # total score of the team in each match (tot_score_in_6_over = run_by_bat + extras)
    bat_team_6_over_score_df = (
        balls_by_match['tot_score_on_that_ball']
        .sum()
        .to_frame(name='tot_score_in_6_over')
        .reset_index()
    )

    # total extra runs of the team in each match (wides, noballs etc.)
    bat_team_6_over_extras_df = (
        balls_by_match['extras']
        .sum()
        .to_frame(name='tot_extras_in_6_over')
        .reset_index()
    )

    # Keep one row per match for the match details. This returns a new frame
    # instead of dropping rows in place on a filtered slice of final_df.
    bat_team_matches_df = bat_team_balls_df.drop_duplicates(subset=["match_id"])

    return mergeDf((bat_team_matches_df, bat_team_6_over_score_df, bat_team_6_over_extras_df))

# tot_score_in_6_over = run_by_bat + extras

In [None]:
# One summary frame per team, built by creatingDf from final_df.
# Each string is compared with final_df['batting_team'], so it has to be the
# name left after the renames in Step 3.(c) (e.g. 'Delhi Capitals').
csk_complete_df = creatingDf('Chennai Super Kings')
kkr_complete_df = creatingDf('Kolkata Knight Riders')
rcb_complete_df = creatingDf('Royal Challengers Bangalore')
mi_complete_df = creatingDf('Mumbai Indians')
rr_complete_df = creatingDf('Rajasthan Royals')
dc_complete_df = creatingDf('Delhi Capitals')
sh_complete_df = creatingDf('Sunrisers Hyderabad')
gl_complete_df = creatingDf('Gujarat Lions')
kxp_complete_df = creatingDf('Kings XI Punjab')
pw_complete_df = creatingDf('Pune Warriors')
ktk_complete_df = creatingDf('Kochi Tuskers Kerala')
rps_complete_df = creatingDf('Rising Pune Supergiants')

In [None]:
# Preview the first four rows of the Mumbai Indians summary frame.
mi_complete_df.head(4)

#### Step 3.(f). Finding strikers in the first six overs for each batting team

In [None]:
# "batsmen": keys = match_id, values = set of strikers in the first 6 overs of that match
# "bowlers": keys = match_id, values = set of bowlers in the first 6 overs of that match
# NOTE(review): the key is match_id alone, so each set mixes the players of
# every innings in the match (both teams) - confirm this is intended.

# One grouped pass per column instead of a Python-level loop over every row.
# sort=False keeps the match ids in order of first appearance in final_df.
players_by_match = final_df.groupby('match_id', sort=False)
batsmen = players_by_match['striker'].agg(set).to_dict()
bowlers = players_by_match['bowler'].agg(set).to_dict()

In [None]:
# creating a dataframe of batsmen and bowlers so that we can merge it with each individual team dataframe

matches_id = list(batsmen)
batsmen_name = [batsmen[m_id] for m_id in matches_id]
# Look the bowlers up by match id so every set is paired with its own match,
# rather than relying on both dictionaries iterating in the same order.
bowlers_name = [bowlers[m_id] for m_id in matches_id]

# Building the frames column by column keeps match_id numeric; a two-row
# frame that is then transposed would store it as object dtype.
striker_batsman_in_6_overs_df = pd.DataFrame({
    "match_id": matches_id,
    "striker_batsmen_name": batsmen_name,
})

bowler_in_6_overs_df = pd.DataFrame({
    "match_id": matches_id,
    "bowler_name": bowlers_name,
})

# merging batsmen and bowlers dataframe together
bats_bowl_df_in_6_overs = striker_batsman_in_6_overs_df.merge(bowler_in_6_overs_df, on="match_id")

In [None]:
# adding batsmen and bowlers name in the dataframe of each team

def _add_players(team_df):
    """Return team_df with the per-match striker and bowler sets joined on match_id.

    A frame that already carries every player column is returned unchanged,
    so re-running this cell does not append suffixed duplicate columns.
    """
    player_columns = bats_bowl_df_in_6_overs.columns.drop("match_id")
    if player_columns.isin(team_df.columns).all():
        return team_df
    return team_df.merge(bats_bowl_df_in_6_overs, on="match_id", how='inner')


csk_complete_df = _add_players(csk_complete_df)
kkr_complete_df = _add_players(kkr_complete_df)
rcb_complete_df = _add_players(rcb_complete_df)
mi_complete_df = _add_players(mi_complete_df)
rr_complete_df = _add_players(rr_complete_df)
dc_complete_df = _add_players(dc_complete_df)
sh_complete_df = _add_players(sh_complete_df)
gl_complete_df = _add_players(gl_complete_df)
kxp_complete_df = _add_players(kxp_complete_df)
pw_complete_df = _add_players(pw_complete_df)
ktk_complete_df = _add_players(ktk_complete_df)
rps_complete_df = _add_players(rps_complete_df)

In [None]:
# Preview the first three rows of one team's frame to see what the final dataset looks like.
rcb_complete_df.head(3)

## Step 4. Training the Model