In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style as style

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the ball-by-ball CSV from the Kaggle input directory into a DataFrame.
deliveries_csv = "/kaggle/input/ipl-complete-dataset-20082020/IPL Ball-by-Ball 2008-2020.csv"
df = pd.read_csv(deliveries_csv)
# Show the frame so its first and last rows can be inspected.
df

# **Data findings**

This is ball by ball data from 2008 till 2020. Clear indications that this dataset is not sorted are as follows

    In the 1st row, the "over" column is equal to 6. Cricket overs start at 0.
    In the last row, the "inning" column is equal to 1. IPL cricket matches have 2 innings each, and inning 2 is always last.

We need to attend to these before we can go any further.


In [None]:
# Sort by id, then by inning, then by over, and finally by ball.
# Including "ball" puts the deliveries inside each over in order as well;
# without it they stay in whatever order the file happened to list them.
df_sorted = df.sort_values(by=["id", "inning", "over", "ball"])
df_sorted

# **Research questions**

*     Q1 : Who are the top 10 batsmen with the best strike rate among those who have scored more than 1,000 runs?
*     Q2 : How many batsmen have scored 100 runs or more against a single bowler?
*     Q3 : Which batsman has the highest boundary percentage per total runs in the powerplay overs?



# **Q1 : Who are the top 10 batsmen with the best strike rate among those who have scored more than 1,000 runs?**

* Create a copy of the dataset and name it "batsman_strike_rate_df"
* Remove all wides under the "extras_type" column, as wides do not count towards the balls faced by a batsman. The number of balls faced is needed to calculate the strike rate.
* Find the sum of runs scored: Group the dataset by batsman and apply the sum() operation to the "batsman_runs" column
* Find the number of balls faced: Group the dataset by batsman and apply the count() operation to the "batsman_runs" column
* Merge the 2 datasets into one dataset
* Remove all other batsmen so the dataset only contains batsmen that have scored more than 1,000 runs
* Calculate the strike rate of the batsmen using the formula : batsman runs / balls faced * 100 - rounded to 2 decimal places
* Sort by strike rate and keep the top ten batsmen
* Create a scatter plot



In [None]:
# Work on an independent copy of the sorted deliveries, named "batsman_strike_rate_df"
batsman_strike_rate_df = df_sorted.copy(deep=True)

In [None]:
# Wides do not count towards the balls faced by a batsman, so drop every row
# whose "extras_type" is "wides" and record the row count on either side.
rows_before = len(batsman_strike_rate_df)
not_a_wide = batsman_strike_rate_df["extras_type"].ne("wides")
batsman_strike_rate_df = batsman_strike_rate_df.loc[not_a_wide]
rows_after = len(batsman_strike_rate_df)

In [None]:
# Number of rows before the wides were removed
rows_before

In [None]:
# Number of rows left after the wides were removed
rows_after

In [None]:
# Dropped rows: how many deliveries were removed by the wides filter
rows_before - rows_after

In [None]:
# Find the sum of runs scored by each batsman, highest total first.
# "batsman_runs" is selected *before* aggregating so that sum() only touches
# the one numeric column that is needed. Summing the whole frame would also
# try to aggregate the text columns, which newer pandas versions either
# concatenate or reject. The result is a fresh one-column frame rather than
# a slice of a wider one.
batsmen_runs = (
    batsman_strike_rate_df
    .groupby("batsman")[["batsman_runs"]]
    .sum()
    .sort_values(by="batsman_runs", ascending=False)
)
batsmen_runs

In [None]:
# Find the number of balls faced: count the "batsman_runs" entries recorded
# for each batsman (wides were removed earlier), highest count first.
balls_faced = (
    batsman_strike_rate_df
    .groupby("batsman")[["batsman_runs"]]
    .count()
    .sort_values(by="batsman_runs", ascending=False)
)
balls_faced

In [None]:
# Merge the 2 datasets into one dataset.
# assign() returns a new frame, so "batsmen_runs" is left untouched; a plain
# `batsman_strike_rate_df = batsmen_runs` would only alias it, and adding the
# column would then modify both names. The ball counts are aligned to the
# run totals on the shared "batsman" index.
batsman_strike_rate_df = batsmen_runs.assign(balls_faced=balls_faced["batsman_runs"])
batsman_strike_rate_df

In [None]:
# Keep only the batsmen who have scored more than 1,000 runs
over_1000_runs = batsman_strike_rate_df["batsman_runs"].gt(1000)
batsman_strike_rate_df = batsman_strike_rate_df.loc[over_1000_runs]
batsman_strike_rate_df

In [None]:
# Calculate the strike rate of the batsmen using the formula:
# batsman runs / balls faced * 100, rounded to 2 decimal places.
# The frame at this point is a filtered selection, so the column is added
# with assign() (which builds a new frame) instead of writing into the
# selection, which triggers pandas' SettingWithCopyWarning.
batsman_strike_rate_df = batsman_strike_rate_df.assign(
    strike_rate=(
        batsman_strike_rate_df["batsman_runs"]
        / batsman_strike_rate_df["balls_faced"]
        * 100
    ).round(2)
)
batsman_strike_rate_df

In [None]:
# Order by strike rate (best first) and keep the first ten batsmen
by_strike_rate = batsman_strike_rate_df.sort_values(by="strike_rate", ascending=False)
top_ten_batsmen = by_strike_rate.head(10)
top_ten_batsmen

In [None]:
# Scatter plot of strike rate against runs for the top ten batsmen,
# with every point labelled by the batsman it belongs to.
matplotlib.style.use("bmh")
ax = top_ten_batsmen.plot.scatter(x="batsman_runs", y="strike_rate", figsize=(14, 7))

labelled_points = zip(
    top_ten_batsmen.index,
    top_ten_batsmen["batsman_runs"],
    top_ten_batsmen["strike_rate"],
)
for batsman_name, runs_scored, rate in labelled_points:
    ax.annotate(batsman_name, (runs_scored, rate))


# **Q2 : Which batsmen have scored 100 runs or more against a single bowler?**

* Create a copy of the dataset and name it "batsman_v_bowler_df"
* Include only columns needed for Q2
* Create a new column named "bastman_v_bowler"
* Create a pivot table to find how many runs each batsman scored against each bowler
* Keep the batsman-bowler pairings with 100 runs or more

In [None]:
# Work on an independent copy of the sorted deliveries, named "batsman_v_bowler_df"
batsman_v_bowler_df = df_sorted.copy(deep=True)

In [None]:
# Keep just the three columns Q2 needs
q2_columns = ["batsman", "bowler", "batsman_runs"]
batsman_v_bowler_df = batsman_v_bowler_df[q2_columns]
batsman_v_bowler_df

In [None]:
# Create a new column named "bastman_v_bowler" holding "<batsman>_v_<bowler>".
# The column is added with assign(), which returns a new frame; writing it
# straight into the column selection made above triggers pandas'
# SettingWithCopyWarning. The column name is kept exactly as spelt here
# because the pivot table further down looks it up by this name.
batsman_v_bowler_df = batsman_v_bowler_df.assign(
    bastman_v_bowler=batsman_v_bowler_df["batsman"] + "_v_" + batsman_v_bowler_df["bowler"]
)
batsman_v_bowler_df

In [None]:
# Pivot table of the total runs each batsman has scored against each bowler
# Thanks to www.geeksforgeeks.org for their blog https://www.geeksforgeeks.org/how-to-create-a-pivot-table-in-python-using-pandas/
pivot = batsman_v_bowler_df.pivot_table(
    values=["batsman_runs"],
    index=["bastman_v_bowler"],
    aggfunc="sum",
)
pivot.sort_values(by="batsman_runs", ascending=False)

In [None]:
# Rank the pairings by runs and show those worth 100 runs or more
more_than_100 = pivot.sort_values(by="batsman_runs", ascending=False)
reached_100 = more_than_100["batsman_runs"].ge(100)
more_than_100.loc[reached_100]

In [None]:
# Number of rows with 100 runs or more. Each row is one batsman-v-bowler
# pairing, so this counts pairings rather than distinct batsmen.
int((more_than_100["batsman_runs"] >= 100).sum())

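# **Q2 answer = there are 44 batsman-v-bowler pairings in which the batsman has scored 100 runs or more**

Note that this counts pairings rather than distinct batsmen: a batsman who has scored 100 or more against several bowlers appears once per bowler.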

At the top of the list is Suresh Raina scoring 175 runs off Piyush Chawla.

# **Q3 : Which batsman has the highest boundary percentage per total runs in the powerplay overs, limited to batsmen with at least 100 powerplay runs?**

* Create a copy of the dataset and name it "powerplay_df"
* Include only columns needed for Q3
* Only include data that contains the powerplay overs i.e. the first six overs of an innings
* Calculate total runs
* Calculate total runs in boundaries
* Create new DataFrame and merge the runs DataFrame and the boundaries DataFrame
* Delete all batsmen that scored less than 100 runs
* Calculate boundary percentage of total runs
* Sort by boundary percentage and keep the top ten batsmen

In [None]:
# Work on an independent copy of the sorted deliveries, named "powerplay_df"
powerplay_df = df_sorted.copy(deep=True)

In [None]:
# Keep just the columns Q3 needs
q3_columns = ["inning", "over", "ball", "batsman", "bowler", "batsman_runs", "total_runs"]
powerplay_df = powerplay_df[q3_columns]
powerplay_df

In [None]:
# Only include data that contains the powerplay overs, i.e. the first six
# overs of an innings. As noted in the data findings, overs are numbered
# from 0, so the first six overs are 0-5; the comparison therefore has to be
# strictly "less than 6" (a "<= 6" test would let a seventh over through).
POWERPLAY_OVERS = 6
ppl_mask = powerplay_df["over"] < POWERPLAY_OVERS
powerplay_df[ppl_mask]

In [None]:
# Calculate total runs per batsman in the powerplay.
# numeric_only=True restricts the aggregation to the numeric columns; the
# frame also holds the text column "bowler", which a plain sum() would try
# to aggregate as well (string concatenation or an error on newer pandas).
runs = powerplay_df[ppl_mask].groupby("batsman").sum(numeric_only=True)
runs

In [None]:
# Calculate total runs scored in boundaries per batsman in the powerplay.
# A delivery is treated as a boundary when the batsman scored exactly 4 or 6
# off it. NOTE(review): this also counts any all-run four - confirm that is
# acceptable for this analysis.
# numeric_only=True keeps sum() away from the text column "bowler".
boundaries_mask = powerplay_df["batsman_runs"].isin([4, 6])
boundaries = (
    powerplay_df[ppl_mask & boundaries_mask]
    .groupby("batsman")
    .sum(numeric_only=True)
)
boundaries

In [None]:
# Create a new DataFrame combining the runs and boundaries DataFrames.
# Both frames are indexed by batsman, so the boundary totals are joined onto
# the run totals by index. A left join keeps batsmen who hit no boundary at
# all (their boundary runs become 0) instead of silently dropping them, and
# the columns are named explicitly rather than relabelled by position.
ppl_df = (
    runs[["batsman_runs"]]
    .rename(columns={"batsman_runs": "total_runs"})
    .join(boundaries["batsman_runs"].rename("boundary_runs"), how="left")
    .fillna({"boundary_runs": 0})
    # The join introduces NaN (and therefore floats) for missing batsmen;
    # restore the same dtype as the run totals once the gaps are filled.
    .astype({"boundary_runs": runs["batsman_runs"].dtype})
)
ppl_df

In [None]:
# Delete all batsmen that scored less than 100 runs.
# Only totals below 100 are removed, so a batsman on exactly 100 is kept
# (hence ">=" rather than ">"). The result is copied so later cells can add
# columns to it without writing into a selection of "ppl_df".
# NOTE: despite its name, "less_than_100_ppl_df" holds the batsmen that
# remain, i.e. those with 100 runs or more; the name is kept because later
# cells refer to it.
at_least_100_mask = ppl_df["total_runs"] >= 100
less_than_100_ppl_df = ppl_df[at_least_100_mask].copy()
less_than_100_ppl_df

In [None]:
# Calculate boundary percentage of total runs, rounded to 2 decimal places.
# The percentage is computed from the filtered frame itself and attached
# with assign(), which returns a new frame; this avoids writing into a
# selection of "ppl_df" (SettingWithCopyWarning) and avoids relying on index
# alignment against the unfiltered frame.
less_than_100_ppl_df = less_than_100_ppl_df.assign(
    **{
        "boundary_%_of_total_runs": (
            less_than_100_ppl_df["boundary_runs"]
            / less_than_100_ppl_df["total_runs"]
            * 100
        ).round(2)
    }
)
less_than_100_ppl_df

In [None]:
# Order by boundary percentage (highest first) and keep the first ten batsmen
by_boundary_pct = less_than_100_ppl_df.sort_values(by="boundary_%_of_total_runs", ascending=False)
top_ten_ppl = by_boundary_pct.head(10)
top_ten_ppl