In [None]:
import os
import random
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Notebook-wide presentation settings: FiveThirtyEight matplotlib theme, and
# every warning category silenced for the rest of the session.
# NOTE(review): the blanket "ignore" also hides deprecation warnings from
# pandas/seaborn — consider narrowing it to specific categories.
plt.style.use("fivethirtyeight")
warnings.simplefilter("ignore")

In [None]:
# Folder that holds the IPL CSV exports. The original absolute path is kept as
# the default so existing setups keep working; set the IPL_DATA_DIR environment
# variable to run the notebook on another machine without editing code.
DATA_DIR = Path(os.environ.get("IPL_DATA_DIR", "/home/venkyuser/ipl"))

# One row per delivery.
ball_by_ball = pd.read_csv(DATA_DIR / "IPL Ball-by-Ball 2008-2020.csv")

In [None]:
# Preview the first rows of the ball-by-ball table (head() defaults to 5 rows).
ball_by_ball.head()

In [None]:
# Preview the last rows to check the file was read through to the end.
ball_by_ball.tail()

In [None]:
# Column names, dtypes and non-null counts.
ball_by_ball.info()

In [None]:
# Summary statistics (describe() covers the numeric columns by default).
ball_by_ball.describe()

In [None]:
# (number of rows, number of columns)
ball_by_ball.shape

In [None]:
# Report the table dimensions in a sentence.
n_rows, n_cols = ball_by_ball.shape
print(f"This Dataset have {n_rows} rows and {n_cols} columns.")

In [None]:
# Total number of missing cells across every column of the ball-by-ball table.
num_null = ball_by_ball.isna().sum().sum()
print(f"No of Null Values: {num_null}")

**Plots**

Plot missing values

In [None]:
# One horizontal bar per column, filled with the proportion of null (True) vs
# non-null (False) cells.
#
# sns.displot is a figure-level function: it always creates its own figure, so
# the plt.figure(...) call that used to precede it only produced an extra blank
# canvas in the output. The plot size is governed by displot's height/aspect.
sns.displot(
    data=ball_by_ball.isna().melt(value_name="Null Values"),
    y="variable",
    hue="Null Values",
    multiple="fill",
    aspect=3,
    palette='PuBuGn_r',
)
# The pyplot calls below act on the axes displot just created (the current axes).
plt.title('Non-Missing Values in Ball by Ball data', weight = 'bold', size = 20, color = 'brown')
plt.xlabel(" ")
plt.ylabel(" ")
plt.xticks(size = 12, weight = 'bold', color = 'maroon')
plt.yticks(size = 12, weight = 'bold', color = 'maroon');

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# The frame also holds text columns (e.g. `batsman`, `bowler`); since pandas 2.0
# DataFrame.corr() no longer drops those silently and raises instead, so the
# numeric/bool columns are selected explicitly. This works on older and newer
# pandas alike.
corelation = ball_by_ball.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Large annotated heatmap of the ball-by-ball correlation matrix; every row and
# column is labelled with its column name.
corr_labels = corelation.columns
plt.figure(figsize=(18, 18))
sns.heatmap(
    corelation,
    annot=True,
    xticklabels=corr_labels,
    yticklabels=corr_labels,
)

In [None]:
# Same correlation heatmap at the default figure size and without cell annotations.
corr_labels = corelation.columns
sns.heatmap(corelation, xticklabels=corr_labels, yticklabels=corr_labels)

In [None]:
# Re-display the head as a column reference for the aggregation that follows.
ball_by_ball.head()

**Most Runs Scored**

In [None]:
# Sum of `total_runs` for each `id`, returned as a two-column frame.
# `id` appears to be the match identifier (it is later joined to `matches` on `id`).
most_runs = ball_by_ball.groupby('id', as_index=False)['total_runs'].sum()

In [None]:
# Preview the per-id run totals.
most_runs.head()

In [None]:
# Per-id totals ordered from the lowest to the highest run count
# (ascending is sort_values' default direction).
asc_most_runs = most_runs.sort_values("total_runs")
asc_most_runs

In [None]:
# Per-id totals ordered from the highest to the lowest run count.
desc_most_runs = most_runs.sort_values(by=["total_runs"], ascending=[False])
desc_most_runs

In [None]:
# Pairwise relationships between the columns of the per-id totals (`id`, `total_runs`).
sns.pairplot(asc_most_runs)

**Matches**

In [None]:
# Folder that holds the IPL CSV exports. The original absolute path is kept as
# the default so existing setups keep working; set the IPL_DATA_DIR environment
# variable to run the notebook on another machine without editing code.
DATA_DIR = Path(os.environ.get("IPL_DATA_DIR", "/home/venkyuser/ipl"))

# One row per match.
matches = pd.read_csv(DATA_DIR / "IPL Matches 2008-2020.csv")

In [None]:
# Preview the first rows of the matches table.
matches.head()

In [None]:
# Preview the last rows of the matches table.
matches.tail()

In [None]:
# Derive a `season` column holding the calendar year of each match date.
# Note: this adds the column to `matches` in place.
match_dates = pd.DatetimeIndex(matches['date'])
matches['season'] = match_dates.year

In [None]:
# Confirm the new `season` column is present.
matches.head()

In [None]:
# Missing-value count for each column of `matches`.
matches.isnull().sum()

In [None]:
# Total number of missing cells across the whole matches table.
num_matches_null = matches.isna().sum().sum()
print(f"Matches have {num_matches_null} null values.")

In [None]:
# The null counts above were read as showing `method` to be the column with the
# most missing values.
# NOTE(review): this keeps only the rows where `method` IS recorded and discards
# every row where it is null. If the goal was to remove the mostly-empty column
# instead, `matches.drop(columns=['method'])` is probably what was meant — confirm.
cd = matches.loc[matches['method'].notna()]

In [None]:
# Preview the rows that remain after filtering on `method`.
cd.head()

In [None]:
# Summary statistics of the filtered matches (numeric columns by default).
cd.describe()

In [None]:
# Dtypes and non-null counts of the filtered matches.
cd.info()

In [None]:
# Correlation matrix of the filtered matches, drawn as a large annotated heatmap.
# `cd` contains text columns (e.g. `toss_winner`, `winner`); since pandas 2.0
# DataFrame.corr() raises on those instead of dropping them, so the numeric/bool
# columns are selected explicitly.
m_corr = cd.select_dtypes(include=["number", "bool"]).corr()
plt.figure(figsize=(18, 18))
sns.heatmap(m_corr, xticklabels=m_corr.columns, yticklabels=m_corr.columns, annot=True)

In [None]:
# Same correlation matrix at the default figure size, without annotations.
# Numeric/bool columns are selected explicitly because DataFrame.corr() raises
# on text columns (e.g. `toss_winner`, `winner`) from pandas 2.0 onwards.
m_corr = cd.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(m_corr, xticklabels=m_corr.columns, yticklabels=m_corr.columns)

In [None]:
# One horizontal bar per column of `matches`, filled with the proportion of
# null (True) vs non-null (False) cells.
#
# sns.displot is a figure-level function: it always creates its own figure, so
# the plt.figure(...) call that used to precede it only produced an extra blank
# canvas in the output. The plot size is governed by displot's height/aspect.
sns.displot(
    data=matches.isna().melt(value_name="Null Values"),
    y="variable",
    hue="Null Values",
    palette="PuBuGn_r",
    multiple="fill",
    aspect=3,
)
# The pyplot calls below act on the axes displot just created (the current axes).
plt.title('Non-Missing Values in Matches data', weight = 'bold', size = 20, color = 'brown')
plt.xlabel(" ")
plt.ylabel(" ")
plt.xticks(size = 12, weight = 'bold', color = 'maroon')
plt.yticks(size = 12, weight = 'bold', color = 'maroon');

In [None]:
# Attach the match metadata to each per-id run total. A left join keeps every
# row of the totals even when `matches` has no row with the same `id`.
runs_per_id = desc_most_runs.loc[:, ['id', 'total_runs']]
merged = runs_per_id.merge(matches, on='id', how='left')

In [None]:
# Preview the joined frame (run totals plus match columns).
merged.head()

In [None]:
# Runs summed over every match of a season: one row per `season`.
season_most_runs = merged.groupby('season', as_index=False)['total_runs'].sum()
season_most_runs

In [None]:
# Pick the single row with the highest run total and report that row's season.
# DataFrame.max() is column-wise, so the previous max()['total_runs'] /
# max()['season'] pairing always reported the latest season in the data,
# whichever season actually scored the most runs.
top_season_row = season_most_runs.loc[season_most_runs['total_runs'].idxmax()]
print("Most runs {} is in the season {}.".format(top_season_row['total_runs'], top_season_row['season']))

In [None]:
# Seasons ordered from the fewest to the most runs (ascending is the default).
season_most_runs.sort_values('total_runs')

In [None]:
# Seasons ordered from the most to the fewest runs.
season_most_runs.sort_values(by=['total_runs'], ascending=[False])

In [None]:
# Pairwise relationships between `season` and `total_runs`.
sns.pairplot(season_most_runs)

In [None]:
# Scatter of season totals, one point per season, coloured by season.
sns.relplot(
    data=season_most_runs,
    x='season',
    y='total_runs',
    hue='season',
)

In [None]:
# Re-display the joined frame for reference.
merged.head()

In [None]:
# Re-display the ball-by-ball columns before the per-batsman aggregation.
ball_by_ball.head()

In [None]:
# Sum of `total_runs` over every delivery faced by each batsman.
# NOTE(review): `total_runs` presumably includes extras as well as runs off the
# bat; if the data has a bat-only runs column, that may be the fairer measure —
# confirm against the dataset schema.
most_runs_by_batsman = ball_by_ball.groupby('batsman', as_index=False)['total_runs'].sum()
most_runs_by_batsman

In [None]:
# Batsmen ranked from the highest to the lowest run total.
sort_most_runs_by_batsman = most_runs_by_batsman.sort_values(by=['total_runs'], ascending=[False])
sort_most_runs_by_batsman

In [None]:
# Take both values from the same row: the one with the highest run total.
# DataFrame.max() is column-wise, so the previous max()['batsman'] returned the
# alphabetically last batsman name, unrelated to who scored `max_runs`.
top_scorer_row = sort_most_runs_by_batsman.loc[sort_most_runs_by_batsman['total_runs'].idxmax()]
max_runs = top_scorer_row['total_runs']
max_player = top_scorer_row['batsman']

In [None]:
# Report the leading run total and the player it is attributed to.
print("Most runs of {} is by {}.".format(max_runs, max_player))

In [None]:
# Ranked batsmen, leaving out anyone whose run total is exactly zero.
has_scored = sort_most_runs_by_batsman['total_runs'].ne(0)
sort_most_runs_by_batsman.loc[has_scored]

In [None]:
# The ten highest run totals (the frame is already sorted descending).
sort_most_runs_by_batsman.head(10)

In [None]:
# Promote `batsman` to the index so the bar chart's ticks show player names.
# The call mutates the frame in place, after which `batsman` is no longer a
# column; the guard makes re-running this cell a no-op instead of a KeyError.
if 'batsman' in sort_most_runs_by_batsman.columns:
    sort_most_runs_by_batsman.set_index('batsman', inplace=True)

In [None]:
# First twenty rows of the descending ranking.
top_batsman_20 = sort_most_runs_by_batsman.head(20)

In [None]:
# Bar chart of the twenty leading run totals, one bar per index entry.
top_batsman_20.plot.bar()

In [None]:
# We have seen that the most runs were scored by Virat Kohli.

In [None]:
# Re-display the matches columns before the toss analysis.
matches.head()

In [None]:
# Team that won the toss, one entry per match.
matches['toss_winner']

In [None]:
# Team recorded as the match winner, one entry per match.
matches['winner']

In [None]:
# Boolean per match: True when the toss winner is also the recorded match winner.
# A missing `winner` compares unequal, so such a match is counted as False.
winner_toss = matches['toss_winner'].eq(matches['winner'])

In [None]:
# How many matches fall in each bucket (True = toss winner also won the match).
winner_toss.value_counts()

In [None]:
# Quick pie chart of the True/False split.
winner_toss.value_counts().plot.pie()

In [None]:
import warnings

In [None]:
# Labelled pie chart of how often the toss winner went on to win the match.
# The two counts are derived from `winner_toss` rather than typed in by hand,
# so the chart stays correct if the underlying data changes.
toss_winner_won = int(winner_toss.sum())
# Everything else: the toss winner did not win, or no winner was recorded
# (a missing `winner` never equals `toss_winner`).
toss_winner_not_won = int(len(winner_toss) - toss_winner_won)

winner_losers = {
    "Matches wins (Toss Wins)": toss_winner_won,
    "Matches Loss (Toss Loss)": toss_winner_not_won,
}
winners_losers = pd.Series(winner_losers)
plt.figure(figsize=(9, 9))
plt.pie(x=winners_losers, labels=winners_losers.index, autopct="%.2f%%");

In [None]:
# So from the pie chart we can see that a team that wins the toss goes on to win the match only slightly more often than not.

In [None]:
# Horizontal count plot: number of matches recorded against each `winner` value.
plt.figure(figsize=(7, 10))
wins_ax = sns.countplot(data=matches, y="winner", palette="copper")
wins_ax.set_title("Most Matches won by Teams", fontsize=12)
wins_ax.set_xlabel("No of Matches")
wins_ax.set_ylabel("Teams")

In [None]:
# As you can see, the most matches were won by KKR.

In [None]:
# Re-display the matches columns for reference.
matches.head()

**Palette Colors:**


Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'crest', 'crest_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'flare', 'flare_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'icefire', 'icefire_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magma', 'magma_r', 'mako', 'mako_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'rocket', 'rocket_r', 'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'turbo', 'turbo_r', 'twilight', 'twilight_r', 'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'vlag', 'vlag_r', 'winter', 'winter_r'

In [None]:
# Horizontal count plot: number of matches in each `season`.
plt.figure(figsize=(7, 10))
season_ax = sns.countplot(data=matches, y="season", palette="YlGnBu")
season_ax.set_title("Most matches played in season", fontsize=12)
season_ax.set_xlabel("Matches")
season_ax.set_ylabel("Season")

In [None]:
# As you can see, the most matches were played in the 2013 season.

In [None]:
# Re-display the matches columns before the per-city breakdown.
matches.head()

In [None]:
# Horizontal count plot: number of matches hosted in each `city`.
plt.figure(figsize=(6, 11))
sns.countplot(y="city", data=matches, palette="Greens_r")
plt.title("Most matches played in city", fontsize=12)
# `city` is mapped to the y-axis, so the counts run along x. The two labels
# were previously swapped.
plt.xlabel("Matches")
plt.ylabel("City")

In [None]:
# As you can see, the most matches were played in Mumbai.

In [None]:
# Re-display the matches columns before the player-of-the-match tally.
matches.head()

In [None]:
# Tally of `player_of_match` awards per player; value_counts() returns them
# most-frequent first, so the first ten entries are the ten largest tallies.
man_of_match_players = matches['player_of_match'].value_counts()
man_of_match_players.head(10).plot.bar()

In [None]:
# As you can see, AB has been Man of the Match the most times.

In [None]:
# Re-display the ball-by-ball columns before the bowler tally.
ball_by_ball.head()

In [None]:
# Number of rows (deliveries) recorded against each bowler, most-frequent first;
# the bar chart shows the twenty largest tallies.
most_ball_bowled_by_bowler = ball_by_ball['bowler'].value_counts()
most_ball_bowled_by_bowler.head(20).plot.bar()

In [None]:
# As you can see, the most balls were bowled by Harbhajan Singh.

EDA By Muhammad Hanan Asghar