# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

!wget https://dlcdn.apache.org/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
!tar -xvzf spark-3.3.0-bin-hadoop3.tgz
!pip install findspark

import os
os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
config = SparkConf().setAppName("IPL Analysis")
sc = SparkContext(conf=config).getOrCreate()

from IPython.display import clear_output 
clear_output()

print(sc.appName ,"is Running..")

IPL Analysis is Running..


# Data Exploration

In [2]:
from operator import add

In [3]:
r1 = sc.textFile("/content/drive/MyDrive/Colab Notebooks/SparkData/IPL Matches.csv")

In [4]:
r1.first() # Column names

'id,city,date,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,method,umpire1,umpire2'

In [5]:
r2 = r1.subtract(sc.parallelize(r1.take(1))) # Removing Header row

In [6]:
r2.count()

816

In [7]:
r2.take(10)

['335984,Delhi,2008-04-19,MF Maharoof,Feroz Shah Kotla,0,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9,N,NA,Aleem Dar,GA Pratapkumar',
 '335998,Delhi,2008-04-30,GD McGrath,Feroz Shah Kotla,0,Delhi Daredevils,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Delhi Daredevils,runs,10,N,NA,Aleem Dar,I Shivram',
 '336005,Jaipur,2008-05-04,Sohail Tanvir,Sawai Mansingh Stadium,0,Rajasthan Royals,Chennai Super Kings,Chennai Super Kings,bat,Rajasthan Royals,wickets,8,N,NA,Asad Rauf,AV Jayaprakash',
 '336008,Mumbai,2008-05-07,A Nehra,Dr DY Patil Sports Academy,0,Mumbai Indians,Rajasthan Royals,Mumbai Indians,field,Mumbai Indians,wickets,7,N,NA,DJ Harper,RE Koertzen',
 '336017,Kolkata,2008-05-13,Shoaib Akhtar,Eden Gardens,0,Kolkata Knight Riders,Delhi Daredevils,Kolkata Knight Riders,bat,Kolkata Knight Riders,runs,23,N,NA,Asad Rauf,IL Howell',
 '336024,Hyderabad,2008-05-18,DJ Bravo,"Rajiv Gandhi International Stadium, Uppal",0,Deccan Chargers,Mumba

# Counting the number of matches played in each city.

In [8]:
r3=r2.map(lambda row:row.split(",")[1])
r3=r3.map(lambda city:(city,1))
r3=r3.reduceByKey(add)
r3=r3.sortBy(lambda x: x[1], ascending = False)

r3.collect()

[('Mumbai', 101),
 ('Kolkata', 77),
 ('Delhi', 74),
 ('Bangalore', 65),
 ('Hyderabad', 64),
 ('Chennai', 57),
 ('Chandigarh', 56),
 ('Jaipur', 47),
 ('Pune', 38),
 ('Abu Dhabi', 29),
 ('Dubai', 26),
 ('Bengaluru', 15),
 ('Durban', 15),
 ('Visakhapatnam', 13),
 ('NA', 13),
 ('Centurion', 12),
 ('Sharjah', 12),
 ('Ahmedabad', 12),
 ('Rajkot', 10),
 ('Dharamsala', 9),
 ('Indore', 9),
 ('Johannesburg', 8),
 ('Port Elizabeth', 7),
 ('Cuttack', 7),
 ('Cape Town', 7),
 ('Ranchi', 7),
 ('Raipur', 6),
 ('Kochi', 5),
 ('Kanpur', 4),
 ('Nagpur', 3),
 ('Kimberley', 3),
 ('East London', 3),
 ('Bloemfontein', 2)]

# Counting the number of matches won by each team in 2014.

In [13]:
r4=r2.filter(lambda row: '2014' in row.split(",")[2])
r4=r4.filter(lambda row:row.split(",")[10] not in ['field','bat'])
r4=r4.map(lambda row:row.split(",")[10])
r4=r4.map(lambda team:(team,1))
r4=r4.reduceByKey(add)
r4=r4.sortBy(lambda x: x[1], ascending = False)

r4.collect()

[('Chennai Super Kings', 10),
 ('Kolkata Knight Riders', 10),
 ('Kings XI Punjab', 9),
 ('Royal Challengers Bangalore', 5),
 ('Rajasthan Royals', 5),
 ('Sunrisers Hyderabad', 4),
 ('Mumbai Indians', 4),
 ('Delhi Daredevils', 2)]

# Finding the Player who won the most MOM award.

In [10]:
r5=r2.map(lambda row:row.split(",")[3])
r5=r5.map(lambda city:(city,1))
r5=r5.reduceByKey(add)
r5=r5.sortBy(lambda x: x[1], ascending = False)

r5.first()

('AB de Villiers', 23)

# Find the top 10 umpires who was present in most of the matches.

In [11]:
r61=r2.map(lambda row:row.split(",")[15])
r62=r2.map(lambda row:row.split(",")[16])
r61=r61.filter(lambda umpire:umpire!='NA')
r62=r62.filter(lambda umpire:umpire!='NA')
r61=r61.map(lambda umpire:(umpire,1))
r62=r62.map(lambda umpire:(umpire,1))
r6=r61.union(r62)
r6=r6.reduceByKey(add)
r6=r6.sortBy(lambda x: x[1], ascending = False)

r6.take(10)

[('S Ravi', 105),
 ('HDPK Dharmasena', 90),
 ('AK Chaudhary', 74),
 ('C Shamshuddin', 73),
 ('M Erasmus', 59),
 ('Nitin Menon', 52),
 ('Asad Rauf', 51),
 ('CB Gaffaney', 45),
 ('CK Nandan', 41),
 ('BR Doctrove', 41)]

# Count of teams who won the Match after winning the Toss.

In [12]:
r7=r2.filter(lambda row: row.split(",")[8]==row.split(",")[10])
r7=r7.map(lambda row:row.split(",")[8])
r7=r7.map(lambda team:(team,1))
r7=r7.reduceByKey(add)
r7=r7.sortBy(lambda x: x[1], ascending = False)

r7.collect()

[('Mumbai Indians', 54),
 ('Kolkata Knight Riders', 51),
 ('Royal Challengers Bangalore', 39),
 ('Chennai Super Kings', 37),
 ('Rajasthan Royals', 36),
 ('Delhi Daredevils', 31),
 ('Sunrisers Hyderabad', 19),
 ('Kings XI Punjab', 18),
 ('Deccan Chargers', 17),
 ('Delhi Capitals', 10),
 ('Gujarat Lions', 8),
 ('Rising Pune Supergiant', 5),
 ('Kochi Tuskers Kerala', 4),
 ('Pune Warriors', 2),
 ('Rising Pune Supergiants', 2)]