In [0]:
# Display the SparkSession object. This cell runs before the explicit builder call
# further down, so it presumably relies on a session pre-defined by the notebook
# runtime — TODO confirm it does not raise NameError on a fresh kernel.
spark

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, BooleanType, DateType, DecimalType
from pyspark.sql.functions import col, to_date, length, row_number
from pyspark.sql import Window


In [0]:
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook; getOrCreate() hands back an
# already-active session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("IPL Data Analysis")
    .getOrCreate()
)

In [0]:
# Display the session returned by getOrCreate() in the previous cell.
spark

In [0]:
# Load the raw ball-by-ball CSV using its header row for column names.
# No schema is supplied, so every column comes back as a string (see printSchema).
ball_by_ball_df_raw = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("s3://ipl-data-analysis-project/Ball_By_Ball.csv")
)
ball_by_ball_df_raw.printSchema()

root
 |-- MatcH_id: string (nullable = true)
 |-- Over_id: string (nullable = true)
 |-- Ball_id: string (nullable = true)
 |-- Innings_No: string (nullable = true)
 |-- Team_Batting: string (nullable = true)
 |-- Team_Bowling: string (nullable = true)
 |-- Striker_Batting_Position: string (nullable = true)
 |-- Extra_Type: string (nullable = true)
 |-- Runs_Scored: string (nullable = true)
 |-- Extra_runs: string (nullable = true)
 |-- Wides: string (nullable = true)
 |-- Legbyes: string (nullable = true)
 |-- Byes: string (nullable = true)
 |-- Noballs: string (nullable = true)
 |-- Penalty: string (nullable = true)
 |-- Bowler_Extras: string (nullable = true)
 |-- Out_type: string (nullable = true)
 |-- Caught: string (nullable = true)
 |-- Bowled: string (nullable = true)
 |-- Run_out: string (nullable = true)
 |-- LBW: string (nullable = true)
 |-- Retired_hurt: string (nullable = true)
 |-- Stumped: string (nullable = true)
 |-- caught_and_bowled: string (nullable = true)
 |-- hi

In [0]:
# Conversion plan for the ball-by-ball data, applied in this exact order.
# Each entry is (source column, target): a Spark data type means "cast to it",
# a string is a date pattern handed to to_date(). The output column is always
# the lower-cased source name.
# NOTE(review): the converted column taking the place of the raw one relies on
# Spark resolving column names case-insensitively — confirm
# spark.sql.caseSensitive is left at its default.
ball_by_ball_conversions = [
    ("MatcH_id", IntegerType()),
    ("Over_id", IntegerType()),
    ("Ball_id", IntegerType()),
    ("Innings_No", IntegerType()),
    ("Team_Batting", StringType()),
    ("Team_Bowling", StringType()),
    ("Striker_Batting_Position", IntegerType()),
    ("Extra_Type", StringType()),
    ("Runs_Scored", IntegerType()),
    ("Extra_runs", IntegerType()),
    ("Wides", IntegerType()),
    ("Legbyes", IntegerType()),
    ("Byes", IntegerType()),
    ("Noballs", IntegerType()),
    ("Penalty", IntegerType()),
    ("Bowler_Extras", IntegerType()),
    ("Out_type", StringType()),
    ("Caught", BooleanType()),
    ("Bowled", BooleanType()),
    ("Run_out", BooleanType()),
    ("LBW", BooleanType()),
    ("Retired_hurt", BooleanType()),
    ("Stumped", BooleanType()),
    ("caught_and_bowled", BooleanType()),
    ("hit_wicket", BooleanType()),
    ("ObstructingFeild", BooleanType()),
    ("Bowler_Wicket", BooleanType()),
    ("match_date", "M/d/yyyy"),
    ("Season", IntegerType()),
    ("Striker", IntegerType()),
    ("Non_Striker", IntegerType()),
    ("Bowler", IntegerType()),
    ("Player_Out", IntegerType()),
    ("Fielders", IntegerType()),
    ("Striker_match_SK", IntegerType()),
    ("StrikerSK", IntegerType()),
    ("NonStriker_match_SK", IntegerType()),
    ("NONStriker_SK", IntegerType()),
    ("Fielder_match_SK", IntegerType()),
    ("Fielder_SK", IntegerType()),
    ("Bowler_match_SK", IntegerType()),
    ("BOWLER_SK", IntegerType()),
    ("PlayerOut_match_SK", IntegerType()),
    ("BattingTeam_SK", IntegerType()),
    ("BowlingTeam_SK", IntegerType()),
    ("Keeper_Catch", BooleanType()),
    ("Player_out_sk", IntegerType()),
    ("MatchDateSK", "yyyyMMdd"),
]

# Start from the raw frame and fold one withColumn per plan entry onto it.
ball_by_ball_df = ball_by_ball_df_raw
for source_name, target in ball_by_ball_conversions:
    if isinstance(target, str):
        # String targets are date patterns.
        converted = to_date(col(source_name), target)
    else:
        converted = col(source_name).cast(target)
    ball_by_ball_df = ball_by_ball_df.withColumn(source_name.lower(), converted)

In [0]:
# Print a single converted row as a quick sanity check of the casts.
ball_by_ball_df.show(n=1)

+--------+-------+-------+----------+------------+------------+------------------------+----------+-----------+----------+-----+-------+----+-------+-------+-------------+--------------+------+------+-------+-----+------------+-------+-----------------+----------+----------------+-------------+----------+------+-------+-----------+------+----------+--------+----------------+---------+-------------------+-------------+----------------+----------+---------------+---------+------------------+--------------+--------------+------------+-------------+-----------+
|match_id|over_id|ball_id|innings_no|team_batting|team_bowling|striker_batting_position|extra_type|runs_scored|extra_runs|wides|legbyes|byes|noballs|penalty|bowler_extras|      out_type|caught|bowled|run_out|  lbw|retired_hurt|stumped|caught_and_bowled|hit_wicket|obstructingfeild|bowler_wicket|match_date|season|striker|non_striker|bowler|player_out|fielders|striker_match_sk|strikersk|nonstriker_match_sk|nonstriker_sk|fielder_match_s

In [0]:
# Load the raw match CSV using its header row for column names.
# No schema is supplied, so every column comes back as a string (see printSchema).
match_df_raw = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("s3://ipl-data-analysis-project/Match.csv")
)
match_df_raw.printSchema()

root
 |-- Match_SK: string (nullable = true)
 |-- match_id: string (nullable = true)
 |-- Team1: string (nullable = true)
 |-- Team2: string (nullable = true)
 |-- match_date: string (nullable = true)
 |-- Season_Year: string (nullable = true)
 |-- Venue_Name: string (nullable = true)
 |-- City_Name: string (nullable = true)
 |-- Country_Name: string (nullable = true)
 |-- Toss_Winner: string (nullable = true)
 |-- match_winner: string (nullable = true)
 |-- Toss_Name: string (nullable = true)
 |-- Win_Type: string (nullable = true)
 |-- Outcome_Type: string (nullable = true)
 |-- ManOfMach: string (nullable = true)
 |-- Win_Margin: string (nullable = true)
 |-- Country_id: string (nullable = true)



In [0]:
# Convert the key, year and margin columns to integers and parse match_date
# (month/day/year text) into a date. Columns not listed here are left as read.
match_df = (
    match_df_raw
    .withColumn("match_sk", col("Match_SK").cast("int"))
    .withColumn("match_id", col("match_id").cast("int"))
    .withColumn("match_date", to_date("match_date", "M/d/yyyy"))
    .withColumn("season_year", col("season_year").cast("int"))
    .withColumn("win_margin", col("win_margin").cast("int"))
    .withColumn("country_id", col("country_id").cast("int"))
)


In [0]:
# Print a single converted row as a quick sanity check of the casts.
match_df.show(n=1)

+--------+--------+--------------------+--------------------+----------+-----------+--------------------+---------+------------+--------------------+--------------------+---------+--------+------------+-----------+----------+----------+
|match_sk|match_id|               Team1|               Team2|match_date|season_year|          Venue_Name|City_Name|Country_Name|         Toss_Winner|        match_winner|Toss_Name|Win_Type|Outcome_Type|  ManOfMach|win_margin|country_id|
+--------+--------+--------------------+--------------------+----------+-----------+--------------------+---------+------------+--------------------+--------------------+---------+--------+------------+-----------+----------+----------+
|       0|  335987|Royal Challengers...|Kolkata Knight Ri...|2008-04-18|       2008|M Chinnaswamy Sta...|Bangalore|       India|Royal Challengers...|Kolkata Knight Ri...|    field|    runs|      Result|BB McCullum|       140|         1|
+--------+--------+--------------------+------------

In [0]:
# Load the raw player CSV using its header row for column names.
# NOTE(review): the dateFormat option looks ineffective here — no schema is
# supplied and DOB still comes back as a string (see printSchema); confirm
# before relying on it.
player_df_raw = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("dateFormat", "M/d/yyyy")
    .load("s3://ipl-data-analysis-project/Player.csv")
)
player_df_raw.printSchema()

root
 |-- PLAYER_SK: string (nullable = true)
 |-- Player_Id: string (nullable = true)
 |-- Player_Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Batting_hand: string (nullable = true)
 |-- Bowling_skill: string (nullable = true)
 |-- Country_Name: string (nullable = true)



In [0]:
# Convert the two player keys to integers and parse dob (month/day/year text)
# into a date. The remaining columns are left as read.
player_df = (
    player_df_raw
    .withColumn("player_sk", col("player_sk").cast("int"))
    .withColumn("player_id", col("player_id").cast("int"))
    .withColumn("dob", to_date("dob", "M/d/yyyy"))
)

In [0]:
# Print a single converted row as a quick sanity check of the casts.
player_df.show(n=1)

+---------+---------+-----------+----------+-------------+----------------+------------+
|player_sk|player_id|Player_Name|       dob| Batting_hand|   Bowling_skill|Country_Name|
+---------+---------+-----------+----------+-------------+----------------+------------+
|        0|        1| SC Ganguly|1972-07-08|Left-hand bat|Right-arm medium|       India|
+---------+---------+-----------+----------+-------------+----------------+------------+
only showing top 1 row



In [0]:
# Load the raw player-per-match CSV using its header row for column names.
# No schema is supplied, so every column comes back as a string (see printSchema).
player_match_df_raw = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("s3://ipl-data-analysis-project/Player_match.csv")
)
player_match_df_raw.printSchema()

root
 |-- Player_match_SK: string (nullable = true)
 |-- PlayerMatch_key: string (nullable = true)
 |-- Match_Id: string (nullable = true)
 |-- Player_Id: string (nullable = true)
 |-- Player_Name: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Batting_hand: string (nullable = true)
 |-- Bowling_skill: string (nullable = true)
 |-- Country_Name: string (nullable = true)
 |-- Role_Desc: string (nullable = true)
 |-- Player_team: string (nullable = true)
 |-- Opposit_Team: string (nullable = true)
 |-- Season_year: string (nullable = true)
 |-- is_manofThematch: string (nullable = true)
 |-- Age_As_on_match: string (nullable = true)
 |-- IsPlayers_Team_won: string (nullable = true)
 |-- Batting_Status: string (nullable = true)
 |-- Bowling_Status: string (nullable = true)
 |-- Player_Captain: string (nullable = true)
 |-- Opposit_captain: string (nullable = true)
 |-- Player_keeper: string (nullable = true)
 |-- Opposit_keeper: string (nullable = true)



In [0]:
# Convert the player-per-match keys, year/age and flag columns to proper types
# and parse dob (month/day/year text) into a date. Columns not listed here are
# left as read.
player_match_df = (
    player_match_df_raw
    .withColumn("player_match_sk", col("player_match_sk").cast(IntegerType()))
    # A bare DecimalType() means decimal(10, 0), so a key with more than 10
    # digits cannot be represented and the cast yields null (or raises under
    # ANSI mode). Use the maximum precision with scale 0 so whole-number keys
    # of any realistic length survive.
    # NOTE(review): match ids are already 6 digits in this data; presumably
    # the composite key is longer than 10 — confirm against the CSV.
    .withColumn("playermatch_key", col("playermatch_key").cast(DecimalType(38, 0)))
    .withColumn("match_id", col("match_id").cast(IntegerType()))
    .withColumn("player_id", col("player_id").cast(IntegerType()))
    .withColumn("dob", to_date("dob", "M/d/yyyy"))
    .withColumn("season_year", col("season_year").cast(IntegerType()))
    .withColumn("age_as_on_match", col("age_as_on_match").cast(IntegerType()))
    .withColumn("is_manofthematch", col("is_manofthematch").cast(BooleanType()))
    .withColumn("isplayers_team_won", col("isplayers_team_won").cast(BooleanType()))
)


In [0]:
# Print the first five converted rows as a quick sanity check of the casts.
player_match_df.show(n=5)

+---------------+---------------+--------+---------+-----------+----------+--------------+--------------------+------------+---------+--------------------+--------------------+-----------+----------------+---------------+------------------+--------------+--------------+--------------+---------------+-------------+--------------+
|player_match_sk|playermatch_key|match_id|player_id|Player_Name|       dob|  Batting_hand|       Bowling_skill|Country_Name|Role_Desc|         Player_team|        Opposit_Team|season_year|is_manofthematch|age_as_on_match|isplayers_team_won|Batting_Status|Bowling_Status|Player_Captain|Opposit_captain|Player_keeper|Opposit_keeper|
+---------------+---------------+--------+---------+-----------+----------+--------------+--------------------+------------+---------+--------------------+--------------------+-----------+----------------+---------------+------------------+--------------+--------------+--------------+---------------+-------------+--------------+
|      

In [0]:
# Load the raw team CSV using its header row for column names.
# No schema is supplied, so every column comes back as a string (see printSchema).
team_df_raw = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("s3://ipl-data-analysis-project/Team.csv")
)
team_df_raw.printSchema()

root
 |-- Team_SK: string (nullable = true)
 |-- Team_Id: string (nullable = true)
 |-- Team_Name: string (nullable = true)



In [0]:
# Convert the two team keys to integers; Team_Name is left as read.
team_df = (
    team_df_raw
    .withColumn("team_sk", col("team_sk").cast("int"))
    .withColumn("team_id", col("team_id").cast("int"))
)

In [0]:
# Print the first five converted rows as a quick sanity check of the casts.
team_df.show(n=5)

+-------+-------+--------------------+
|team_sk|team_id|           Team_Name|
+-------+-------+--------------------+
|      0|      1|Kolkata Knight Ri...|
|      1|      2|Royal Challengers...|
|      2|      3| Chennai Super Kings|
|      3|      4|     Kings XI Punjab|
|      4|      5|    Rajasthan Royals|
+-------+-------+--------------------+
only showing top 5 rows



In [0]:
# Number of bowler extras by each bowling team per season

# Attach the team name to every delivery via the bowling side's team id.
joined_df = ball_by_ball_df.join(team_df, ball_by_ball_df["team_bowling"] == team_df["team_id"], "inner")
# Total the bowler extras for each (team, season) pair.
grouped_df = joined_df.groupBy("team_name", "season").sum("bowler_extras").withColumnRenamed("sum(bowler_extras)", "total_bowler_extras")
# Seasons ascending, highest total first within a season.
# show() only prints and returns None, so keep the DataFrame in sorted_df and
# display it in a separate step; otherwise sorted_df would be None.
sorted_df = grouped_df.orderBy(grouped_df.season.asc(), grouped_df.total_bowler_extras.desc())
sorted_df.show()


+--------------------+------+-------------------+
|           team_name|season|total_bowler_extras|
+--------------------+------+-------------------+
| Chennai Super Kings|  2008|                 91|
|Kolkata Knight Ri...|  2008|                 91|
|     Kings XI Punjab|  2008|                 90|
|Royal Challengers...|  2008|                 89|
|      Mumbai Indians|  2008|                 84|
|     Deccan Chargers|  2008|                 82|
|    Rajasthan Royals|  2008|                 82|
|    Delhi Daredevils|  2008|                 69|
|Kolkata Knight Ri...|  2009|                 94|
|    Delhi Daredevils|  2009|                 92|
|      Mumbai Indians|  2009|                 88|
|    Rajasthan Royals|  2009|                 82|
|Royal Challengers...|  2009|                 77|
|     Deccan Chargers|  2009|                 73|
|     Kings XI Punjab|  2009|                 61|
| Chennai Super Kings|  2009|                 34|
|     Kings XI Punjab|  2010|                107|


In [0]:
# Number of extra runs by each batting team per season

# Attach the team name to every delivery via the batting side's team id.
joined_df = ball_by_ball_df.join(team_df, ball_by_ball_df["team_batting"] == team_df["team_id"], "inner")
# Total the extra runs for each (team, season) pair.
grouped_df = joined_df.groupBy("team_name", "season").sum("extra_runs").withColumnRenamed("sum(extra_runs)", "total_extra_runs")
# Seasons ascending, highest total first within a season.
# show() only prints and returns None, so keep the DataFrame in sorted_df and
# display it in a separate step; otherwise sorted_df would be None.
sorted_df = grouped_df.orderBy(grouped_df.season.asc(), grouped_df.total_extra_runs.desc())
sorted_df.show()

+--------------------+------+----------------+
|           team_name|season|total_extra_runs|
+--------------------+------+----------------+
|      Mumbai Indians|  2008|             183|
|    Rajasthan Royals|  2008|             177|
| Chennai Super Kings|  2008|             155|
|Kolkata Knight Ri...|  2008|             141|
|     Kings XI Punjab|  2008|             124|
|    Delhi Daredevils|  2008|             118|
|Royal Challengers...|  2008|             118|
|     Deccan Chargers|  2008|             112|
|     Kings XI Punjab|  2009|             160|
| Chennai Super Kings|  2009|             150|
|      Mumbai Indians|  2009|             135|
|     Deccan Chargers|  2009|             125|
|Royal Challengers...|  2009|             114|
|    Delhi Daredevils|  2009|             110|
|    Rajasthan Royals|  2009|              93|
|Kolkata Knight Ri...|  2009|              90|
|      Mumbai Indians|  2010|             194|
|Kolkata Knight Ri...|  2010|             166|
| Chennai Sup

In [0]:
# Number of extras by the top 5 bowlers each season

# Total the bowler extras for each (bowler, season) pair.
grouped_df = ball_by_ball_df.groupBy("bowler", "season").sum("bowler_extras").withColumnRenamed("sum(bowler_extras)", "total_bowler_extras")
# Resolve the bowler id to the player's name and country.
joined_df = grouped_df.join(player_df, grouped_df["bowler"] == player_df["player_id"], "inner")
select_df = joined_df.select("player_name", "country_name", "season", "total_bowler_extras")
# Rank within each season, highest total first. row_number() orders tied rows
# arbitrarily, so break ties on the player name to make the top 5 reproducible.
window_spec = Window.partitionBy("season").orderBy(
    select_df.total_bowler_extras.desc(),
    select_df.player_name.asc(),
)
ranked_df = select_df.withColumn("rank", row_number().over(window_spec))
# show() only prints and returns None, so keep the DataFrame in top_bowlers_df
# and display it separately. The explicit sort fixes the display order, which
# the window alone does not guarantee.
top_bowlers_df = ranked_df.filter(ranked_df.rank <= 5).orderBy("season", "rank")
top_bowlers_df.show()



+--------------+------------+------+-------------------+----+
|   player_name|country_name|season|total_bowler_extras|rank|
+--------------+------------+------+-------------------+----+
|   S Sreesanth|       India|  2008|                 34|   1|
|     JA Morkel|South Africa|  2008|                 26|   2|
|M Muralitharan|   Sri Lanka|  2008|                 26|   3|
|      DW Steyn|South Africa|  2008|                 24|   4|
|      I Sharma|       India|  2008|                 21|   5|
|    SL Malinga|   Sri Lanka|  2009|                 31|   1|
|      RP Singh|       India|  2009|                 24|   2|
|     DP Nannes|   Australia|  2009|                 23|   3|
|       A Nehra|       India|  2009|                 20|   4|
|     JH Kallis|South Africa|  2009|                 19|   5|
|      DW Steyn|South Africa|  2010|                 29|   1|
|       SW Tait|   Australia|  2010|                 28|   2|
|    SL Malinga|   Sri Lanka|  2010|                 24|   3|
|       

In [0]:
# Number of sixes by the top 5 batters each season

# Keep only the deliveries on which the striker scored six runs.
filter_sixes_df = ball_by_ball_df.filter(col("runs_scored") == 6)
# Count those deliveries for each (striker, season) pair.
grouped_df = filter_sixes_df.groupBy("striker", "season").count().withColumnRenamed("count", "total_sixes")
# Resolve the striker id to the player's name and country.
joined_df = grouped_df.join(player_df, grouped_df["striker"] == player_df["player_id"], "inner")
select_df = joined_df.select("player_name", "country_name", "season", "total_sixes")
# Rank within each season, most sixes first. row_number() orders tied rows
# arbitrarily, so break ties on the player name to make the top 5 reproducible.
window_spec = Window.partitionBy("season").orderBy(
    select_df.total_sixes.desc(),
    select_df.player_name.asc(),
)
ranked_df = select_df.withColumn("rank", row_number().over(window_spec))
# show() only prints and returns None, so keep the DataFrame in top_batters
# and display it separately. The explicit sort fixes the display order, which
# the window alone does not guarantee.
top_batters = ranked_df.filter(ranked_df.rank <= 5).orderBy("season", "rank")
top_batters.show()


+-------------+------------+------+-----------+----+
|  player_name|country_name|season|total_sixes|rank|
+-------------+------------+------+-----------+----+
|ST Jayasuriya|   Sri Lanka|  2008|         31|   1|
|     SE Marsh|   Australia|  2008|         26|   2|
|    YK Pathan|       India|  2008|         25|   3|
|     V Sehwag|       India|  2008|         21|   4|
| AC Gilchrist|   Australia|  2008|         19|   5|
| AC Gilchrist|   Australia|  2009|         29|   1|
|    ML Hayden|   Australia|  2009|         22|   2|
|     SK Raina|       India|  2009|         21|   3|
|    RG Sharma|       India|  2009|         18|   4|
| Yuvraj Singh|       India|  2009|         16|   5|
|   RV Uthappa|       India|  2010|         27|   1|
|      M Vijay|       India|  2010|         26|   2|
|    YK Pathan|       India|  2010|         24|   3|
|     SK Raina|       India|  2010|         23|   4|
|    SS Tiwary|       India|  2010|         18|   5|
|     CH Gayle| West Indies|  2011|         44