<a href="https://colab.research.google.com/github/RajuKGosala-45/PySpark-Practice-Journey-With-IPL_Data/blob/main/IPL_Match_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PySpark Setup (for Colab)**

In [1]:
# PySpark Setup (for Colab)
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install -q pyspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PATH"] += os.pathsep + os.path.join(os.environ["JAVA_HOME"], "bin")



# **Day 34 -Advanced Pyspark Practices Using IPL_Match_Data**

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import*
from pyspark.sql.types import*
from pyspark.sql.window import Window

spark=SparkSession.builder.appName("IPL_Match_Data_Analysis").getOrCreate()
matches_df=spark.read.csv("/content/ipl_matches_data.csv", header=True, inferSchema=True)
players_df=spark.read.csv("/content/players-data-updated.csv", header=True, inferSchema=True)
teams_df=spark.read.csv("/content/teams_data.csv", header=True, inferSchema=True)

# Ipl matches Data Info
print("---Matches_Data Schema---")
matches_df.printSchema()
print("---Matches_Data")
matches_df.show()
print("---Matches_Data Describe")
matches_df.describe().show()

# Players Data Info
print("---Players_Data Schema---")
players_df.printSchema()
print("---Players_Data")
players_df.show()
print("---Players_Data Describe")
players_df.describe().show()

# Teams_Data Info
print("---Teams_Data Schema---")
teams_df.printSchema()
print("---Teams_Data")
teams_df.show()
print("---Teams_Data Describe")
teams_df.describe().show()


---Matches_Data Schema---
root
 |-- match_id: integer (nullable = true)
 |-- season_id: integer (nullable = true)
 |-- balls_per_over: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- match_date: string (nullable = true)
 |-- event_name: string (nullable = true)
 |-- match_number: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- match_type: string (nullable = true)
 |-- format: timestamp (nullable = true)
 |-- overs: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- team_type: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- toss_winner: string (nullable = true)
 |-- team1: string (nullable = true)
 |-- team2: string (nullable = true)
 |-- toss_decision: string (nullable = true)
 |-- match_winner: string (nullable = true)
 |-- win_by_runs: string (nullable = true)
 |-- win_by_wickets: string (nullable = true)
 |-- player_of_match: string (nullable = true)
 |-- result: string (nullable = true)
 |-- stage: string (

## 1.Join Datasets in Pyspark

In [None]:
Ipl_data= matches_df.join(teams_df, matches_df["team1"] == teams_df["team_name"], "left")\
                    .join(players_df, matches_df["player_of_match"] == players_df["player_id"], "left")
Ipl_data.show()

+--------+---------+--------------+---------+----------+--------------------+------------+------+----------+-------------------+-----+------+---------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-----------+--------------+---------------+------+-----+-------+--------------------+---------------+--------------------+---------+---------------+--------------+--------------------+--------------------+--------------------+-------------------+--------------------+
|match_id|season_id|balls_per_over|     city|match_date|          event_name|match_number|gender|match_type|             format|overs|season|team_type|               venue|         toss_winner|               team1|               team2|toss_decision|        match_winner|win_by_runs|win_by_wickets|player_of_match|result|stage|team_id|           team_name|team_name_short|           image_url|player_id|    player_name|     bat_style|          bowl_style|        

## 2. Aggregations - Top Winning Teams

In [None]:
team_wins = matches_df.groupBy("match_winner").agg(count("*").alias("total_wins"))\
.orderBy(desc("total_wins"))
team_wins.show(5)

+--------------------+----------+
|        match_winner|total_wins|
+--------------------+----------+
|      Mumbai Indians|       153|
| Chennai Super Kings|       142|
|Kolkata Knight Ri...|       135|
|Royal Challengers...|       133|
| Sunrisers Hyderabad|       125|
+--------------------+----------+
only showing top 5 rows



## 3.Player Analysis – Most Player of the Match Awards

In [None]:
top_players=players_df.join(matches_df, players_df["player_id"] == matches_df["player_of_match"],"inner")\
                      .groupby("player_name").agg(count("*").alias("MOM_Awards"))\
                      .orderBy(desc("MOM_Awards"))
top_players.show(5)

+--------------+----------+
|   player_name|MOM_Awards|
+--------------+----------+
|AB de Villiers|        25|
|      CH Gayle|        22|
|     RG Sharma|        21|
|       V Kohli|        19|
|     DA Warner|        18|
+--------------+----------+
only showing top 5 rows



## 4.Use Window Functions – Top 3 Players by Avg Runs

In [None]:
window_player = Window.orderBy(desc("mom_awards"))
ranked_players = top_players.withColumn("rank", rank().over(window_player))
ranked_players.filter(col("rank") <= 3).show(truncate=False)

+--------------+----------+----+
|player_name   |MOM_Awards|rank|
+--------------+----------+----+
|AB de Villiers|25        |1   |
|CH Gayle      |22        |2   |
|RG Sharma     |21        |3   |
+--------------+----------+----+



## 5.Team Performance Summary

In [None]:
team_performance = matches_df.groupBy("match_winner").agg(
    count("*").alias("Matches_Won"),
    round(avg("win_by_runs"), 2).alias("Avg_Run_Margin"),
    round(avg("win_by_wickets"), 2).alias("Avg_Wicket_Margin")
).orderBy(desc("Matches_Won"))

team_performance.show(10, truncate=False)

+---------------------------+-----------+--------------+-----------------+
|match_winner               |Matches_Won|Avg_Run_Margin|Avg_Wicket_Margin|
+---------------------------+-----------+--------------+-----------------+
|Mumbai Indians             |153        |34.03         |6.24             |
|Chennai Super Kings        |142        |35.61         |5.93             |
|Kolkata Knight Riders      |135        |33.49         |6.24             |
|Royal Challengers Bangalore|133        |33.03         |6.59             |
|Sunrisers Hyderabad        |125        |26.31         |6.67             |
|Punjab Kings               |123        |24.21         |6.17             |
|Delhi Capitals             |121        |25.62         |6.18             |
|Rajasthan Royals           |116        |30.71         |5.87             |
|Gujarat Titans             |37         |36.47         |6.05             |
|Lucknow Super Giants       |30         |22.1          |5.4              |
+------------------------

# **35.Best Practices of Advanced Joins + Real-World Analysis**

### *1.Find the Top 5 Most Dominant Teams (Based on Win %)*

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark=SparkSession.builder.appName("IPL_Analytics").getOrCreate()
matches_df=spark.read.csv("/content/ipl_matches_data.csv", header=True, inferSchema=True)
players_df=spark.read.csv("/content/players-data-updated.csv", header=True, inferSchema=True)
teams_df=spark.read.csv("/content/teams_data.csv", header=True, inferSchema=True)

ipl_data =matches_df.join(teams_df, matches_df["team1"]== teams_df["team_name"], "left")\
                    .join(players_df, matches_df["player_of_match"]== players_df["player_id"],"left")

team_stats = (
    ipl_data.groupBy("match_winner").agg(
        count("*").alias("wins")
    )
    .join(
        ipl_data.select("team1").union(ipl_data.select("team2"))
              .groupBy("team1").count()
              .withColumnRenamed("team1", "team")
              .withColumnRenamed("count", "total_matches"),
        col("match_winner") == col("team"),
        "inner"
    )
    .withColumn("win_percentage", round((col("wins") / col("total_matches")) * 100, 2))
    .orderBy(col("win_percentage").desc())
)

team_stats.show(5)

+--------------------+----+--------------------+-------------+--------------+
|        match_winner|wins|                team|total_matches|win_percentage|
+--------------------+----+--------------------+-------------+--------------+
|      Gujarat Titans|  37|      Gujarat Titans|           60|         61.67|
| Chennai Super Kings| 142| Chennai Super Kings|          252|         56.35|
|      Mumbai Indians| 153|      Mumbai Indians|          277|         55.23|
|Lucknow Super Giants|  30|Lucknow Super Giants|           58|         51.72|
|Kolkata Knight Ri...| 135|Kolkata Knight Ri...|          265|         50.94|
+--------------------+----+--------------------+-------------+--------------+
only showing top 5 rows



### 2.Most Toss Winners vs Actual Match Winners(Correlation Check)

In [None]:
toss_vs_match =(
    ipl_data.withColumn("toss_match_same",
                        (col("toss_winner") == col("match_winner"))
                        .cast("int"))
)
toss_vs_match.groupBy().avg("toss_match_same").show()

+--------------------+
|avg(toss_match_same)|
+--------------------+
|  0.5150732127476314|
+--------------------+



### 3.strongest cities - Most matches Hosted

In [None]:
city_df=(
    ipl_data.groupBy("city").count()
    .orderBy(col("count").desc())
)
city_df.show(10)

+----------+-----+
|      city|count|
+----------+-----+
|    Mumbai|  180|
|   Kolkata|  100|
|     Delhi|   97|
|   Chennai|   91|
| Hyderabad|   83|
| Bangalore|   65|
|    Jaipur|   64|
|Chandigarh|   61|
|      Pune|   51|
|      NULL|   51|
+----------+-----+
only showing top 10 rows



### 4.Top Bowlers Based on Player of Match Awards

In [None]:
bowler_awards = (
    ipl_data.filter((col("bowl_style").isNotNull()) & (col("bowl_style") != ""))
    .groupBy("player_full_name")
    .count()
    .orderBy(col("count").desc())
)
bowler_awards.show(10, truncate=False)



+-------------------------------+-----+
|player_full_name               |count|
+-------------------------------+-----+
|Abraham Benjamin de Villiers   |25   |
|Christopher Henry Gayle        |22   |
|Rohit Gurunath Sharma          |21   |
|Virat Kohli                    |19   |
|David Andrew Warner            |18   |
|Mahendra Singh Dhoni           |18   |
|Sunil Philip Narine            |17   |
|Andre Dwayne Russell           |16   |
|Ravindrasinh Anirudhsinh Jadeja|16   |
|Yusuf Khan Pathan              |16   |
+-------------------------------+-----+
only showing top 10 rows



### 5. Compare Home vs Away Team Performance

In [None]:
home_away=(
    ipl_data.groupBy("match_winner", "city")
    .count()
    .orderBy(col("count").desc())
)
home_away.show(truncate=False)

+---------------------------+----------+-----+
|match_winner               |city      |count|
+---------------------------+----------+-----+
|Mumbai Indians             |Mumbai    |68   |
|Kolkata Knight Riders      |Kolkata   |54   |
|Chennai Super Kings        |Chennai   |51   |
|Sunrisers Hyderabad        |Hyderabad |40   |
|Rajasthan Royals           |Jaipur    |38   |
|Delhi Capitals             |Delhi     |38   |
|Punjab Kings               |Chandigarh|31   |
|Royal Challengers Bangalore|Bangalore |28   |
|Chennai Super Kings        |Mumbai    |18   |
|Rajasthan Royals           |Mumbai    |17   |
|Royal Challengers Bangalore|Bengaluru |17   |
|Royal Challengers Bangalore|Mumbai    |13   |
|Gujarat Titans             |Ahmedabad |13   |
|Sunrisers Hyderabad        |Mumbai    |12   |
|Punjab Kings               |Mumbai    |12   |
|Delhi Capitals             |Mumbai    |11   |
|Sunrisers Hyderabad        |Delhi     |11   |
|Mumbai Indians             |Kolkata   |10   |
|Punjab Kings

# **36.Advanced Window Functions (Ranking + Running Totals + Lead/Lag)**

### 1. Rank Teams by Wins (Dense Rank)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
spark=SparkSession.builder.appName("IPL_Analytics").getOrCreate()
matches_df=spark.read.csv("/content/ipl_matches_data.csv", header=True, inferSchema=True)
players_df=spark.read.csv("/content/players-data-updated.csv", header=True, inferSchema=True)
teams_df=spark.read.csv("/content/teams_data.csv", header=True, inferSchema=True)

ipl_data=matches_df.join(players_df, matches_df["player_of_match"] == players_df["player_id"], "left")\
                   .join(teams_df, matches_df["team1"] == teams_df["team_name"], "left")

win_count=ipl_data.groupBy("match_winner").agg(count("*").alias("wins")
)
WindowSpec=Window.orderBy(win_count.wins.desc())
ranked_teams=win_count.withColumn("rank", dense_rank().over(WindowSpec))
ranked_teams.show(truncate=False)



+---------------------------+----+----+
|match_winner               |wins|rank|
+---------------------------+----+----+
|Mumbai Indians             |153 |1   |
|Chennai Super Kings        |142 |2   |
|Kolkata Knight Riders      |135 |3   |
|Royal Challengers Bangalore|133 |4   |
|Sunrisers Hyderabad        |125 |5   |
|Punjab Kings               |123 |6   |
|Delhi Capitals             |121 |7   |
|Rajasthan Royals           |116 |8   |
|Gujarat Titans             |37  |9   |
|Lucknow Super Giants       |30  |10  |
|Rising Pune Supergiant     |15  |11  |
|Gujarat Lions              |13  |12  |
|Pune Warriors              |12  |13  |
|NULL                       |8   |14  |
|Kochi Tuskers Kerala       |6   |15  |
+---------------------------+----+----+



### 2. Rolling Match Count per Team (Running Total)

In [4]:
window_team=Window.partitionBy("match_winner").orderBy("match_winner")
running_total= (
    ipl_data.withColumn("runnin_win_number",
                        row_number().over(window_team))
)
running_total.select("match_winner","match_date", "runnin_win_number").show(truncate=False)

+-------------------+----------+-----------------+
|match_winner       |match_date|runnin_win_number|
+-------------------+----------+-----------------+
|NULL               |30-04-2019|1                |
|NULL               |03-05-2023|2                |
|NULL               |21-05-2011|3                |
|NULL               |29-04-2015|4                |
|NULL               |17-05-2015|5                |
|NULL               |26-04-2025|6                |
|NULL               |05-05-2025|7                |
|NULL               |17-05-2025|8                |
|Chennai Super Kings|07-04-2018|1                |
|Chennai Super Kings|10-04-2018|2                |
|Chennai Super Kings|20-04-2018|3                |
|Chennai Super Kings|22-04-2018|4                |
|Chennai Super Kings|25-04-2018|5                |
|Chennai Super Kings|30-04-2018|6                |
|Chennai Super Kings|05-05-2018|7                |
|Chennai Super Kings|13-05-2018|8                |
|Chennai Super Kings|20-05-2018

### 3.Lag Analysis: Was Previous Match Won?

In [7]:
win_flag = (
    ipl_data.withColumn("is_win", (ipl_data.match_winner == ipl_data.team1).cast("int"))
          .withColumn("previous_game_win",
                      lag("is_win", 1).over(Window.partitionBy("team1").orderBy("match_date")))
)

win_flag.show(10, truncate=False)

+--------+---------+--------------+------------+----------+---------------------+------------+------+----------+-------------------+-----+------+---------+---------------------------------------------+-------------------+-------------------+---------------------+-------------+-------------------+-----------+--------------+---------------+------+-----+---------+-------------+--------------+----------------------+------------+-------------------------------+------------------------+----------------------------------------------------------------------------------------------------+-------+-------------------+---------------+---------------------------------------------------------------------+------+-----------------+
|match_id|season_id|balls_per_over|city        |match_date|event_name           |match_number|gender|match_type|format             |overs|season|team_type|venue                                        |toss_winner        |team1              |team2                |toss_decisi

### 4. Lead Function: Who Will They Face Next?

In [8]:
next_opponent=(
    ipl_data.withColumn("Next_team",
                        lead("team2", 1).over(Window.partitionBy("team1").orderBy("match_date"))
))
next_opponent.select("team1", "team2", "Next_team").show(truncate=False)

+-------------------+---------------------------+---------------------------+
|team1              |team2                      |Next_team                  |
+-------------------+---------------------------+---------------------------+
|Chennai Super Kings|Sunrisers Hyderabad        |Delhi Capitals             |
|Chennai Super Kings|Delhi Capitals             |Mumbai Indians             |
|Chennai Super Kings|Mumbai Indians             |Sunrisers Hyderabad        |
|Chennai Super Kings|Sunrisers Hyderabad        |Punjab Kings               |
|Chennai Super Kings|Punjab Kings               |Rajasthan Royals           |
|Chennai Super Kings|Rajasthan Royals           |Delhi Capitals             |
|Chennai Super Kings|Delhi Capitals             |Delhi Capitals             |
|Chennai Super Kings|Delhi Capitals             |Punjab Kings               |
|Chennai Super Kings|Punjab Kings               |Kolkata Knight Riders      |
|Chennai Super Kings|Kolkata Knight Riders      |Rajasthan Royal

### 5. Most Consistent Teams (Rolling Win Rate)

In [10]:

win_rate_spec = Window.partitionBy("match_winner").orderBy("match_date").rowsBetween(Window.unboundedPreceding, 0)

rolling_rate = (
    ipl_data.withColumn("win_flag", (col("match_winner").isNotNull()).cast("int"))
          .withColumn("rolling_win_rate",
                      avg("win_flag").over(win_rate_spec))
)

rolling_rate.select("match_winner", "match_date", "rolling_win_rate").show()


+-------------------+----------+----------------+
|       match_winner|match_date|rolling_win_rate|
+-------------------+----------+----------------+
|               NULL|03-05-2023|             0.0|
|               NULL|05-05-2025|             0.0|
|               NULL|17-05-2015|             0.0|
|               NULL|17-05-2025|             0.0|
|               NULL|21-05-2011|             0.0|
|               NULL|26-04-2025|             0.0|
|               NULL|29-04-2015|             0.0|
|               NULL|30-04-2019|             0.0|
|Chennai Super Kings|01-05-2011|             1.0|
|Chennai Super Kings|01-05-2019|             1.0|
|Chennai Super Kings|01-05-2022|             1.0|
|Chennai Super Kings|01-11-2020|             1.0|
|Chennai Super Kings|02-05-2009|             1.0|
|Chennai Super Kings|02-05-2013|             1.0|
|Chennai Super Kings|02-05-2014|             1.0|
|Chennai Super Kings|03-04-2010|             1.0|
|Chennai Super Kings|03-04-2023|             1.0|
