<a href="https://colab.research.google.com/github/RajuKGosala-45/PySpark-Practice-Journey-With-IPL_Data/blob/main/IPL_Match_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PySpark Setup (for Colab)**

In [None]:
# PySpark Setup (for Colab)
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!pip install -q pyspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PATH"] += os.pathsep + os.path.join(os.environ["JAVA_HOME"], "bin")



# **Day 34 - Advanced PySpark Practices Using IPL_Match_Data**

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import*
from pyspark.sql.types import*
from pyspark.sql.window import Window

spark = (
    SparkSession.builder
    .appName("IPL_Match_Data_Analysis")
    .getOrCreate()
)


def _read_csv_with_header(path):
    """Read a CSV file whose first row is the header, letting Spark infer column types."""
    return spark.read.csv(path, header=True, inferSchema=True)


def _print_overview(label, frame):
    """Print the schema, the default `show()` preview and the `describe()` summary of `frame`.

    `label` is interpolated into the three banner lines printed before each section.
    """
    print(f"---{label} Schema---")
    frame.printSchema()
    print(f"---{label}")
    frame.show()
    print(f"---{label} Describe")
    frame.describe().show()


# Source datasets (paths are the Colab upload locations).
matches_df = _read_csv_with_header("/content/ipl_matches_data.csv")
players_df = _read_csv_with_header("/content/players-data-updated.csv")
teams_df = _read_csv_with_header("/content/teams_data.csv")

# IPL matches data info
_print_overview("Matches_Data", matches_df)

# Players data info
_print_overview("Players_Data", players_df)

# Teams data info
_print_overview("Teams_Data", teams_df)


---Matches_Data Schema---
root
 |-- match_id: integer (nullable = true)
 |-- season_id: integer (nullable = true)
 |-- balls_per_over: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- match_date: string (nullable = true)
 |-- event_name: string (nullable = true)
 |-- match_number: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- match_type: string (nullable = true)
 |-- format: timestamp (nullable = true)
 |-- overs: integer (nullable = true)
 |-- season: string (nullable = true)
 |-- team_type: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- toss_winner: string (nullable = true)
 |-- team1: string (nullable = true)
 |-- team2: string (nullable = true)
 |-- toss_decision: string (nullable = true)
 |-- match_winner: string (nullable = true)
 |-- win_by_runs: string (nullable = true)
 |-- win_by_wickets: string (nullable = true)
 |-- player_of_match: string (nullable = true)
 |-- result: string (nullable = true)
 |-- stage: string (

## 1. Join Datasets in PySpark

In [None]:
# Enrich every match row with the teams_df record matching `team1` and the
# players_df record matching `player_of_match`. Both joins are left joins, so
# match rows without a matching team or player are kept.
_team1_condition = matches_df["team1"] == teams_df["team_name"]
_award_condition = matches_df["player_of_match"] == players_df["player_id"]

Ipl_data = (
    matches_df
    .join(teams_df, on=_team1_condition, how="left")
    .join(players_df, on=_award_condition, how="left")
)
Ipl_data.show()

+--------+---------+--------------+---------+----------+--------------------+------------+------+----------+-------------------+-----+------+---------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-----------+--------------+---------------+------+-----+-------+--------------------+---------------+--------------------+---------+---------------+--------------+--------------------+--------------------+--------------------+-------------------+--------------------+
|match_id|season_id|balls_per_over|     city|match_date|          event_name|match_number|gender|match_type|             format|overs|season|team_type|               venue|         toss_winner|               team1|               team2|toss_decision|        match_winner|win_by_runs|win_by_wickets|player_of_match|result|stage|team_id|           team_name|team_name_short|           image_url|player_id|    player_name|     bat_style|          bowl_style|        

## 2. Aggregations - Top Winning Teams

In [None]:
# Count match rows per `match_winner` value, then list the largest counts first.
_wins_per_winner = (
    matches_df
    .groupBy("match_winner")
    .agg(count("*").alias("total_wins"))
)
team_wins = _wins_per_winner.orderBy("total_wins", ascending=False)
team_wins.show(5)

+--------------------+----------+
|        match_winner|total_wins|
+--------------------+----------+
|      Mumbai Indians|       153|
| Chennai Super Kings|       142|
|Kolkata Knight Ri...|       135|
|Royal Challengers...|       133|
| Sunrisers Hyderabad|       125|
+--------------------+----------+
only showing top 5 rows



## 3. Player Analysis – Most Player of the Match Awards

In [None]:
# Count Player-of-the-Match awards per player, most-awarded first.
# The inner join keeps one row per match whose `player_of_match` value matches
# a `player_id` in players_df.
_award_rows = players_df.join(
    matches_df,
    players_df["player_id"] == matches_df["player_of_match"],
    "inner",
)

top_players = (
    _award_rows
    # Group on the id as well as the name: grouping on `player_name` alone
    # would merge two different players who share the same display name.
    # NOTE(review): assumes `player_id` identifies one player - confirm.
    .groupBy(players_df["player_id"], players_df["player_name"])
    .agg(count("*").alias("MOM_Awards"))
    # Drop the id again so the result keeps its two-column shape
    # (player_name, MOM_Awards).
    .drop("player_id")
    .orderBy(desc("MOM_Awards"))
)
top_players.show(5)

+--------------+----------+
|   player_name|MOM_Awards|
+--------------+----------+
|AB de Villiers|        25|
|      CH Gayle|        22|
|     RG Sharma|        21|
|       V Kohli|        19|
|     DA Warner|        18|
+--------------+----------+
only showing top 5 rows



## 4. Use Window Functions – Top 3 Players by Player of the Match Awards

In [None]:
# Rank players by their Player-of-the-Match award count (highest first) and
# display those with rank 1-3.
# The ordering column is spelled exactly as it is aliased in `top_players`
# ("MOM_Awards") so the lookup does not depend on case-insensitive column
# resolution being enabled.
window_player = Window.orderBy(desc("MOM_Awards"))

# rank() gives tied players the same rank, so the filter below can return more
# than three rows when award counts are equal.
ranked_players = top_players.withColumn("rank", rank().over(window_player))
ranked_players.filter(col("rank") <= 3).show(truncate=False)

+--------------+----------+----+
|player_name   |MOM_Awards|rank|
+--------------+----------+----+
|AB de Villiers|25        |1   |
|CH Gayle      |22        |2   |
|RG Sharma     |21        |3   |
+--------------+----------+----+



## 5. Team Performance Summary

In [None]:
# Per-winner summary: number of matches won plus the average winning margin in
# runs and in wickets, rounded to two decimals, ordered by matches won.
#
# `win_by_runs` and `win_by_wickets` are loaded as string columns, so they are
# converted explicitly before averaging. try_cast yields NULL for values that
# are not numeric (avg() skips NULLs) instead of relying on implicit
# string-to-double coercion, which raises an error when ANSI mode is enabled.
_run_margin = expr("try_cast(win_by_runs AS DOUBLE)")
_wicket_margin = expr("try_cast(win_by_wickets AS DOUBLE)")

# `round` here is pyspark.sql.functions.round (brought in by the star import),
# not the Python builtin.
team_performance = matches_df.groupBy("match_winner").agg(
    count("*").alias("Matches_Won"),
    round(avg(_run_margin), 2).alias("Avg_Run_Margin"),
    round(avg(_wicket_margin), 2).alias("Avg_Wicket_Margin")
).orderBy(desc("Matches_Won"))

team_performance.show(10, truncate=False)

+---------------------------+-----------+--------------+-----------------+
|match_winner               |Matches_Won|Avg_Run_Margin|Avg_Wicket_Margin|
+---------------------------+-----------+--------------+-----------------+
|Mumbai Indians             |153        |34.03         |6.24             |
|Chennai Super Kings        |142        |35.61         |5.93             |
|Kolkata Knight Riders      |135        |33.49         |6.24             |
|Royal Challengers Bangalore|133        |33.03         |6.59             |
|Sunrisers Hyderabad        |125        |26.31         |6.67             |
|Punjab Kings               |123        |24.21         |6.17             |
|Delhi Capitals             |121        |25.62         |6.18             |
|Rajasthan Royals           |116        |30.71         |5.87             |
|Gujarat Titans             |37         |36.47         |6.05             |
|Lucknow Super Giants       |30         |22.1          |5.4              |
+------------------------