In [1]:
import os
import sys

# Set Spark home
os.environ['SPARK_HOME'] = 'C:\\spark\\spark-3.5.6-bin-hadoop3'

# Add PySpark to Python path
sys.path.insert(0, os.path.join(os.environ['SPARK_HOME'], 'python'))
sys.path.insert(0, os.path.join(os.environ['SPARK_HOME'], 'python\\lib\\py4j-0.10.9.7-src.zip'))

# Initialize Spark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("JupyterSpark") \
    .getOrCreate()

sc = spark.sparkContext

print("Spark ready!")

Spark ready!


In [37]:
# ipl_pyspark_analysis.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, year, month, to_date
from pyspark.sql.functions import desc
from pyspark.sql.functions import *
import pandas as pd

In [12]:
spark = SparkSession.builder.appName("IPL2022 Analysis").getOrCreate()


print("Current directory:", os.getcwd())
print("Files in directory:")
for file in os.listdir('.'):
    print(f"  - {file}")

# Look for IPL files
ipl_files = [f for f in os.listdir('.') if 'ipl' in f.lower()]
print(f"\nIPL files found: {ipl_files}")


Current directory: C:\Users\kallu\BDA
Files in directory:
  - .ipynb_checkpoints
  - F1.ipynb
  - Formula1.csv
  - ipl2022.csv
  - ipl2022.ipynb
  - students.csv
  - week10.ipynb
  - week11.ipynb
  - week6.ipynb
  - week7.ipynb
  - week8.ipynb
  - week9.ipynb

IPL files found: ['ipl2022.csv', 'ipl2022.ipynb']


In [27]:
# Option 1: If file is in current directory (most likely)
input_path = "ipl2022.csv"  # or "ipl2022.csv" or whatever the actual filename is


try:
    # Read with pandas first
    pandas_df = pd.read_csv("ipl2022.csv")
    
    # Convert to Spark DataFrame
    df= spark.createDataFrame(pandas_df)
    
    print("‚úÖ IPL dataset loaded via pandas successfully!")
    print(f"Dataset shape: ({df_ipl.count()}, {len(df_ipl.columns)})")
    df.show(5)
    
except Exception as e:
    print(f"‚ùå Error: {e}")



‚úÖ IPL dataset loaded via pandas successfully!
Dataset shape: (17912, 17)
+-------+-------+-----+----------+-----------+--------------+-----------+----------+-----------+----------+---------+------------+----------------+----------+----+-----------------+----------------+
|     ID|innings|overs|ballnumber|     batter|        bowler|non-striker|extra_type|batsman_run|extras_run|total_run|non_boundary|isWicketDelivery|player_out|kind|fielders_involved|     BattingTeam|
+-------+-------+-----+----------+-----------+--------------+-----------+----------+-----------+----------+---------+------------+----------------+----------+----+-----------------+----------------+
|1312200|      1|    0|         1|YBK Jaiswal|Mohammed Shami| JC Buttler|       NaN|          0|         0|        0|           0|               0|       NaN| NaN|              NaN|Rajasthan Royals|
|1312200|      1|    0|         2|YBK Jaiswal|Mohammed Shami| JC Buttler|   legbyes|          0|         1|        1|           0

In [28]:
df.printSchema()

# ‚úÖ Show first 10 rows
df.show(10, truncate=False)

root
 |-- ID: long (nullable = true)
 |-- innings: long (nullable = true)
 |-- overs: long (nullable = true)
 |-- ballnumber: long (nullable = true)
 |-- batter: string (nullable = true)
 |-- bowler: string (nullable = true)
 |-- non-striker: string (nullable = true)
 |-- extra_type: string (nullable = true)
 |-- batsman_run: long (nullable = true)
 |-- extras_run: long (nullable = true)
 |-- total_run: long (nullable = true)
 |-- non_boundary: long (nullable = true)
 |-- isWicketDelivery: long (nullable = true)
 |-- player_out: string (nullable = true)
 |-- kind: string (nullable = true)
 |-- fielders_involved: string (nullable = true)
 |-- BattingTeam: string (nullable = true)

+-------+-------+-----+----------+-----------+--------------+-----------+----------+-----------+----------+---------+------------+----------------+----------+----+-----------------+----------------+
|ID     |innings|overs|ballnumber|batter     |bowler        |non-striker|extra_type|batsman_run|extras_run|total

In [30]:
print(f"Total matches: {df.count()}")
print(f"Total columns: {len(df.columns)}")

Total matches: 17912
Total columns: 17


In [40]:
print("=== üèè SIMPLE IPL ANALYSIS ===")

# 1. Basic Info
print(f"üìä Total deliveries: {df.count()}")

# 2. Top Batters (by runs scored)
print("\nüî• TOP 10 BATTERS:")
top_batters = df.groupBy("batter") \
    .agg(sum("batsman_run").alias("total_runs"), 
         count("*").alias("balls_faced")) \
    .orderBy(desc("total_runs")) \
    .limit(10)
top_batters.show(truncate=False)



=== üèè SIMPLE IPL ANALYSIS ===
üìä Total deliveries: 17912

üî• TOP 10 BATTERS:
+------------+----------+-----------+
|batter      |total_runs|balls_faced|
+------------+----------+-----------+
|JC Buttler  |863       |596        |
|KL Rahul    |616       |472        |
|Q de Kock   |508       |350        |
|HH Pandya   |487       |387        |
|Shubman Gill|483       |374        |
|DA Miller   |481       |348        |
|F du Plessis|468       |377        |
|S Dhawan    |460       |395        |
|SV Samson   |458       |321        |
|DJ Hooda    |451       |340        |
+------------+----------+-----------+



In [42]:
# 3. Top Bowlers (by wickets taken)
print("\nüéØ TOP 10 BOWLERS:")
if "isWicketDelivery" in df.columns:
    top_bowlers = df.filter(col("isWicketDelivery") == 1) \
        .groupBy("bowler") \
        .agg(count("*").alias("wickets")) \
        .orderBy(desc("wickets")) \
        .limit(10)
    top_bowlers.show(truncate=False)




üéØ TOP 10 BOWLERS:
+-----------------+-------+
|bowler           |wickets|
+-----------------+-------+
|YS Chahal        |29     |
|PWH de Silva     |27     |
|K Rabada         |23     |
|Umran Malik      |23     |
|M Prasidh Krishna|21     |
|HV Patel         |21     |
|Mohammed Shami   |21     |
|Kuldeep Yadav    |21     |
|JR Hazlewood     |21     |
|AD Russell       |21     |
+-----------------+-------+



In [43]:
# 4. Team Batting Performance
print("\nüèè TEAM BATTING:")
if "BattingTeam" in df.columns:
    team_batting = df.groupBy("BattingTeam") \
        .agg(sum("total_run").alias("total_runs"),
             count("*").alias("balls_bowled")) \
        .orderBy(desc("total_runs"))
    team_batting.show(truncate=False)




üèè TEAM BATTING:
+---------------------------+----------+------------+
|BattingTeam                |total_runs|balls_bowled|
+---------------------------+----------+------------+
|Rajasthan Royals           |2943      |2107        |
|Gujarat Titans             |2663      |1971        |
|Royal Challengers Bangalore|2632      |1965        |
|Lucknow Super Giants       |2548      |1840        |
|Punjab Kings               |2343      |1712        |
|Delhi Capitals             |2341      |1650        |
|Chennai Super Kings        |2288      |1719        |
|Kolkata Knight Riders      |2223      |1634        |
|Mumbai Indians             |2217      |1691        |
|Sunrisers Hyderabad        |2197      |1623        |
+---------------------------+----------+------------+



In [44]:
# 5. Most Common Dismissal Types
print("\nüìà DISMISSAL TYPES:")
if "kind" in df.columns:
    dismissals = df.filter(col("isWicketDelivery") == 1) \
        .groupBy("kind") \
        .agg(count("*").alias("count")) \
        .orderBy(desc("count"))
    dismissals.show(truncate=False)



üìà DISMISSAL TYPES:
+-----------------+-----+
|kind             |count|
+-----------------+-----+
|caught           |625  |
|bowled           |123  |
|run out          |61   |
|lbw              |57   |
|caught and bowled|25   |
|stumped          |18   |
|hit wicket       |1    |
|retired out      |1    |
|retired hurt     |1    |
+-----------------+-----+



In [45]:

print("‚úÖ Analysis Complete!")
spark.stop()

‚úÖ Analysis Complete!
