In [1]:
!pip install pyspark



In [2]:
!pip install -q findspark

In [3]:
# Initialise findspark ahead of the pyspark imports in the following cells.
# NOTE(review): presumably this makes the Spark installation importable by
# adjusting sys.path / SPARK_HOME — confirm against the findspark docs.
import findspark
findspark.init()

In [4]:
# Build (or reuse) the SparkSession that every later cell uses to talk to Spark.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('df-functions-solutions')
    .getOrCreate()
)
# Leave the session as the last expression so the notebook renders its summary.
spark

# EXERCISE 1: WINDOW FUNCTIONS

**1. Create a DataFrame with specific columns and datatypes.**

**2. Insert records**

**3. Show dataframe.**

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Column layout for the players table as (column name, Spark type) pairs;
# every column is declared nullable.
player_stats_columns = [
    ("player_name", StringType()),
    ("team_name", StringType()),
    ("matches_played", IntegerType()),
    ("runs_scored", IntegerType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in player_stats_columns]
)

# One tuple per player, with values in the same order as the schema columns.
data = [
    ("Tim David", "Singapore", 41, 1260),
    ("Peter Clarke", "Zimbabwe", 3, 45),
    ("Devon Conway", "South Africa", 77, 2567),
    ("Glenn Philips", "South Africa", 48, 1126),
    ("Eoin Morgan", "Ireland", 215, 4015),
    ("Rachin Ravindra", "India", 58, 1789),
    ("Sam Curran", "Zimbabwe", 97, 1673),
    ("Jason Sangha", "India", 19, 153),
    ("Brandon Taylor", "Zimbabwe", 49, 1673),
]

# Build the DataFrame from the records and the explicit schema, then display it.
df1 = spark.createDataFrame(data, schema=schema)
df1.show()

+---------------+------------+--------------+-----------+
|    player_name|   team_name|matches_played|runs_scored|
+---------------+------------+--------------+-----------+
|      Tim David|   Singapore|            41|       1260|
|   Peter Clarke|    Zimbabwe|             3|         45|
|   Devon Conway|South Africa|            77|       2567|
|  Glenn Philips|South Africa|            48|       1126|
|    Eoin Morgan|     Ireland|           215|       4015|
|Rachin Ravindra|       India|            58|       1789|
|     Sam Curran|    Zimbabwe|            97|       1673|
|   Jason Sangha|       India|            19|        153|
| Brandon Taylor|    Zimbabwe|            49|       1673|
+---------------+------------+--------------+-----------+



**4. Add a new column `row_num` that assigns a unique, sequential row number to each player within their `team_name`, ordered by `runs_scored` in descending order.**

**5. Show the dataframe.**

In [6]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Shared window: players within a team, highest run total first. The rank()
# and dense_rank() cells reuse it, so it is ordered by runs_scored ONLY —
# players with equal runs must remain tied for those functions.
runs_scored_by_team_window = Window.partitionBy("team_name").orderBy(F.col("runs_scored").desc())

# row_number() numbers tied rows in an arbitrary order, so the result could
# differ between runs when two players of a team have the same runs_scored.
# Use a dedicated window with explicit tie-breakers (more matches played first,
# then player name) to make the numbering deterministic.
row_number_window = Window.partitionBy("team_name").orderBy(
    F.col("runs_scored").desc(),
    F.col("matches_played").desc(),
    F.col("player_name"),
)
df1 = df1.withColumn("row_num", F.row_number().over(row_number_window))

df1.show()

+---------------+------------+--------------+-----------+-------+
|    player_name|   team_name|matches_played|runs_scored|row_num|
+---------------+------------+--------------+-----------+-------+
|Rachin Ravindra|       India|            58|       1789|      1|
|   Jason Sangha|       India|            19|        153|      2|
|    Eoin Morgan|     Ireland|           215|       4015|      1|
|      Tim David|   Singapore|            41|       1260|      1|
|   Devon Conway|South Africa|            77|       2567|      1|
|  Glenn Philips|South Africa|            48|       1126|      2|
|     Sam Curran|    Zimbabwe|            97|       1673|      1|
| Brandon Taylor|    Zimbabwe|            49|       1673|      2|
|   Peter Clarke|    Zimbabwe|             3|         45|      3|
+---------------+------------+--------------+-----------+-------+



**6. Add a new column `rank_v1` that ranks players within their `team_name` by `runs_scored` in descending order (tied players share the same rank, and the ranks that follow a tie are skipped, leaving gaps).**

**7. Show the Dataframe.**

In [7]:
# Rank players over the per-team window defined in the row_num cell and
# store the result as rank_v1.
rank_with_gaps = F.rank().over(runs_scored_by_team_window)
df1 = df1.withColumn("rank_v1", rank_with_gaps)

df1.show()

+---------------+------------+--------------+-----------+-------+-------+
|    player_name|   team_name|matches_played|runs_scored|row_num|rank_v1|
+---------------+------------+--------------+-----------+-------+-------+
|Rachin Ravindra|       India|            58|       1789|      1|      1|
|   Jason Sangha|       India|            19|        153|      2|      2|
|    Eoin Morgan|     Ireland|           215|       4015|      1|      1|
|      Tim David|   Singapore|            41|       1260|      1|      1|
|   Devon Conway|South Africa|            77|       2567|      1|      1|
|  Glenn Philips|South Africa|            48|       1126|      2|      2|
|     Sam Curran|    Zimbabwe|            97|       1673|      1|      1|
| Brandon Taylor|    Zimbabwe|            49|       1673|      2|      1|
|   Peter Clarke|    Zimbabwe|             3|         45|      3|      3|
+---------------+------------+--------------+-----------+-------+-------+



**8. Add a new column `rank_v2` that ranks players within their `team_name` based on `runs_scored` (no gaps in ranking)**

**9. Show the dataframe.**

In [8]:
# Dense-rank players over the per-team window defined in the row_num cell and
# store the result as rank_v2.
rank_without_gaps = F.dense_rank().over(runs_scored_by_team_window)
df1 = df1.withColumn("rank_v2", rank_without_gaps)

df1.show()

+---------------+------------+--------------+-----------+-------+-------+-------+
|    player_name|   team_name|matches_played|runs_scored|row_num|rank_v1|rank_v2|
+---------------+------------+--------------+-----------+-------+-------+-------+
|Rachin Ravindra|       India|            58|       1789|      1|      1|      1|
|   Jason Sangha|       India|            19|        153|      2|      2|      2|
|    Eoin Morgan|     Ireland|           215|       4015|      1|      1|      1|
|      Tim David|   Singapore|            41|       1260|      1|      1|      1|
|   Devon Conway|South Africa|            77|       2567|      1|      1|      1|
|  Glenn Philips|South Africa|            48|       1126|      2|      2|      2|
|     Sam Curran|    Zimbabwe|            97|       1673|      1|      1|      1|
| Brandon Taylor|    Zimbabwe|            49|       1673|      2|      1|      1|
|   Peter Clarke|    Zimbabwe|             3|         45|      3|      3|      2|
+---------------

**10. Compute the cumulative sum of `runs_scored` within each `team_name` ordered by `matches_played` and show the dataframe.**

In [9]:
# Running frame: within each team, rows ordered by matches_played, spanning
# from the first row of the partition up to and including the current row.
cumulative_window = (
    Window.partitionBy("team_name")
    .orderBy("matches_played")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Sum runs_scored over that frame to get the per-team running total.
running_runs_total = F.sum("runs_scored").over(cumulative_window)
df1 = df1.withColumn("cumulative_sum", running_runs_total)

df1.show()

+---------------+------------+--------------+-----------+-------+-------+-------+--------------+
|    player_name|   team_name|matches_played|runs_scored|row_num|rank_v1|rank_v2|cumulative_sum|
+---------------+------------+--------------+-----------+-------+-------+-------+--------------+
|   Jason Sangha|       India|            19|        153|      2|      2|      2|           153|
|Rachin Ravindra|       India|            58|       1789|      1|      1|      1|          1942|
|    Eoin Morgan|     Ireland|           215|       4015|      1|      1|      1|          4015|
|      Tim David|   Singapore|            41|       1260|      1|      1|      1|          1260|
|  Glenn Philips|South Africa|            48|       1126|      2|      2|      2|          1126|
|   Devon Conway|South Africa|            77|       2567|      1|      1|      1|          3693|
|   Peter Clarke|    Zimbabwe|             3|         45|      3|      3|      2|            45|
| Brandon Taylor|    Zimbabwe|

# EXERCISE 2: USER DEFINED FUNCTIONS (UDF)

**1. Create a DataFrame with specific columns and datatypes.**

**2. Insert records**

**3. Show dataframe.**

In [None]:
# Column layout for the player-profile table as (column name, Spark type)
# pairs; every column is declared nullable.
player_profile_columns = [
    ("player_name", StringType()),
    ("age", IntegerType()),
    ("role", StringType()),
    ("team_name", StringType()),
    ("runs_scored", IntegerType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in player_profile_columns]
)

# One tuple per player, with values in the same order as the schema columns.
data = [
    ("Sachin Tendulkar", 50, "Batsman", "India", 18426),
    ("Jacques Kallis", 48, "All-Rounder", "South Africa", 11579),
    ("Muttiah Muralitharan", 52, "Bowler", "Sri Lanka", 126),
    ("AB de Villiers", 39, "Batsman", "South Africa", 9577),
    ("Michael Hussey", 48, "Batsman", "Australia", 5442),
    ("Glenn McGrath", 54, "Bowler", "Australia", 12),
]

# Build the DataFrame from the records and the explicit schema, then display it.
df2 = spark.createDataFrame(data, schema)
df2.show()

+--------------------+---+-----------+------------+-----------+
|         player_name|age|       role|   team_name|runs_scored|
+--------------------+---+-----------+------------+-----------+
|    Sachin Tendulkar| 50|    Batsman|       India|      18426|
|      Jacques Kallis| 48|All-Rounder|South Africa|      11579|
|Muttiah Muralitharan| 52|     Bowler|   Sri Lanka|        126|
|      AB de Villiers| 39|    Batsman|South Africa|       9577|
|      Michael Hussey| 48|    Batsman|   Australia|       5442|
|       Glenn McGrath| 54|     Bowler|   Australia|         12|
+--------------------+---+-----------+------------+-----------+



4. Create a **UDF** to classify players based on runs scored:
   - If `runs_scored` > 10,000 → "Great Batter"
   - If `runs_scored` is between 5,000 and 10,000 (inclusive) → "Good Batter"
   - Otherwise → "Not a good Batter yet"

In [None]:
from pyspark.sql.functions import udf

def classify_players(runs):
    """Classify a player by career runs.

    Args:
        runs: Total runs scored, or None when the value is missing.

    Returns:
        "Great Batter" for more than 10000 runs, "Good Batter" for 5000 to
        10000 runs (both inclusive), "Not a good Batter yet" otherwise, or
        None when ``runs`` is None.
    """
    # The source column is nullable; comparing None with an int raises
    # TypeError, so propagate the missing value instead.
    if runs is None:
        return None
    if runs > 10000:
        return "Great Batter"
    if runs >= 5000:
        return "Good Batter"
    return "Not a good Batter yet"

**5. Apply this UDF and display a new column `"category"`.**

**6. Show the dataframe**

In [None]:
# Wrap the classifier as a string-returning UDF and derive the category
# column from runs_scored.
classify_players_udf = udf(classify_players, returnType=StringType())
category_column = classify_players_udf("runs_scored")
df2 = df2.withColumn("category", category_column)

df2.show(truncate=False)

+--------------------+---+-----------+------------+-----------+---------------------+
|player_name         |age|role       |team_name   |runs_scored|category             |
+--------------------+---+-----------+------------+-----------+---------------------+
|Sachin Tendulkar    |50 |Batsman    |India       |18426      |Great Batter         |
|Jacques Kallis      |48 |All-Rounder|South Africa|11579      |Great Batter         |
|Muttiah Muralitharan|52 |Bowler     |Sri Lanka   |126        |Not a good Batter yet|
|AB de Villiers      |39 |Batsman    |South Africa|9577       |Good Batter          |
|Michael Hussey      |48 |Batsman    |Australia   |5442       |Good Batter          |
|Glenn McGrath       |54 |Bowler     |Australia   |12         |Not a good Batter yet|
+--------------------+---+-----------+------------+-----------+---------------------+



**7. Create UDFs to extract the first name and the last name from `player_name`, and add two new columns: `"first_name"` and `"last_name"`.**

**8. Display the dataframe.**

In [None]:
def get_first_name(name):
    """Return the first whitespace-separated word of ``name``.

    Args:
        name: Full player name, or None when the value is missing.

    Returns:
        The first word of the name, an empty string when the name is empty
        or only whitespace, or None when ``name`` is None.
    """
    # The source column is nullable; None has no .split().
    if name is None:
        return None
    # split() without a separator ignores leading and repeated whitespace,
    # whereas split(" ") would yield an empty first element for " Tim David".
    parts = name.split()
    return parts[0] if parts else ""

def get_last_name(name):
    """Return the last whitespace-separated word of ``name``.

    Args:
        name: Full player name, or None when the value is missing.

    Returns:
        The last word of the name, an empty string when the name is empty
        or only whitespace, or None when ``name`` is None.
    """
    # The source column is nullable; None has no .split().
    if name is None:
        return None
    # split() without a separator ignores trailing and repeated whitespace,
    # whereas split(" ") would yield an empty last element for "Tim David ".
    parts = name.split()
    return parts[-1] if parts else ""

# Wrap both name helpers as string-returning UDFs.
first_name_udf = udf(get_first_name, returnType=StringType())
last_name_udf = udf(get_last_name, returnType=StringType())

# Derive both name columns from player_name in a single chained expression.
df2 = (
    df2
    .withColumn("first_name", first_name_udf("player_name"))
    .withColumn("last_name", last_name_udf("player_name"))
)

df2.show(truncate=False)

+--------------------+---+-----------+------------+-----------+---------------------+----------+------------+
|player_name         |age|role       |team_name   |runs_scored|category             |first_name|last_name   |
+--------------------+---+-----------+------------+-----------+---------------------+----------+------------+
|Sachin Tendulkar    |50 |Batsman    |India       |18426      |Great Batter         |Sachin    |Tendulkar   |
|Jacques Kallis      |48 |All-Rounder|South Africa|11579      |Great Batter         |Jacques   |Kallis      |
|Muttiah Muralitharan|52 |Bowler     |Sri Lanka   |126        |Not a good Batter yet|Muttiah   |Muralitharan|
|AB de Villiers      |39 |Batsman    |South Africa|9577       |Good Batter          |AB        |Villiers    |
|Michael Hussey      |48 |Batsman    |Australia   |5442       |Good Batter          |Michael   |Hussey      |
|Glenn McGrath       |54 |Bowler     |Australia   |12         |Not a good Batter yet|Glenn     |McGrath     |
+---------

**9. Create a UDF to check whether a player is eligible for veteran status (`age >= 45` should return `True`, otherwise `False`). Add a new column `"is_veteran"` and display the dataframe.**

In [None]:
from pyspark.sql.types import BooleanType

def is_veteran(age, min_age=45):
    """Tell whether a player qualifies for veteran status.

    Args:
        age: Player age in years, or None when the value is missing.
        min_age: Minimum age (inclusive) that counts as veteran; defaults to 45.

    Returns:
        True when ``age >= min_age``, False when it is lower, or None when
        ``age`` is None.
    """
    # The source column is nullable; comparing None with an int raises
    # TypeError, so propagate the missing value instead.
    if age is None:
        return None
    return age >= min_age

# Wrap the age check as a boolean-returning UDF and derive is_veteran from age.
veteran_udf = udf(is_veteran, returnType=BooleanType())
veteran_flag = veteran_udf("age")
df2 = df2.withColumn("is_veteran", veteran_flag)

df2.show(truncate=False)

+--------------------+---+-----------+------------+-----------+---------------------+----------+------------+----------+
|player_name         |age|role       |team_name   |runs_scored|category             |first_name|last_name   |is_veteran|
+--------------------+---+-----------+------------+-----------+---------------------+----------+------------+----------+
|Sachin Tendulkar    |50 |Batsman    |India       |18426      |Great Batter         |Sachin    |Tendulkar   |true      |
|Jacques Kallis      |48 |All-Rounder|South Africa|11579      |Great Batter         |Jacques   |Kallis      |true      |
|Muttiah Muralitharan|52 |Bowler     |Sri Lanka   |126        |Not a good Batter yet|Muttiah   |Muralitharan|true      |
|AB de Villiers      |39 |Batsman    |South Africa|9577       |Good Batter          |AB        |Villiers    |false     |
|Michael Hussey      |48 |Batsman    |Australia   |5442       |Good Batter          |Michael   |Hussey      |true      |
|Glenn McGrath       |54 |Bowler

10. Create a **UDF to generate a full descriptive name**, combining `player_name`, `role`, and `team_name` in the format:  
   **"Sachin Tendulkar (Batsman From India)"**  
   Name this new column as `"player_description"`.

11. Display the Dataframe.

In [None]:
def player_description(name, role, team):
    """Build a descriptive label such as "Sachin Tendulkar (Batsman From India)".

    Args:
        name: Player name, or None when missing.
        role: Playing role, or None when missing.
        team: Team name, or None when missing.

    Returns:
        The string "<name> (<role> From <team>)", or None when any of the
        three parts is None.
    """
    # Formatting None would embed the literal text "None" in the label;
    # return a missing value instead when any part is absent.
    if name is None or role is None or team is None:
        return None
    return f"{name} ({role} From {team})"

# Wrap the formatter as a string-returning UDF and feed it the three source
# columns in the order the function expects: name, role, team.
desc_udf = udf(player_description, returnType=StringType())
description_column = desc_udf("player_name", "role", "team_name")
df2 = df2.withColumn("player_description", description_column)

df2.show(truncate=False)

+--------------------+---+-----------+------------+-----------+---------------------+----------+------------+----------+----------------------------------------------+
|player_name         |age|role       |team_name   |runs_scored|category             |first_name|last_name   |is_veteran|player_description                            |
+--------------------+---+-----------+------------+-----------+---------------------+----------+------------+----------+----------------------------------------------+
|Sachin Tendulkar    |50 |Batsman    |India       |18426      |Great Batter         |Sachin    |Tendulkar   |true      |Sachin Tendulkar (Batsman From India)         |
|Jacques Kallis      |48 |All-Rounder|South Africa|11579      |Great Batter         |Jacques   |Kallis      |true      |Jacques Kallis (All-Rounder From South Africa)|
|Muttiah Muralitharan|52 |Bowler     |Sri Lanka   |126        |Not a good Batter yet|Muttiah   |Muralitharan|true      |Muttiah Muralitharan (Bowler From Sri La