In [0]:
from pyspark.sql.functions import concat_ws, col, udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


In [0]:
deliveries_df = spark.read.format("csv").option("header", "true").load("/FileStore/tables/deliveries.csv")
deliveries_df.show(5)


+--------+------+--------------------+--------------------+----+----+-----------+-------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|match_id|inning|        batting_team|        bowling_team|over|ball|     batter| bowler|non_striker|batsman_runs|extra_runs|total_runs|extras_type|is_wicket|player_dismissed|dismissal_kind|fielder|
+--------+------+--------------------+--------------------+----+----+-----------+-------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   1| SC Ganguly|P Kumar|BB McCullum|           0|         1|         1|    legbyes|        0|              NA|            NA|     NA|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   2|BB McCullum|P Kumar| SC Ganguly|           0|         0|         0|       null|        0|              NA|            NA|     NA|
|  33

In [0]:
deliveries_with_commentary = deliveries_df.withColumn(
    "commentary",
    concat_ws(" ", col("bowler"), col("batter"), col("total_runs"))
)
deliveries_with_commentary.select("bowler", "batter", "total_runs", "commentary").show(5, truncate=False)


+-------+-----------+----------+---------------------+
|bowler |batter     |total_runs|commentary           |
+-------+-----------+----------+---------------------+
|P Kumar|SC Ganguly |1         |P Kumar SC Ganguly 1 |
|P Kumar|BB McCullum|0         |P Kumar BB McCullum 0|
|P Kumar|BB McCullum|1         |P Kumar BB McCullum 1|
|P Kumar|BB McCullum|0         |P Kumar BB McCullum 0|
|P Kumar|BB McCullum|0         |P Kumar BB McCullum 0|
+-------+-----------+----------+---------------------+
only showing top 5 rows



In [0]:
def extract_info(commentary):
    parts = commentary.split()
    if len(parts) >= 3:
        bowler = parts[0]
        batsman = parts[1]
        runs = parts[2]
        if runs.isdigit():
            return bowler, batsman, int(runs)
        else:
            return bowler, batsman, 0  
    else:
        return None, None, None


In [0]:
deliveries_df.show(5)


+--------+------+--------------------+--------------------+----+----+-----------+-------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|match_id|inning|        batting_team|        bowling_team|over|ball|     batter| bowler|non_striker|batsman_runs|extra_runs|total_runs|extras_type|is_wicket|player_dismissed|dismissal_kind|fielder|
+--------+------+--------------------+--------------------+----+----+-----------+-------+-----------+------------+----------+----------+-----------+---------+----------------+--------------+-------+
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   1| SC Ganguly|P Kumar|BB McCullum|           0|         1|         1|    legbyes|        0|              NA|            NA|     NA|
|  335982|     1|Kolkata Knight Ri...|Royal Challengers...|   0|   2|BB McCullum|P Kumar| SC Ganguly|           0|         0|         0|       null|        0|              NA|            NA|     NA|
|  33

In [0]:
deliveries_with_commentary = deliveries_df.withColumn(
    "commentary",
    concat_ws(" ", col("bowler"), col("batter"), col("total_runs"))
)


In [0]:
schema = StructType([
    StructField("Bowler", StringType(), True),
    StructField("Batsman", StringType(), True),
    StructField("Runs", IntegerType(), True)
])


In [0]:
extract_info_udf = udf(extract_info, schema)


In [0]:
extracted_df = deliveries_with_commentary.withColumn("extracted", extract_info_udf(col("commentary")))
extracted_df.select("commentary", "extracted").show(truncate=False)


+-----------------------+---------------+
|commentary             |extracted      |
+-----------------------+---------------+
|P Kumar SC Ganguly 1   |{P, Kumar, 0}  |
|P Kumar BB McCullum 0  |{P, Kumar, 0}  |
|P Kumar BB McCullum 1  |{P, Kumar, 0}  |
|P Kumar BB McCullum 0  |{P, Kumar, 0}  |
|P Kumar BB McCullum 0  |{P, Kumar, 0}  |
|P Kumar BB McCullum 0  |{P, Kumar, 0}  |
|P Kumar BB McCullum 1  |{P, Kumar, 0}  |
|Z Khan BB McCullum 0   |{Z, Khan, 0}   |
|Z Khan BB McCullum 4   |{Z, Khan, 0}   |
|Z Khan BB McCullum 4   |{Z, Khan, 0}   |
|Z Khan BB McCullum 6   |{Z, Khan, 0}   |
|Z Khan BB McCullum 4   |{Z, Khan, 0}   |
|Z Khan BB McCullum 0   |{Z, Khan, 0}   |
|P Kumar SC Ganguly 0   |{P, Kumar, 0}  |
|P Kumar SC Ganguly 0   |{P, Kumar, 0}  |
|P Kumar SC Ganguly 1   |{P, Kumar, 0}  |
|P Kumar BB McCullum 4  |{P, Kumar, 0}  |
|P Kumar BB McCullum 1  |{P, Kumar, 0}  |
|P Kumar SC Ganguly 0   |{P, Kumar, 0}  |
|AA Noffke BB McCullum 5|{AA, Noffke, 0}|
+-----------------------+---------