In [1]:
!pip install pyspark



In [2]:
!pip install -q findspark

In [3]:
import findspark
findspark.init()

In [4]:
# creating a spark session that will be used to perform all the necessary task on Spark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName('df-operations-exercise').getOrCreate()
spark

# EXERCISE 1

**1. Create a DataFrame with four columns: `Name`, `Age`, `City` and `Country`.**

**2. Insert the following five records:**

In [7]:
# data for the dataframe
data = [
    ("Mark Boucher", 42, "Cape Town", "South Africa"),
    ("Shaun Pollock", 48, "Durban", "South Africa"),
    ("Brendon McCullam", 39, "Wellington", "New ZeaLand"),
    ("Saurav Ganguly", 44, "Mumbai", "India"),
    ("Shoaib Akhtar", 41, "Rawalpindi", "Pakistan")
]

# Creating the DataFrame
ex1_df = spark.createDataFrame(data, ["Name", "Age", "City", "Country"])

**3. Display the DataFrame.**

In [8]:
ex1_df.show()

+----------------+---+----------+------------+
|            Name|Age|      City|     Country|
+----------------+---+----------+------------+
|    Mark Boucher| 42| Cape Town|South Africa|
|   Shaun Pollock| 48|    Durban|South Africa|
|Brendon McCullam| 39|Wellington| New ZeaLand|
|  Saurav Ganguly| 44|    Mumbai|       India|
|   Shoaib Akhtar| 41|Rawalpindi|    Pakistan|
+----------------+---+----------+------------+



**4. Show the dataframe with only the following columns: `Name` and `Country`.**

In [9]:
ex1_df.select("Name", "Country").show()

+----------------+------------+
|            Name|     Country|
+----------------+------------+
|    Mark Boucher|South Africa|
|   Shaun Pollock|South Africa|
|Brendon McCullam| New ZeaLand|
|  Saurav Ganguly|       India|
|   Shoaib Akhtar|    Pakistan|
+----------------+------------+



**5. Show the dataframe sorted by the column `Age` in *descending* order.**

In [12]:
from pyspark.sql.functions import col
ex1_df.orderBy(col("Age").desc()).show()

+----------------+---+----------+------------+
|            Name|Age|      City|     Country|
+----------------+---+----------+------------+
|   Shaun Pollock| 48|    Durban|South Africa|
|  Saurav Ganguly| 44|    Mumbai|       India|
|    Mark Boucher| 42| Cape Town|South Africa|
|   Shoaib Akhtar| 41|Rawalpindi|    Pakistan|
|Brendon McCullam| 39|Wellington| New ZeaLand|
+----------------+---+----------+------------+



In [13]:
ex1_df.sort("Age", ascending=False).show()

+----------------+---+----------+------------+
|            Name|Age|      City|     Country|
+----------------+---+----------+------------+
|   Shaun Pollock| 48|    Durban|South Africa|
|  Saurav Ganguly| 44|    Mumbai|       India|
|    Mark Boucher| 42| Cape Town|South Africa|
|   Shoaib Akhtar| 41|Rawalpindi|    Pakistan|
|Brendon McCullam| 39|Wellington| New ZeaLand|
+----------------+---+----------+------------+



**6. Show the columns available in the dataframe.**

In [14]:
ex1_df.columns

['Name', 'Age', 'City', 'Country']

**7. Drop the column `Age` from the Dataframe.**

In [18]:
ex1_df = ex1_df.drop("Age")

**8. Show the schema of the dataframe as of now.**

In [19]:
ex1_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)



**9. Show the records from the dataframe where the `Country` is South Africa.**

In [22]:
ex1_df.filter(ex1_df.Country == "South Africa").show()

+-------------+---------+------------+
|         Name|     City|     Country|
+-------------+---------+------------+
| Mark Boucher|Cape Town|South Africa|
|Shaun Pollock|   Durban|South Africa|
+-------------+---------+------------+



# EXERCISE 2

1. Read the csv file and create its dataframe with following schema
   - `first_name` (StringType)
   - `last_name` (StringType)
   - `gender` (StringType)
   - `age` (IntegerType)

2. Display the DataFrame.


In [25]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Defining the schema
schema = StructType([
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("age", IntegerType(), True)
])

ex2_df = spark.read.csv("players1.csv", header=True, schema=schema)

ex2_df.show()

+----------+---------+------+---+
|first_name|last_name|gender|age|
+----------+---------+------+---+
|      Ross|   Taylor|     M| 34|
|     Clint|    McKay|     M| 37|
|      Kyle|    Mills|     M| 38|
|       Meg|  Lanning|     F| 33|
|     Suzie|    Bates|     F| 35|
|      Todd|    Astle|     M| 43|
|     Chloe|    Tryon|     F| 31|
|  Samantha|    Bates|     F| 29|
|     Glenn|  Maxwell|     M| 33|
|   Deandra|   Dottin|     F| 37|
+----------+---------+------+---+



**3. Show only the first five records of the DataFrame.**

In [28]:
ex2_df.show(5)

+----------+---------+------+---+
|first_name|last_name|gender|age|
+----------+---------+------+---+
|      Ross|   Taylor|     M| 34|
|     Clint|    McKay|     M| 37|
|      Kyle|    Mills|     M| 38|
|       Meg|  Lanning|     F| 33|
|     Suzie|    Bates|     F| 35|
+----------+---------+------+---+
only showing top 5 rows



**4. Show the number of counts of each `gender`.**

In [33]:
from pyspark.sql.functions import count

gender_group_count = ex2_df.groupBy("gender").agg(count("*").alias("Cnt")).show()

+------+---+
|gender|Cnt|
+------+---+
|     F|  5|
|     M|  5|
+------+---+



**5. Show the `first_name`, `last_name` and the `age` of the player who is the youngest in the whole dataframe.**

In [35]:
from pyspark.sql.functions import min
youngest_player = ex2_df.orderBy("Age").select("first_name", "last_name", "Age").limit(1).show()

+----------+---------+---+
|first_name|last_name|Age|
+----------+---------+---+
|  Samantha|    Bates| 29|
+----------+---------+---+



In [42]:
# method 2

min_age = ex2_df.select(min("age")).collect()[0][0]
ex2_df.filter(ex2_df.age == min_age).select("first_name", "last_name", "Age").show()

+----------+---------+---+
|first_name|last_name|Age|
+----------+---------+---+
|  Samantha|    Bates| 29|
+----------+---------+---+



**6. Show the `first_name`, `last_name`, `gender` and the `age` of the player who is the oldest among their gender in the whole dataframe.**

In [64]:
from pyspark.sql.functions import max

oldest_by_gender = ex2_df.groupBy("gender").agg(max("age").alias("max_age"))
oldest_players = ex2_df.join(oldest_by_gender.drop("gender"), (ex2_df.gender == oldest_by_gender.gender) & (ex2_df.age == oldest_by_gender.max_age)) \
                       .select(ex2_df.first_name, ex2_df.last_name, ex2_df.gender, ex2_df.age)
# oldest_player.columns
oldest_players.show()

+----------+---------+------+---+
|first_name|last_name|gender|age|
+----------+---------+------+---+
|     Clint|    McKay|     M| 37|
|      Todd|    Astle|     M| 43|
|   Deandra|   Dottin|     F| 37|
+----------+---------+------+---+



In [43]:
# method 2

from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Defining a window partitioned by gender, ordered by age descending
window_spec = Window.partitionBy("gender").orderBy(col("age").desc())

# Ranking players within each gender by age
df_ranked = ex2_df.withColumn("rank", rank().over(window_spec))

# Select the players with rank 1 (oldest)
oldest_players = df_ranked.filter(col("rank") == 1).select("first_name", "last_name", "gender", "Age")
oldest_players.show()


+----------+---------+---+
|first_name|last_name|Age|
+----------+---------+---+
|      Todd|    Astle| 43|
+----------+---------+---+



**7. Add another column `full_name` which would be the concatenation of `first_name` and `last_name` in the dataframe.**

In [38]:
from pyspark.sql.functions import concat_ws
ex2_df = ex2_df.withColumn("full_name", concat_ws(" ", col("first_name"), col("last_name")))

ex2_df.show()

+----------+---------+------+---+--------------+
|first_name|last_name|gender|age|     full_name|
+----------+---------+------+---+--------------+
|      Ross|   Taylor|     M| 34|   Ross Taylor|
|     Clint|    McKay|     M| 37|   Clint McKay|
|      Kyle|    Mills|     M| 38|    Kyle Mills|
|       Meg|  Lanning|     F| 33|   Meg Lanning|
|     Suzie|    Bates|     F| 35|   Suzie Bates|
|      Todd|    Astle|     M| 43|    Todd Astle|
|     Chloe|    Tryon|     F| 31|   Chloe Tryon|
|  Samantha|    Bates|     F| 29|Samantha Bates|
|     Glenn|  Maxwell|     M| 33| Glenn Maxwell|
|   Deandra|   Dottin|     F| 37|Deandra Dottin|
+----------+---------+------+---+--------------+



In [65]:
from pyspark.sql.functions import expr

df = ex2_df.withColumn("full_name", expr("first_name || ' ' || last_name"))
df.show()

+----------+---------+------+---+--------------+
|first_name|last_name|gender|age|     full_name|
+----------+---------+------+---+--------------+
|      Ross|   Taylor|     M| 34|   Ross Taylor|
|     Clint|    McKay|     M| 37|   Clint McKay|
|      Kyle|    Mills|     M| 38|    Kyle Mills|
|       Meg|  Lanning|     F| 33|   Meg Lanning|
|     Suzie|    Bates|     F| 35|   Suzie Bates|
|      Todd|    Astle|     M| 43|    Todd Astle|
|     Chloe|    Tryon|     F| 31|   Chloe Tryon|
|  Samantha|    Bates|     F| 29|Samantha Bates|
|     Glenn|  Maxwell|     M| 33| Glenn Maxwell|
|   Deandra|   Dottin|     F| 37|Deandra Dottin|
+----------+---------+------+---+--------------+



**8. Show the schema of the dataframe.**

In [39]:
ex2_df.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- full_name: string (nullable = false)



**9. Show the records for those who have `age` greater than 35.**

In [41]:
ex2_df.where(ex2_df.age > 35).show()

+----------+---------+------+---+--------------+
|first_name|last_name|gender|age|     full_name|
+----------+---------+------+---+--------------+
|     Clint|    McKay|     M| 37|   Clint McKay|
|      Kyle|    Mills|     M| 38|    Kyle Mills|
|      Todd|    Astle|     M| 43|    Todd Astle|
|   Deandra|   Dottin|     F| 37|Deandra Dottin|
+----------+---------+------+---+--------------+

