In [1]:
!pip install pyspark



In [2]:
!pip install -q findspark

In [3]:
# findspark is the helper package installed in the previous cell.
import findspark
# Run before the pyspark import in the next cell.
# NOTE(review): per the findspark docs, init() locates the Spark installation and adds it to sys.path — confirm it is still needed when pyspark is pip-installed.
findspark.init()

In [4]:
# Build (or reuse) the SparkSession that every later cell uses for its DataFrame work.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('df-agg-operations')
    .getOrCreate()
)

# Last expression of the cell: displays the session object.
spark

# EXERCISE 1

**1. Create a DataFrame with specific columns and datatypes.**

**2. Insert records**

**3. Show dataframe.**

In [7]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Column layout of the player statistics; every field is declared nullable.
schema = (
    StructType()
    .add("player_name", StringType(), True)
    .add("matches", IntegerType(), True)
    .add("goals_scored", IntegerType(), True)
    .add("team_name", StringType(), True)
)

# One tuple per player, in schema order: (player_name, matches, goals_scored, team_name).
# NOTE(review): "Argentica" looks like a typo for "Argentina", and Ronaldo is listed under
# "Spain"; the values are kept as-is because the recorded outputs of this notebook use them.
data = [
    ("Christiano Ronaldo", 7, 12, "Spain"),
    ("Lionel Messi", 6, 11, "Argentica"),
    ("Luka Modric", 5, 9, "Croatia"),
    ("Harry Kane", 5, 12, "England"),
    ("Vinicius Junior", 4, 7, "Brazil"),
    ("Sergio Ramos", 7, 5, "Spain"),
    ("Neymar Da Silva", 4, 9, "Brazil"),
]

# Materialise the rows as a DataFrame with the explicit schema and display it.
df1 = spark.createDataFrame(data=data, schema=schema)
df1.show()

+------------------+-------+------------+---------+
|       player_name|matches|goals_scored|team_name|
+------------------+-------+------------+---------+
|Christiano Ronaldo|      7|          12|    Spain|
|      Lionel Messi|      6|          11|Argentica|
|       Luka Modric|      5|           9|  Croatia|
|        Harry Kane|      5|          12|  England|
|   Vinicius Junior|      4|           7|   Brazil|
|      Sergio Ramos|      7|           5|    Spain|
|   Neymar Da Silva|      4|           9|   Brazil|
+------------------+-------+------------+---------+



**4. Show the *number of records* in the dataframe.**

In [9]:
# Number of rows in df1; as the cell's last expression, the value is displayed as the cell output.
df1.count()

7

**5. Show the dataframe with only the following columns: `player_name` and `goals_scored`.**

In [12]:
# Project only the two requested columns and display them.
df1.select(["player_name", "goals_scored"]).show()

+------------------+------------+
|       player_name|goals_scored|
+------------------+------------+
|Christiano Ronaldo|          12|
|      Lionel Messi|          11|
|       Luka Modric|           9|
|        Harry Kane|          12|
|   Vinicius Junior|           7|
|      Sergio Ramos|           5|
|   Neymar Da Silva|           9|
+------------------+------------+



**6. Show the dataframe `sorted` by the column `goals_scored` in *descending* order.**

In [13]:
# Order the players from the highest to the lowest goal tally and display the result.
df1.orderBy(df1["goals_scored"].desc()).show()

+------------------+-------+------------+---------+
|       player_name|matches|goals_scored|team_name|
+------------------+-------+------------+---------+
|Christiano Ronaldo|      7|          12|    Spain|
|        Harry Kane|      5|          12|  England|
|      Lionel Messi|      6|          11|Argentica|
|   Neymar Da Silva|      4|           9|   Brazil|
|       Luka Modric|      5|           9|  Croatia|
|   Vinicius Junior|      4|           7|   Brazil|
|      Sergio Ramos|      7|           5|    Spain|
+------------------+-------+------------+---------+



**7. Show the total number of `goals_scored` in the dataframe.**

In [16]:
# NOTE(review): this import shadows Python's built-in sum for the rest of the notebook;
# later cells call sum(...) with column names and rely on this import.
from pyspark.sql.functions import sum

# Aggregate the whole DataFrame into a single row holding the overall goal count.
total_goals = df1.agg(sum(df1["goals_scored"]).alias("Total Goals"))
total_goals.show()

+-----------+
|Total Goals|
+-----------+
|         65|
+-----------+



**8. Show the `average` number of `goals_scored` in the dataframe.**

In [17]:
from pyspark.sql.functions import avg

# Aggregate the whole DataFrame into a single row holding the mean goals per player.
avg_goals = df1.agg(avg(df1["goals_scored"]).alias("Average Goals"))
avg_goals.show()

+-----------------+
|    Average Goals|
+-----------------+
|9.285714285714286|
+-----------------+



**9. Show the records from the dataframe where the `team_name` is *Spain*.**

In [18]:
# Keep only the rows whose team_name is exactly "Spain" and display them.
df1.where(df1["team_name"] == "Spain").show()

+------------------+-------+------------+---------+
|       player_name|matches|goals_scored|team_name|
+------------------+-------+------------+---------+
|Christiano Ronaldo|      7|          12|    Spain|
|      Sergio Ramos|      7|           5|    Spain|
+------------------+-------+------------+---------+



**10. Show the result set from the dataframe with the columns as `Max Goals`, `Min Goals`, `Total Goals`, `Average Goals` for each `team_name`.**

In [20]:
# NOTE(review): this import shadows Python's built-in min and max; later cells rely on it.
from pyspark.sql.functions import min, max

# The four per-team statistics; sum and avg come from the pyspark imports in earlier cells.
per_team_stats = [
    max("goals_scored").alias("Max Goals"),
    min("goals_scored").alias("Min Goals"),
    sum("goals_scored").alias("Total Goals"),
    avg("goals_scored").alias("Average Goals"),
]

# One output row per team_name.
teams_group = df1.groupBy("team_name").agg(*per_team_stats)
teams_group.show()

+---------+---------+---------+-----------+-------------+
|team_name|Max Goals|Min Goals|Total Goals|Average Goals|
+---------+---------+---------+-----------+-------------+
|  Croatia|        9|        9|          9|          9.0|
|    Spain|       12|        5|         17|          8.5|
|Argentica|       11|       11|         11|         11.0|
|   Brazil|        9|        7|         16|          8.0|
|  England|       12|       12|         12|         12.0|
+---------+---------+---------+-----------+-------------+



**11. Show the `player_name` who scored the most goals. (Columns to show: `player_name`, `goals_scored`.)**

In [23]:
# Highest individual goal tally (max is the pyspark function imported in an earlier cell);
# the aggregate yields a single row, whose only value is pulled back to the driver.
max_goals = df1.agg(max("goals_scored")).first()[0]

# Show every player tied on that tally, not just one of them.
df1.where(df1["goals_scored"] == max_goals).select("player_name", "goals_scored").show()

+------------------+------------+
|       player_name|goals_scored|
+------------------+------------+
|Christiano Ronaldo|          12|
|        Harry Kane|          12|
+------------------+------------+



**12. Show the `player_name` who played the most matches. (Columns to show: `player_name`, `matches`.)**

In [24]:
# Largest number of matches played by any single row (max is the pyspark function
# imported in an earlier cell); the aggregate yields one row with one value.
most_matches = df1.agg(max("matches")).first()[0]

# Show every player tied on that number of matches.
df1.where(df1["matches"] == most_matches).select("player_name", "matches").show()

+------------------+-------+
|       player_name|matches|
+------------------+-------+
|Christiano Ronaldo|      7|
|      Sergio Ramos|      7|
+------------------+-------+



In [25]:
# Alternative approach: total the matches per player first, then keep the player(s)
# with the largest total. sum and max are the pyspark functions imported in earlier cells.
total_matches = df1.groupBy("player_name").agg(sum("matches").alias("matches"))

# Largest per-player total, pulled back to the driver as a plain value.
max_match = total_matches.agg(max("matches")).first()[0]

total_matches.where(total_matches["matches"] == max_match).select("player_name", "matches").show()


+------------------+-------+
|       player_name|matches|
+------------------+-------+
|Christiano Ronaldo|      7|
|      Sergio Ramos|      7|
+------------------+-------+



# EXERCISE 2

**1. Read a CSV file (players2.csv) and create a dataframe with a specific schema**

**2. Display the Dataframe**

In [42]:
# Schema for the players CSV: four nullable string columns.
# NOTE(review): age is declared as a string, so later sorts and comparisons on it work on
# text unless it is cast — confirm whether an integer type was intended.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("player_name", "age", "gender", "country")
])

# Read the file with its first line treated as the header, applying the explicit schema.
df2 = spark.read.schema(schema).option("header", True).csv("players2.csv")

df2.show()

+---------------+---+------+------------+
|    player_name|age|gender|     country|
+---------------+---+------+------------+
|    Shaun Davis| 21|     M|       Spain|
|      Mark Howy| 22|     M|       Spain|
| Cole McConchie| 17|     M| New Zealand|
|   Dave Pollock| 16|     M|South Africa|
|  Main Gimmonds| 19|     M|     England|
|Fellicity Davis| 17|     M| New Zealand|
|      Zoe Stark| 22|     F|     England|
+---------------+---+------+------------+



**3. Show only the first five records of the DataFrame sorted by `age`.**

In [43]:
# age is read as a string column, so sorting on it directly orders the values as text
# (e.g. "9" would rank above "22"). Cast to int so the ordering is numeric, then show
# the first five rows in descending age order.
df2.sort(df2["age"].cast("int"), ascending=False).show(5)

+--------------+---+------+-----------+
|   player_name|age|gender|    country|
+--------------+---+------+-----------+
|     Mark Howy| 22|     M|      Spain|
|     Zoe Stark| 22|     F|    England|
|   Shaun Davis| 21|     M|      Spain|
| Main Gimmonds| 19|     M|    England|
|Cole McConchie| 17|     M|New Zealand|
+--------------+---+------+-----------+
only showing top 5 rows



**4. Derive two new fields `first_name` and `last_name` using `player_name`. (The first name and last name are separated by a space.)**

**5. Display the DataFrame.**

In [44]:
from pyspark.sql.functions import split

# Split the full name on single spaces once and reuse the resulting array column.
player_name_parts = split(df2["player_name"], " ")

# Element 0 becomes first_name and element 1 becomes last_name.
new_df2_1 = (
    df2
    .withColumn("first_name", player_name_parts[0])
    .withColumn("last_name", player_name_parts[1])
)

new_df2_1.show()

+---------------+---+------+------------+----------+---------+
|    player_name|age|gender|     country|first_name|last_name|
+---------------+---+------+------------+----------+---------+
|    Shaun Davis| 21|     M|       Spain|     Shaun|    Davis|
|      Mark Howy| 22|     M|       Spain|      Mark|     Howy|
| Cole McConchie| 17|     M| New Zealand|      Cole|McConchie|
|   Dave Pollock| 16|     M|South Africa|      Dave|  Pollock|
|  Main Gimmonds| 19|     M|     England|      Main| Gimmonds|
|Fellicity Davis| 17|     M| New Zealand| Fellicity|    Davis|
|      Zoe Stark| 22|     F|     England|       Zoe|    Stark|
+---------------+---+------+------------+----------+---------+



In [45]:
from pyspark.sql.functions import expr

# Same derivation as the previous cell, written as SQL expressions:
# index 0 of the space-split name is the first name, index 1 the last name.
new_df2_2 = (
    df2
    .withColumn("first_name", expr("split(player_name, ' ')[0]"))
    .withColumn("last_name", expr("split(player_name, ' ')[1]"))
)

new_df2_2.show()

+---------------+---+------+------------+----------+---------+
|    player_name|age|gender|     country|first_name|last_name|
+---------------+---+------+------------+----------+---------+
|    Shaun Davis| 21|     M|       Spain|     Shaun|    Davis|
|      Mark Howy| 22|     M|       Spain|      Mark|     Howy|
| Cole McConchie| 17|     M| New Zealand|      Cole|McConchie|
|   Dave Pollock| 16|     M|South Africa|      Dave|  Pollock|
|  Main Gimmonds| 19|     M|     England|      Main| Gimmonds|
|Fellicity Davis| 17|     M| New Zealand| Fellicity|    Davis|
|      Zoe Stark| 22|     F|     England|       Zoe|    Stark|
+---------------+---+------+------------+----------+---------+



**6. Show the schema of the dataframe.**

In [46]:
# Print each column's name, type and nullability for the DataFrame that carries the derived name columns.
new_df2_1.printSchema()

root
 |-- player_name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- country: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)



**7. Derive a new field `Is_Eligible`. (Assign it as *True* if the age is less than 21, else *False*)**

In [47]:
from pyspark.sql.functions import col, when

# Players younger than 21 are eligible.
# NOTE(review): age is declared as a string column, so this comparison with an integer
# presumably relies on Spark's implicit cast — confirm.
is_under_21 = col("age") < 21

# True where the condition holds, False in every other case.
new_df2_1 = new_df2_1.withColumn("Is_Eligible", when(is_under_21, True).otherwise(False))

new_df2_1.show()

+---------------+---+------+------------+----------+---------+-----------+
|    player_name|age|gender|     country|first_name|last_name|Is_Eligible|
+---------------+---+------+------------+----------+---------+-----------+
|    Shaun Davis| 21|     M|       Spain|     Shaun|    Davis|      false|
|      Mark Howy| 22|     M|       Spain|      Mark|     Howy|      false|
| Cole McConchie| 17|     M| New Zealand|      Cole|McConchie|       true|
|   Dave Pollock| 16|     M|South Africa|      Dave|  Pollock|       true|
|  Main Gimmonds| 19|     M|     England|      Main| Gimmonds|       true|
|Fellicity Davis| 17|     M| New Zealand| Fellicity|    Davis|       true|
|      Zoe Stark| 22|     F|     England|       Zoe|    Stark|      false|
+---------------+---+------+------------+----------+---------+-----------+



In [48]:
from pyspark.sql.functions import col, when

# SQL-expression variant of the eligibility flag: the boolean result of the comparison
# itself becomes the column value (expr is imported in an earlier cell).
under_21 = expr("age < 21")
new_df2_2 = new_df2_2.withColumn("Is_Eligible", under_21)

new_df2_2.show()

+---------------+---+------+------------+----------+---------+-----------+
|    player_name|age|gender|     country|first_name|last_name|Is_Eligible|
+---------------+---+------+------------+----------+---------+-----------+
|    Shaun Davis| 21|     M|       Spain|     Shaun|    Davis|      false|
|      Mark Howy| 22|     M|       Spain|      Mark|     Howy|      false|
| Cole McConchie| 17|     M| New Zealand|      Cole|McConchie|       true|
|   Dave Pollock| 16|     M|South Africa|      Dave|  Pollock|       true|
|  Main Gimmonds| 19|     M|     England|      Main| Gimmonds|       true|
|Fellicity Davis| 17|     M| New Zealand| Fellicity|    Davis|       true|
|      Zoe Stark| 22|     F|     England|       Zoe|    Stark|      false|
+---------------+---+------+------------+----------+---------+-----------+



**8. Show the records that have `Is_Eligible` as *True*.**

In [49]:
# The boolean column is used directly as the row predicate: only eligible players remain.
new_df2_2.where(new_df2_2["Is_Eligible"]).show()

+---------------+---+------+------------+----------+---------+-----------+
|    player_name|age|gender|     country|first_name|last_name|Is_Eligible|
+---------------+---+------+------------+----------+---------+-----------+
| Cole McConchie| 17|     M| New Zealand|      Cole|McConchie|       true|
|   Dave Pollock| 16|     M|South Africa|      Dave|  Pollock|       true|
|  Main Gimmonds| 19|     M|     England|      Main| Gimmonds|       true|
|Fellicity Davis| 17|     M| New Zealand| Fellicity|    Davis|       true|
+---------------+---+------+------------+----------+---------+-----------+



**9. Show the records that have `Is_Eligible` as *False*.**

In [52]:
# Negate the boolean column to keep only the players who are not eligible.
new_df2_2.where(~new_df2_2["Is_Eligible"]).show()

+-----------+---+------+-------+----------+---------+-----------+
|player_name|age|gender|country|first_name|last_name|Is_Eligible|
+-----------+---+------+-------+----------+---------+-----------+
|Shaun Davis| 21|     M|  Spain|     Shaun|    Davis|      false|
|  Mark Howy| 22|     M|  Spain|      Mark|     Howy|      false|
|  Zoe Stark| 22|     F|England|       Zoe|    Stark|      false|
+-----------+---+------+-------+----------+---------+-----------+

