In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import transform, col, concat, lit

spark = (SparkSession.builder
         .appName("basic-transformations")
         .master("spark://spark-master:7077")
         .config("spark.executor.memory", "512m")
         .getOrCreate())

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/04 16:28:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df = (spark.read.format("json")
      .option("multiLine", "true")
      .load("../data/nobel_prizes.json"))

                                                                                

In [3]:
# Apply transform function to Numbers column
df_transformed = (
    df.select("category"
              , "overallMotivation"
              , "year"
              , "laureates"
              , transform(col("laureates"), lambda x: concat(x.firstname,lit(" "), x.surname))
              .alias("laureates_full_name")))

df_transformed.show()

                                                                                

+----------+--------------------+----+--------------------+--------------------+
|  category|   overallMotivation|year|           laureates| laureates_full_name|
+----------+--------------------+----+--------------------+--------------------+
| chemistry|                null|2022|[{Carolyn, 1015, ...|[Carolyn Bertozzi...|
| economics|                null|2022|[{Ben, 1021, "for...|[Ben Bernanke, Do...|
|literature|                null|2022|[{Annie, 1017, "f...|      [Annie Ernaux]|
|     peace|                null|2022|[{Ales, 1018, "Th...|[Ales Bialiatski ...|
|   physics|                null|2022|[{Alain, 1012, "f...|[Alain Aspect, nu...|
|  medicine|                null|2022|[{Svante, 1011, "...|      [Svante Pääbo]|
| chemistry|                null|2021|[{Benjamin, 1002,...|[Benjamin List, D...|
| economics|                null|2021|[{David, 1007, "f...|[David Card, Josh...|
|literature|                null|2021|[{Abdulrazak, 100...| [Abdulrazak Gurnah]|
|     peace|                

In [4]:
df_deduped = df.dropDuplicates(["category","overallMotivation", "year"])

df_deduped.show()

[Stage 2:>                                                          (0 + 1) / 1]

+---------+--------------------+-----------------+----+
| category|           laureates|overallMotivation|year|
+---------+--------------------+-----------------+----+
|chemistry|[{Jacobus H., 160...|             null|1901|
|chemistry|[{Emil, 161, "in ...|             null|1902|
|chemistry|[{Svante, 162, "i...|             null|1903|
|chemistry|[{Sir William, 16...|             null|1904|
|chemistry|[{Adolf, 164, "in...|             null|1905|
|chemistry|[{Henri, 165, "in...|             null|1906|
|chemistry|[{Eduard, 166, "f...|             null|1907|
|chemistry|[{Ernest, 167, "f...|             null|1908|
|chemistry|[{Wilhelm, 168, "...|             null|1909|
|chemistry|[{Otto, 169, "in ...|             null|1910|
|chemistry|[{Marie, 6, "in r...|             null|1911|
|chemistry|[{Victor, 172, "f...|             null|1912|
|chemistry|[{Alfred, 174, "i...|             null|1913|
|chemistry|[{Theodore W., 17...|             null|1914|
|chemistry|[{Richard, 176, "...|             nul

                                                                                

In [5]:
# Sort by year in ascending order
df_sorted = df.orderBy("year")

df_sorted.show()

[Stage 5:>                                                          (0 + 1) / 1]

+----------+--------------------+-----------------+----+
|  category|           laureates|overallMotivation|year|
+----------+--------------------+-----------------+----+
| chemistry|[{Jacobus H., 160...|             null|1901|
|literature|[{Sully, 569, "in...|             null|1901|
|     peace|[{Henry, 462, "fo...|             null|1901|
|   physics|[{Wilhelm Conrad,...|             null|1901|
|  medicine|[{Emil, 293, "for...|             null|1901|
| chemistry|[{Emil, 161, "in ...|             null|1902|
|literature|[{Theodor, 571, "...|             null|1902|
|     peace|[{Élie, 464, "for...|             null|1902|
|   physics|[{Hendrik A., 2, ...|             null|1902|
|  medicine|[{Ronald, 294, "f...|             null|1902|
|literature|[{Bjørnstjerne, 5...|             null|1903|
| chemistry|[{Svante, 162, "i...|             null|1903|
|     peace|[{Randal, 466, "f...|             null|1903|
|   physics|[{Henri, 4, "in r...|             null|1903|
|  medicine|[{Niels Ryberg, 2..

                                                                                

In [6]:
# Sort by year in descending order, then by category in ascending order
df_sorted = df.orderBy(["year", "category"], ascending=[False, True])

df_sorted.show()

+----------+--------------------+--------------------+----+
|  category|           laureates|   overallMotivation|year|
+----------+--------------------+--------------------+----+
| chemistry|[{Carolyn, 1015, ...|                null|2022|
| economics|[{Ben, 1021, "for...|                null|2022|
|literature|[{Annie, 1017, "f...|                null|2022|
|  medicine|[{Svante, 1011, "...|                null|2022|
|     peace|[{Ales, 1018, "Th...|                null|2022|
|   physics|[{Alain, 1012, "f...|                null|2022|
| chemistry|[{Benjamin, 1002,...|                null|2021|
| economics|[{David, 1007, "f...|                null|2021|
|literature|[{Abdulrazak, 100...|                null|2021|
|  medicine|[{David, 997, "fo...|                null|2021|
|     peace|[{Maria, 1005, "f...|                null|2021|
|   physics|[{Syukuro, 999, "...|"for groundbreaki...|2021|
| chemistry|[{Emmanuelle, 991...|                null|2020|
| economics|[{Paul, 995, "for...|       

In [7]:
# Sort by Age in ascending order, then by Name in descending order
df_sorted = df.sort(["year", "category"], ascending=[False, True])

df_sorted.show()

+----------+--------------------+--------------------+----+
|  category|           laureates|   overallMotivation|year|
+----------+--------------------+--------------------+----+
| chemistry|[{Carolyn, 1015, ...|                null|2022|
| economics|[{Ben, 1021, "for...|                null|2022|
|literature|[{Annie, 1017, "f...|                null|2022|
|  medicine|[{Svante, 1011, "...|                null|2022|
|     peace|[{Ales, 1018, "Th...|                null|2022|
|   physics|[{Alain, 1012, "f...|                null|2022|
| chemistry|[{Benjamin, 1002,...|                null|2021|
| economics|[{David, 1007, "f...|                null|2021|
|literature|[{Abdulrazak, 100...|                null|2021|
|  medicine|[{David, 997, "fo...|                null|2021|
|     peace|[{Maria, 1005, "f...|                null|2021|
|   physics|[{Syukuro, 999, "...|"for groundbreaki...|2021|
| chemistry|[{Emmanuelle, 991...|                null|2020|
| economics|[{Paul, 995, "for...|       

In [8]:
df_renamed = df.withColumnRenamed("category", "Topic")

df_renamed.show()

+----------+--------------------+--------------------+----+
|     Topic|           laureates|   overallMotivation|year|
+----------+--------------------+--------------------+----+
| chemistry|[{Carolyn, 1015, ...|                null|2022|
| economics|[{Ben, 1021, "for...|                null|2022|
|literature|[{Annie, 1017, "f...|                null|2022|
|     peace|[{Ales, 1018, "Th...|                null|2022|
|   physics|[{Alain, 1012, "f...|                null|2022|
|  medicine|[{Svante, 1011, "...|                null|2022|
| chemistry|[{Benjamin, 1002,...|                null|2021|
| economics|[{David, 1007, "f...|                null|2021|
|literature|[{Abdulrazak, 100...|                null|2021|
|     peace|[{Maria, 1005, "f...|                null|2021|
|   physics|[{Syukuro, 999, "...|"for groundbreaki...|2021|
|  medicine|[{David, 997, "fo...|                null|2021|
| chemistry|[{Emmanuelle, 991...|                null|2020|
| economics|[{Paul, 995, "for...|       

In [9]:
df_renamed = (
    df.selectExpr("category as Topic"
                  , "year as Year"
                  , "overallMotivation as Motivation"))

df_renamed.show()

+----------+----+--------------------+
|     Topic|Year|          Motivation|
+----------+----+--------------------+
| chemistry|2022|                null|
| economics|2022|                null|
|literature|2022|                null|
|     peace|2022|                null|
|   physics|2022|                null|
|  medicine|2022|                null|
| chemistry|2021|                null|
| economics|2021|                null|
|literature|2021|                null|
|     peace|2021|                null|
|   physics|2021|"for groundbreaki...|
|  medicine|2021|                null|
| chemistry|2020|                null|
| economics|2020|                null|
|literature|2020|                null|
|     peace|2020|                null|
|   physics|2020|                null|
|  medicine|2020|                null|
| chemistry|2019|                null|
| economics|2019|                null|
+----------+----+--------------------+
only showing top 20 rows



In [10]:
spark.stop()