In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum

In [2]:
spark = SparkSession.builder.appName("Data manipulation").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/08 16:50:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/08 16:50:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/01/08 16:50:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/01/08 16:50:14 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/01/08 16:50:14 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [3]:
data = [(1, "A", 10), (2, "B", 20), (3, "A", 30), (4, "C", 40)]
columns = ["id", "category", "value"]
df = spark.createDataFrame(data, columns)


# Filtering rows where value is greater than 15 

In [4]:
filtered_df = df.filter(col("value") > 15)

# Grouping by category and aggregating with sum and average

In [5]:
agg_df = df.groupBy("category").agg(sum("value").alias("total_value"), avg("value").alias("avg_value"))

filtered_df.show()
agg_df.show()

                                                                                

+---+--------+-----+
| id|category|value|
+---+--------+-----+
|  2|       B|   20|
|  3|       A|   30|
|  4|       C|   40|
+---+--------+-----+

+--------+-----------+---------+
|category|total_value|avg_value|
+--------+-----------+---------+
|       A|         40|     20.0|
|       B|         20|     20.0|
|       C|         40|     40.0|
+--------+-----------+---------+



# JOINS

In [6]:
data1 = [(1, "A"), (2, "B"), (3, "C")]
data2 = [(1, "Apple"), (2, "Banana"), (4, "Durian")]

df1 = spark.createDataFrame(data1, ["id", "category"])
df2 = spark.createDataFrame(data2, ["id", "fruit"])

In [7]:
df1.show()

+---+--------+
| id|category|
+---+--------+
|  1|       A|
|  2|       B|
|  3|       C|
+---+--------+



In [8]:
df2.show()

+---+------+
| id| fruit|
+---+------+
|  1| Apple|
|  2|Banana|
|  4|Durian|
+---+------+



In [9]:
inner_join_df = df1.join(df2, on="id", how="inner")

In [10]:
left_join_df = df1.join(df2, on="id", how="left")

In [11]:
union_df = df1.union(df2) 

inner_join_df.show()
left_join_df.show()
union_df.show()

+---+--------+------+
| id|category| fruit|
+---+--------+------+
|  1|       A| Apple|
|  2|       B|Banana|
+---+--------+------+

+---+--------+------+
| id|category| fruit|
+---+--------+------+
|  1|       A| Apple|
|  2|       B|Banana|
|  3|       C|  NULL|
+---+--------+------+

+---+--------+
| id|category|
+---+--------+
|  1|       A|
|  2|       B|
|  3|       C|
|  1|   Apple|
|  2|  Banana|
|  4|  Durian|
+---+--------+



# Window Functions

In [12]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, rank, sum, dense_rank

In [13]:
data = [(1, "A", 100), (2, "A", 200), (3, "B", 300), (4, "B", 400)]
df = spark.createDataFrame(data, ["id", "category", "value"])

In [14]:
window_spec = Window.partitionBy("category").orderBy("value")
df_with_row_number = df.withColumn("row_number", row_number().over(window_spec))
df_with_rank = df.withColumn("rank", rank().over(window_spec))
df_with_running_total = df.withColumn("running_total", sum("value").over(window_spec))
df_with_dense_rank = df.withColumn("dense_rank", dense_rank().over(window_spec))
df_with_row_number.show()
df_with_rank.show()
df_with_running_total.show()
df_with_dense_rank.show()

+---+--------+-----+----------+
| id|category|value|row_number|
+---+--------+-----+----------+
|  1|       A|  100|         1|
|  2|       A|  200|         2|
|  3|       B|  300|         1|
|  4|       B|  400|         2|
+---+--------+-----+----------+

+---+--------+-----+----+
| id|category|value|rank|
+---+--------+-----+----+
|  1|       A|  100|   1|
|  2|       A|  200|   2|
|  3|       B|  300|   1|
|  4|       B|  400|   2|
+---+--------+-----+----+

+---+--------+-----+-------------+
| id|category|value|running_total|
+---+--------+-----+-------------+
|  1|       A|  100|          100|
|  2|       A|  200|          300|
|  3|       B|  300|          300|
|  4|       B|  400|          700|
+---+--------+-----+-------------+

+---+--------+-----+----------+
| id|category|value|dense_rank|
+---+--------+-----+----------+
|  1|       A|  100|         1|
|  2|       A|  200|         2|
|  3|       B|  300|         1|
|  4|       B|  400|         2|
+---+--------+-----+---------

25/01/08 16:50:26 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
