In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
# Start (or reuse) the Spark session that backs every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("DataFrameManipulation")
    .getOrCreate()
)

# Small in-memory sample: one tuple per person, matching `columns` by position.
columns = ["first_name", "last_name", "age", "profession"]
data = [
    ("Sachin", "V", 22, "Engineer"),
    ("Chaithu", "Kumar", 21, "Doctor"),
    ("Jeevan", "Kumar", 23, "Teacher"),
]
df = spark.createDataFrame(data, schema=columns)

df.show()

+----------+---------+---+----------+
|first_name|last_name|age|profession|
+----------+---------+---+----------+
|    Sachin|        V| 22|  Engineer|
|   Chaithu|    Kumar| 21|    Doctor|
|    Jeevan|    Kumar| 23|   Teacher|
+----------+---------+---+----------+



In [0]:
Manipulating: adding a column

In [0]:
# Attach a "city" column whose value is the same literal on every row.
df = df.withColumn(colName="city", col=lit("Chennai"))
df.show()

+----------+---------+---+----------+-------+
|first_name|last_name|age|profession|   city|
+----------+---------+---+----------+-------+
|    Sachin|        V| 22|  Engineer|Chennai|
|   Chaithu|    Kumar| 21|    Doctor|Chennai|
|    Jeevan|    Kumar| 23|   Teacher|Chennai|
+----------+---------+---+----------+-------+



In [0]:
Dropping

In [0]:
# Remove the "city" column from the DataFrame and display the result.
df = df.drop("city")
df.show()

+----------+---------+---+----------+
|first_name|last_name|age|profession|
+----------+---------+---+----------+
|    Sachin|        V| 22|  Engineer|
|   Chaithu|    Kumar| 21|    Doctor|
|    Jeevan|    Kumar| 23|   Teacher|
+----------+---------+---+----------+



In [0]:
Sorting

In [0]:
# Order the rows by age (ascending by default) without modifying `df`.
df_sorted = df.sort("age")
df_sorted.show()

+----------+---------+---+----------+
|first_name|last_name|age|profession|
+----------+---------+---+----------+
|   Chaithu|    Kumar| 21|    Doctor|
|    Sachin|        V| 22|  Engineer|
|    Jeevan|    Kumar| 23|   Teacher|
+----------+---------+---+----------+



In [0]:
Aggregating

In [0]:
from pyspark.sql.functions import avg

# Compute the mean of the "age" column.
# A plain string passed to select() is resolved as a column *name*, so
# select("avg(age)") fails because no column is literally called "avg(age)".
# Passing the avg() column expression performs the aggregation instead.
# collect() returns a single Row here; [0][0] extracts its only value.
# NOTE: despite its name, `avg_df` holds a scalar, not a DataFrame.
avg_df = df.select(avg("age")).collect()[0][0]
print(avg_df)

In [0]:
Grouping

In [0]:
# Count how many rows fall under each distinct profession.
grouped_df = (
    df.groupby("profession")
    .count()
)
grouped_df.show()

+----------+-----+
|profession|count|
+----------+-----+
|  Engineer|    1|
|    Doctor|    1|
|   Teacher|    1|
+----------+-----+



In [0]:
Joining

In [0]:
# Second sample DataFrame, keyed by profession, used as the right side of the join.
columns2 = ["profession", "gender"]
data2 = [
    ("Engineer", "Male"),
    ("Doctor", "Female"),
    ("Teacher", "Male"),
]
df2 = spark.createDataFrame(data2, schema=columns2)
df2.show()

+----------+------+
|profession|gender|
+----------+------+
|  Engineer|  Male|
|    Doctor|Female|
|   Teacher|  Male|
+----------+------+



In [0]:
# Inner-join the two DataFrames on their shared "profession" column;
# joining by column name keeps a single "profession" column in the result.
joined_df = df.join(
    other=df2,
    on="profession",
    how="inner",
)
joined_df.show()

+----------+----------+---------+---+------+
|profession|first_name|last_name|age|gender|
+----------+----------+---------+---+------+
|    Doctor|   Chaithu|    Kumar| 21|Female|
|  Engineer|    Sachin|        V| 22|  Male|
|   Teacher|    Jeevan|    Kumar| 23|  Male|
+----------+----------+---------+---+------+

