In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DiveInDataframes").master("local[*]").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/04 11:01:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Why use manual schema?

- When schema inference is expensive on large files.

- To enforce correct types (prevent errors).

- To ensure consistent schema across multiple reads.

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("marks", DoubleType(), True),
    StructField("city", StringType(), True)
])

manual_df = spark.read.csv("students.csv", header=True, schema=schema)
manual_df.printSchema()


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- marks: double (nullable = true)
 |-- city: string (nullable = true)



In [18]:
print(manual_df.show(3))
print(manual_df.printSchema())
print(manual_df.describe().show())


+---+-----+-----+----+
| id| name|marks|city|
+---+-----+-----+----+
|  1|Aisha| NULL|  90|
|  2|  Raj| NULL|  80|
|  3| Neha| NULL|  85|
+---+-----+-----+----+
only showing top 3 rows
None
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- marks: double (nullable = true)
 |-- city: string (nullable = true)

None
+-------+------------------+-----+-----+-----------------+
|summary|                id| name|marks|             city|
+-------+------------------+-----+-----+-----------------+
|  count|                 5|    5|    0|                5|
|   mean|               3.0| NULL| NULL|             82.6|
| stddev|1.5811388300841898| NULL| NULL|7.987490219086343|
|    min|                 1|Aisha| NULL|               70|
|    max|                 5|  Raj| NULL|               90|
+-------+------------------+-----+-----+-----------------+

None


25/11/03 16:59:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: ID, Name, Subject, Score
 Schema: id, name, marks, city
Expected: marks but found: Subject
CSV file: file:///home/developer/Workspace_Projects/Data_Engineer/PySpark/students.csv
25/11/03 16:59:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: ID, Name, Subject, Score
 Schema: id, name, marks, city
Expected: marks but found: Subject
CSV file: file:///home/developer/Workspace_Projects/Data_Engineer/PySpark/students.csv


In [3]:

data = [("John", 23, "NY"),
        ("Alice", 29, "LA"),
        ("Robert", 34, "SF")]

columns = ["Name", "Age", "City"]

df = spark.createDataFrame(data, columns)
df.show()

                                                                                

+------+---+----+
|  Name|Age|City|
+------+---+----+
|  John| 23|  NY|
| Alice| 29|  LA|
|Robert| 34|  SF|
+------+---+----+



In [20]:
df.select(df["Name"].alias("FullName")).show()


+--------+
|FullName|
+--------+
|    John|
|   Alice|
|  Robert|
+--------+



In [21]:
df.filter(df.Age > 25).show()
df.where(df.City == "NY").show()
df.filter(df.City.isin("LA", "SF")).show()
df.filter(df.Age.between(20, 30)).show()


+------+---+----+
|  Name|Age|City|
+------+---+----+
| Alice| 29|  LA|
|Robert| 34|  SF|
+------+---+----+

+----+---+----+
|Name|Age|City|
+----+---+----+
|John| 23|  NY|
+----+---+----+

+------+---+----+
|  Name|Age|City|
+------+---+----+
| Alice| 29|  LA|
|Robert| 34|  SF|
+------+---+----+

+-----+---+----+
| Name|Age|City|
+-----+---+----+
| John| 23|  NY|
|Alice| 29|  LA|
+-----+---+----+



In [22]:
from pyspark.sql.functions import when

df4 = df.withColumn("Category", when(df.Age < 25, "Young").otherwise("Adult"))
df4.show()


+------+---+----+--------+
|  Name|Age|City|Category|
+------+---+----+--------+
|  John| 23|  NY|   Young|
| Alice| 29|  LA|   Adult|
|Robert| 34|  SF|   Adult|
+------+---+----+--------+



In [24]:
df.na.replace("NY", "New York").show()


+------+---+--------+
|  Name|Age|    City|
+------+---+--------+
|  John| 23|New York|
| Alice| 29|      LA|
|Robert| 34|      SF|
+------+---+--------+



In [25]:
rdd_from_df = df.rdd
print(rdd_from_df.take(2))


[Row(Name='John', Age=23, City='NY'), Row(Name='Alice', Age=29, City='LA')]


In [4]:
from pyspark.sql.functions import sum, avg, min, max, count

df = spark.read.parquet("modern_employee_data.parquet", header=True, inferSchema=True)

grouped_df = df.groupBy("Country").agg(
    count("*").alias("TotalStudents"),
    avg("Age").alias("AvgAge"),
    max("Age").alias("Oldest"),
    min("Age").alias("Youngest")
)
grouped_df.show()

+---------+-------------+-----------------+------+--------+
|  Country|TotalStudents|           AvgAge|Oldest|Youngest|
+---------+-------------+-----------------+------+--------+
|    India|          155|40.66451612903226|    60|      21|
|      USA|          153|40.22875816993464|    60|      21|
|       UK|          154|41.16233766233766|    60|      21|
|Australia|          138|40.88405797101449|    60|      21|
+---------+-------------+-----------------+------+--------+



In [31]:
df.show(5)

+-----------+-------------------+------+---+-------+------------------+----------+------+------------+----------------+-----------------+--------------------+--------------+
|employee_id|               name|gender|age|country|             state|department|salary|joining_date|experience_years|performance_score|               email|  phone_number|
+-----------+-------------------+------+---+-------+------------------+----------+------+------------+----------------+-----------------+--------------------+--------------+
|          1|      Katrina Riley|  Male| 51|     UK|South Allisonmouth| Marketing| 97018|  2020-04-30|               5|             3.88|   becky87@gmail.com|  326-034-8112|
|          2|         Divij Raja| Other| 60|     UK|          Bhilwara| Marketing| 83808|  2016-05-25|               2|             4.57|lagan10@krishnan-...|   07045303968|
|          3|        Ivana Divan|  Male| 29|  India|         Rajasthan| Marketing|160911|  2019-10-24|              11|           

In [33]:
df.groupBy("Country").count().show()

+---------+-----+
|  Country|count|
+---------+-----+
|    India|  155|
|      USA|  153|
|       UK|  154|
|Australia|  138|
+---------+-----+



In [None]:
from pyspark.sql.functions import avg
df.groupBy("department").agg(avg("salary").alias("Avg_Salary")).show()

+----------+------------------+
|department|        Avg_Salary|
+----------+------------------+
|     Sales|140049.72727272726|
|        HR| 141130.7882352941|
|   Finance|153565.27472527474|
|     Admin|141213.59259259258|
| Marketing| 152743.7590361446|
|        IT|149318.83529411766|
|Operations|146407.54022988505|
+----------+------------------+



In [48]:
from pyspark.sql.functions import desc

df.groupBy("country", "department").agg(
    avg("salary").alias("avg_salary"),
    count("*").alias("emp_count")
).orderBy(desc("emp_count")).show(24)

+---------+----------+------------------+---------+
|  country|department|        avg_salary|emp_count|
+---------+----------+------------------+---------+
|      USA|   Finance|172351.62068965516|       29|
|    India|        HR|145319.33333333334|       27|
|Australia|     Admin|129202.76923076923|       26|
|    India|     Sales|131644.76923076922|       26|
|    India|        IT|143953.26923076922|       26|
|       UK|Operations|148373.04166666666|       24|
|       UK| Marketing|150033.16666666666|       24|
|    India|   Finance|150180.08333333334|       24|
|      USA|     Sales|145252.70833333334|       24|
|       UK|        IT|137518.78260869565|       23|
|       UK|     Sales|128299.82608695653|       23|
|      USA| Marketing|156002.36363636365|       22|
|       UK|     Admin|155517.86363636365|       22|
|       UK|   Finance|146697.19047619047|       21|
|Australia| Marketing|147776.09523809524|       21|
|    India|Operations|166568.47619047618|       21|
|Australia|O

In [57]:
from pyspark.sql.functions import expr

df.groupBy("department").agg(
    expr(("round(avg(salary) * 1.1, 2)")).alias("avg_salary_with_bonus")
).show()

+----------+---------------------+
|department|avg_salary_with_bonus|
+----------+---------------------+
|     Sales|             154054.7|
|        HR|            155243.87|
|   Finance|             168921.8|
|     Admin|            155334.95|
| Marketing|            168018.13|
|        IT|            164250.72|
|Operations|            161048.29|
+----------+---------------------+



In [60]:
from pyspark.sql.functions import countDistinct, approx_count_distinct

df.agg(
    countDistinct("Country").alias("distinct_countries"),
    approx_count_distinct("email").alias("approx_unique_emails")
).show()

+------------------+--------------------+
|distinct_countries|approx_unique_emails|
+------------------+--------------------+
|                 4|                 589|
+------------------+--------------------+



In [68]:
#  Goal: Subtotals and grand totals.

df.rollup("Country", "department").agg(
    sum("salary").alias("total_salary")
).orderBy("Country", "department", ascending= False).show()

+-------+----------+------------+
|Country|department|total_salary|
+-------+----------+------------+
|    USA|     Sales|     3486065|
|    USA|Operations|     2850033|
|    USA| Marketing|     3432052|
|    USA|        IT|     2777077|
|    USA|        HR|     3033527|
|    USA|   Finance|     4998197|
|    USA|     Admin|     2273681|
|    USA|      NULL|    22850632|
|     UK|     Sales|     2950896|
|     UK|Operations|     3560953|
|     UK| Marketing|     3600796|
|     UK|        IT|     3162932|
|     UK|        HR|     2311845|
|     UK|   Finance|     3080641|
|     UK|     Admin|     3421393|
|     UK|      NULL|    22089456|
|  India|     Sales|     3422764|
|  India|Operations|     3497938|
|  India| Marketing|     2541586|
|  India|        IT|     3742785|
+-------+----------+------------+
only showing top 20 rows


In [69]:
df.cube("country", "department").agg(sum("salary").alias("total_salary")).show()

+---------+----------+------------+
|  country|department|total_salary|
+---------+----------+------------+
|     NULL| Marketing|    12677732|
|    India|     Admin|     2383955|
|     NULL|   Finance|    13974440|
|    India|     Sales|     3422764|
|       UK|        HR|     2311845|
|    India|        HR|     3923622|
|       UK|Operations|     3560953|
|    India|        IT|     3742785|
|     NULL|     Admin|    11438301|
|Australia| Marketing|     3103298|
|      USA|Operations|     2850033|
|       UK| Marketing|     3600796|
|     NULL|     Sales|    12324376|
|     NULL|Operations|    12737456|
|       UK|        IT|     3162932|
|    India|   Finance|     3604322|
|     NULL|        HR|    11996117|
|     NULL|      NULL|    87840523|
|Australia|     Sales|     2464651|
|      USA|   Finance|     4998197|
+---------+----------+------------+
only showing top 20 rows


In [73]:
pivot_df = df.groupBy("country").pivot("department").agg(max("salary")).alias("Highest_Salary").show()

+---------+------+-------+------+------+---------+----------+------+
|  country| Admin|Finance|    HR|    IT|Marketing|Operations| Sales|
+---------+------+-------+------+------+---------+----------+------+
|    India|243129| 246545|245801|247341|   235095|    243716|244013|
|      USA|214719| 245062|233484|242102|   248181|    239365|245754|
|       UK|249191| 244811|247480|243858|   248213|    247156|249185|
|Australia|241577| 249375|243805|247517|   244619|    249484|244045|
+---------+------+-------+------+------+---------+----------+------+



In [74]:
results = df.groupBy("department").agg(avg("salary").alias("avg_salary")).collect()

for row in results:
    print(row["department"], "→", row["avg_salary"])


Sales → 140049.72727272726
HR → 141130.7882352941
Finance → 153565.27472527474
Admin → 141213.59259259258
Marketing → 152743.7590361446
IT → 149318.83529411766
Operations → 146407.54022988505


In [75]:
print(df.groupBy("country").agg(avg("salary")).explain(True))


== Parsed Logical Plan ==
'Aggregate ['country], ['country, unresolvedalias('avg('salary))]
+- Relation [employee_id#1951,name#1952,gender#1953,age#1954,country#1955,state#1956,department#1957,salary#1958,joining_date#1959,experience_years#1960,performance_score#1961,email#1962,phone_number#1963] parquet

== Analyzed Logical Plan ==
country: string, avg(salary): double
Aggregate [country#1955], [country#1955, avg(salary#1958) AS avg(salary)#5898]
+- Relation [employee_id#1951,name#1952,gender#1953,age#1954,country#1955,state#1956,department#1957,salary#1958,joining_date#1959,experience_years#1960,performance_score#1961,email#1962,phone_number#1963] parquet

== Optimized Logical Plan ==
Aggregate [country#1955], [country#1955, avg(salary#1958) AS avg(salary)#5898]
+- Project [country#1955, salary#1958]
   +- Relation [employee_id#1951,name#1952,gender#1953,age#1954,country#1955,state#1956,department#1957,salary#1958,joining_date#1959,experience_years#1960,performance_score#1961,email#19

In [54]:
import textwrap

plan_string = df._jdf.queryExecution().executedPlan().toString()

# Wrap the string to a specific width (e.g., 80 characters)
wrapped_plan = textwrap.fill(plan_string, width=60)

print(wrapped_plan)


*(1) ColumnarToRow +- FileScan parquet [employee_id#17,name#
18,gender#19,age#20,country#21,state#22,department#23,salary
#24,joining_date#25,experience_years#26,performance_score#27
,email#28,phone_number#29] Batched: true, DataFilters: [],
Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/
home/developer/Workspace_Projects/Data_Engineer/PySpark/mode
rn_e..., PartitionFilters: [], PushedFilters: [],
ReadSchema: struct<employee_id:int,name:string,gender:string
,age:int,country:string,state:string,department:s...


In [29]:
spark = SparkSession.builder.appName("JoinsExample").getOrCreate()

emp_df = spark.read.parquet("modern_employee_data.parquet")
dept_df = spark.read.csv("department_budget.csv", header=True, inferSchema=True)

emp_df.printSchema()
dept_df.show()

root
 |-- employee_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- joining_date: date (nullable = true)
 |-- experience_years: integer (nullable = true)
 |-- performance_score: double (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_number: string (nullable = true)

+----------+------+-------------+
|Department|Budget|      Manager|
+----------+------+-------------+
| Marketing|500000|  Sarah Miles|
|        HR|200000|  John Carter|
|        IT|700000|   David Wong|
|   Finance|400000| Priya Sharma|
|     Sales|350000|  Luke Martin|
|Operations|450000|Emma Thompson|
|     Admin|250000| Chris Parker|
+----------+------+-------------+



In [28]:
inner_join = emp_df.join(dept_df, "department", "inner")
inner_join.select("name", "salary", "department", "budget").show(10)

+-------------------+------+----------+------+
|               name|salary|department|budget|
+-------------------+------+----------+------+
|      Katrina Riley| 97018| Marketing|500000|
|         Divij Raja| 83808| Marketing|500000|
|        Ivana Divan|160911| Marketing|500000|
|Miss Margaret Lucas|244675|        HR|200000|
| Ms Kathleen Turner|114263|     Sales|350000|
| Divyansh Sabharwal|194621|        IT|700000|
|         Levi Mason|153641| Marketing|500000|
|    Dr Terry Knight|246545|   Finance|400000|
| Brittney Daugherty|202992|Operations|450000|
|       Taylor Smith| 58293|        IT|700000|
+-------------------+------+----------+------+
only showing top 10 rows


In [36]:
left_join = emp_df.join(dept_df, "department", "left")
left_join.select("name", "department", "budget").show(10)


+-------------------+----------+------+
|               name|department|budget|
+-------------------+----------+------+
|      Katrina Riley| Marketing|500000|
|         Divij Raja| Marketing|500000|
|        Ivana Divan| Marketing|500000|
|Miss Margaret Lucas|        HR|200000|
| Ms Kathleen Turner|     Sales|350000|
| Divyansh Sabharwal|        IT|700000|
|         Levi Mason| Marketing|500000|
|    Dr Terry Knight|   Finance|400000|
| Brittney Daugherty|Operations|450000|
|       Taylor Smith|        IT|700000|
+-------------------+----------+------+
only showing top 10 rows


In [None]:
# Only keeps employees whose department exists in the dept file
semi_join = emp_df.join(dept_df, "department", "left_semi")
semi_join.show(6)

+----------+-----------+-------------------+------+---+-------+------------------+------+------------+----------------+-----------------+--------------------+--------------+
|department|employee_id|               name|gender|age|country|             state|salary|joining_date|experience_years|performance_score|               email|  phone_number|
+----------+-----------+-------------------+------+---+-------+------------------+------+------------+----------------+-----------------+--------------------+--------------+
| Marketing|          1|      Katrina Riley|  Male| 51|     UK|South Allisonmouth| 97018|  2020-04-30|               5|             3.88|   becky87@gmail.com|  326-034-8112|
| Marketing|          2|         Divij Raja| Other| 60|     UK|          Bhilwara| 83808|  2016-05-25|               2|             4.57|lagan10@krishnan-...|   07045303968|
| Marketing|          3|        Ivana Divan|  Male| 29|  India|         Rajasthan|160911|  2019-10-24|              11|           

In [39]:
anti_join = emp_df.join(dept_df, "department", "left_anti")
anti_join.show()


+----------+-----------+----+------+---+-------+-----+------+------------+----------------+-----------------+-----+------------+
|department|employee_id|name|gender|age|country|state|salary|joining_date|experience_years|performance_score|email|phone_number|
+----------+-----------+----+------+---+-------+-----+------+------------+----------------+-----------------+-----+------------+
+----------+-----------+----+------+---+-------+-----+------+------------+----------------+-----------------+-----+------------+



In [None]:
cross_join = emp_df.crossJoin(dept_df)
cross_join.show(10)


+-----------+-------------+------+---+-------+------------------+----------+------+------------+----------------+-----------------+--------------------+------------+----------+------+-------------+
|employee_id|         name|gender|age|country|             state|department|salary|joining_date|experience_years|performance_score|               email|phone_number|Department|Budget|      Manager|
+-----------+-------------+------+---+-------+------------------+----------+------+------------+----------------+-----------------+--------------------+------------+----------+------+-------------+
|          1|Katrina Riley|  Male| 51|     UK|South Allisonmouth| Marketing| 97018|  2020-04-30|               5|             3.88|   becky87@gmail.com|326-034-8112| Marketing|500000|  Sarah Miles|
|          1|Katrina Riley|  Male| 51|     UK|South Allisonmouth| Marketing| 97018|  2020-04-30|               5|             3.88|   becky87@gmail.com|326-034-8112|        HR|200000|  John Carter|
|         

In [50]:
from pyspark.sql import functions as F 

combined_df = emp_df.join(dept_df, "department", "inner")

dept_analysis = combined_df.groupBy("department") \
    .agg(
        F.sum("salary").alias("total_salary"),
        F.first("budget").alias("budget")
    ) 

dept_analysis = dept_analysis.withColumn(
    "budget_remaining", F.col("budget") - F.col("total_salary")
)

dept_analysis.show()


+----------+------------+--------+----------------+
|department|total_salary|  budget|budget_remaining|
+----------+------------+--------+----------------+
|     Sales|    12324376|35000000|        22675624|
|        HR|    11996117|20000000|         8003883|
|   Finance|    13974440|40000000|        26025560|
|     Admin|    11438301|25000000|        13561699|
| Marketing|    12677732|50000000|        37322268|
|        IT|    12692101|70000000|        57307899|
|Operations|    12737456|45000000|        32262544|
+----------+------------+--------+----------------+



In [51]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, dense_rank, row_number, avg, sum, lead, lag


In [53]:
salary_range = Window.partitionBy("department").orderBy(col("salary").desc())

ranked_df = emp_df.withColumn("rank", rank().over(salary_range))\
                  .withColumn("dense_rank", dense_rank().over(salary_range))\
                  .withColumn("row_number", row_number().over(salary_range))

ranked_df.select("name", "department", "salary", "rank", "dense_rank", "row_number").show(20)

+------------------+----------+------+----+----------+----------+
|              name|department|salary|rank|dense_rank|row_number|
+------------------+----------+------+----+----------+----------+
|      Nishith Kade|     Admin|249191|   1|         1|         1|
|   Rebecca Collins|     Admin|244091|   2|         2|         2|
|     Biju Aggarwal|     Admin|243782|   3|         3|         3|
|    Kayleigh Patel|     Admin|243129|   4|         4|         4|
|        Vidur Jani|     Admin|241577|   5|         5|         5|
|      William Ware|     Admin|240562|   6|         6|         6|
|    Pihu Zachariah|     Admin|236161|   7|         7|         7|
|    Stephen Howard|     Admin|235240|   8|         8|         8|
|      Paul Jenkins|     Admin|235228|   9|         9|         9|
|     Hollie Wilson|     Admin|228204|  10|        10|        10|
| Robert Richardson|     Admin|222452|  11|        11|        11|
|      Seher Chahal|     Admin|218583|  12|        12|        12|
| Bethan W