In [0]:
from pyspark.sql import functions as F, types as T

In [0]:
# Column names for the sample employee rows built below.
columns = ["employee_name", "department", "salary"]

# Sample rows; ("James", "Sales", 3000) appears twice on purpose so the
# de-duplication cells further down have an exact duplicate to remove.
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("James", "Sales", 4000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

df = spark.createDataFrame(data, columns)
display(df)

distinct

In [0]:
# distinct() compares whole rows: only rows equal in every column collapse.
distinct_df = df.distinct()
display(distinct_df)

dropDuplicates

In [0]:
# Restrict the duplicate check to these two columns, so a single row is kept
# for each (employee_name, department) pair regardless of salary.
dedupe_keys = ["employee_name", "department"]
distinct_df = df.dropDuplicates(dedupe_keys)
display(distinct_df)

In [0]:
# With no subset, every column takes part in the duplicate check.
distinct_df = df.drop_duplicates()
display(distinct_df)

orderBy | sort

In [0]:
# sort(): lowest salary first.
df.sort(F.asc("salary")).display()

In [0]:
# sort(): highest salary first.
df.sort(F.desc("salary")).display()

In [0]:
# orderBy(): lowest salary first.
df.orderBy(F.asc("salary")).display()

In [0]:
# orderBy(): highest salary first.
df.orderBy(F.desc("salary")).display()

groupBy

In [0]:
# Total salary per department using the grouped-data shortcut .sum().
total_sal_df = df.groupBy("department").sum("salary")
display(total_sal_df)

In [0]:
# agg() computes several aggregates per group in one go, and each aggregate
# can be given an explicit output column name via alias().
total_sal_df = df.groupBy("department").agg(
    F.sum(F.col("salary")).alias("total_salary_of_dep"),
    F.avg(F.col("salary")).alias("avg_salary_of_dep"),
)
display(total_sal_df)

joins

In [0]:
# tableA tableB
# 1 1
# 1 2
# 1 3
# 2 4
# 3 Null
# Null 1

In [0]:
# 2 == 2

In [0]:
# null == null --> NULL (not True), so rows with null keys never match in a join

In [0]:
# left join result (tableA key, tableB key)
# 1 1
# 1 1
# 1 1
# 1 1
# 1 1
# 1 1
# 2 2
# 3 3
# null null

In [0]:
# inner join result (tableA key, tableB key)
# 1 1
# 1 1
# 1 1
# 1 1
# 1 1
# 1 1
# 2 2
# 3 3

In [0]:
# Employee rows. emp_dept_id is stored as a string; department "50" (emp 6)
# has no counterpart in the department table defined below.
empColumns = [
    "emp_id", "name", "superior_emp_id", "year_joined",
    "emp_dept_id", "gender", "salary",
]
emp = [
    (1, "Smith", -1, "2018", "10", "M", 3000),
    (2, "Rose", 1, "2010", "20", "M", 4000),
    (3, "Williams", 1, "2010", "10", "M", 1000),
    (4, "Jones", 2, "2005", "10", "F", 2000),
    (5, "Brown", 2, "2010", "40", "", -1),
    (6, "Brown", 2, "2010", "50", "", -1),
]
empDF = spark.createDataFrame(emp, empColumns)

# Department rows. dept_id is an integer; department 30 ("Sales") has no
# employee in the rows above.
deptColumns = ["dept_name", "dept_id"]
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptDF = spark.createDataFrame(dept, deptColumns)

In [0]:
# Show the employee table.
empDF.display()

In [0]:
# Show the department table.
display(deptDF)

In [0]:
# Left join on a shared column name: the employee side renames emp_dept_id to
# dept_id so the key can be passed by name and appears only once in the result.
dept_df = (
    empDF
    .select(
        "emp_id",
        empDF["name"].alias("emp_name"),
        empDF["emp_dept_id"].alias("dept_id"),
    )
    .join(deptDF, on="dept_id", how="left")
    .select("emp_id", "emp_name", "dept_name")
)

display(dept_df)

In [0]:
# Left join on an explicit condition: the key columns keep their different
# names (emp_dept_id vs dept_id) and are compared with an equality expression.
dept_df = (
    empDF
    .select(
        "emp_id",
        empDF["name"].alias("employee_name"),
        "emp_dept_id",
        "salary",
    )
    .join(deptDF, empDF.emp_dept_id == deptDF["dept_id"], how="left")
    .select(
        "employee_name",
        deptDF["dept_name"].alias("department_name"),
        "salary",
    )
)

display(dept_df)

In [0]:
# Re-display the employee/salary frame before it is reshaped for the union demo.
display(df)

In [0]:
# Replace salary with a string-typed version of itself.
df = df.withColumn("salary", df["salary"].cast(T.StringType()))

In [0]:
# Rename the department column to "dept".
df = df.withColumnRenamed(existing="department", new="dept")

union | unionByName

In [0]:
# union() lines columns up by position, not by name, so the differing column
# names on the two sides (dept vs department_name) do not matter here.
union_df = df.union(dept_df)
display(union_df)

In [0]:
# Rebuild dept_df, this time also keeping the department id (as dept_id) so the
# frame has one more column than df for the unionByName demo below.
dept_df = (
    empDF
    .select(
        "emp_id",
        empDF["name"].alias("employee_name"),
        "emp_dept_id",
        "salary",
    )
    .join(deptDF, empDF.emp_dept_id == deptDF["dept_id"], how="left")
    .select(
        "employee_name",
        F.col("emp_dept_id").alias("dept_id"),
        F.col("dept_name").alias("department_name"),
        "salary",
    )
)

display(dept_df)

In [0]:
# Print df's column names and types (salary was cast to string and
# department renamed to dept in the cells above).
df.printSchema()

In [0]:
# Rename dept to department_name so it matches the column name used in
# dept_df, then print the schema to confirm the rename.
df = df.withColumnRenamed(existing="dept", new="department_name")
df.printSchema()

In [0]:
# Print dept_df's column names and types for comparison with df.
dept_df.printSchema()

In [0]:
# unionByName() matches columns by name; allowMissingColumns=True lets the two
# frames have different column sets (dept_df carries an extra dept_id).
union_df = df.unionByName(dept_df, allowMissingColumns=True)
display(union_df)

In [0]:
# Positional-union pitfall: union() pairs columns by position, not by name.
# df is deliberately selected in a different column order than dept_df, so the
# result runs but mixes values (e.g. df's salary sits under employee_name).
#
# Two fixes so this cell runs top-to-bottom:
#   * df's column was renamed from "dept" to "department_name" in an earlier
#     cell, so the stale name "dept" can no longer be selected;
#   * dept_df now also carries dept_id, and union() needs the same number of
#     columns on both sides, so dept_df is narrowed to three columns.
union_df = (
    df.select("salary", "employee_name", "department_name")
    .union(dept_df.select("employee_name", "department_name", "salary"))
)
union_df.display()