In [0]:
# 1. Simple method: build a DataFrame from in-memory rows and a plain list of
#    column names. Only names are supplied here, so the column types are left
#    for Spark to infer from the values.

columns = ["emp_id", "emp_name", "salary"]

# One tuple per employee; field order matches `columns`.
data = [
    (100, "Sandeep", 20000),
    (101, "Deepak", 30000),
    (102, "Vishnu", 14000),
    (103, "Farooq", 40000),
    (104, "Gireesh", 34000),
]

df = spark.createDataFrame(data, schema=columns)

# Print the rows as a plain-text table.
df.show()

In [0]:
# Render `df` through the notebook's `display` helper.
# NOTE(review): `display` is not defined in this notebook — presumably the
# Databricks built-in; confirm before running elsewhere.
display(df)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [0]:
# 2. Explicit schema: declare every column's name, type and nullability up
#    front rather than passing only column names.

schema1 = T.StructType(
    [
        T.StructField("emp_id", T.IntegerType(), nullable=False),  # must be present
        T.StructField("emp_name", T.StringType(), nullable=True),
        T.StructField("salary", T.IntegerType(), nullable=True),
    ]
)

# One tuple per employee; field order matches the fields of `schema1`.
data1 = [
    (100, "Sandeep", 20000),
    (101, "Deepak", 30000),
    (102, "Vishnu", 14000),
    (103, "Farooq", 40000),
    (104, "Gireesh", 34000),
]

# Note: this rebinds `df`, so the cells below operate on the explicitly
# typed DataFrame.
df = spark.createDataFrame(data1, schema1)
display(df)

Select Statements

In [0]:
# Select, method 1: refer to the columns by their names as plain strings.
display(df.select("emp_name", "salary"))

In [0]:
# Select, method 2: refer to the columns as attributes of the DataFrame.
display(df.select(df.emp_name, df.salary))

In [0]:
# Select, method 3: look the columns up on the DataFrame with bracket indexing.
display(df.select(df["emp_name"], df["salary"]))

In [0]:
# Select, method 4: build column expressions with F.col(), which names the
# columns without going through the `df` variable.
display(df.select(F.col("emp_name"), F.col("salary")))

Filter/Where

In [0]:
# Keep only the rows whose salary is at least 30000, using .filter().
df_filtered = df.filter(F.col("salary") >= 30000)
df_filtered.display()

In [0]:
# Same salary >= 30000 predicate, this time written with .where()
# (an alias of .filter() in the PySpark DataFrame API).
df_filtered = df.where(F.col("salary") >= 30000)
df_filtered.display()

withColumn | withColumnRenamed

In [0]:
# Add a constant boolean column "is_active" (True for every row).
df_salary = df.withColumn("is_active", F.lit(True))
df_salary.display()

In [0]:
# Rename the "salary" column to "emp_salary"; the result is bound to
# `df_salary`, while `df` itself is not reassigned.
df_salary = df.withColumnRenamed("salary", "emp_salary")
df_salary.display()

In [0]:
# Derive several columns in a single chain; a later step may use a column
# introduced by an earlier one.
# NOTE(review): "TotalSalary" is camel-case while the other column names are
# snake_case; left unchanged here because other code may reference it.
df_salary = (
    df.withColumn("is_active", F.lit(True))  # constant flag
    .withColumn("bonus", F.col("salary") * 0.1)  # 10% of salary
    .withColumn("TotalSalary", F.col("salary") + F.col("bonus"))  # uses "bonus" from the step above
)
df_salary.display()

In [0]:
# Show the column names of `df_salary` as the cell output.
df_salary.columns

In [0]:
# Show the schema object of `df_salary` as the cell output.
df_salary.schema

In [0]:
# Print the schema of `df_salary` in a human-readable form.
df_salary.printSchema()

In [0]:
# Compute summary statistics for `df_salary` with describe() and render the
# resulting DataFrame.
df_salary.describe().display()