In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *

In [0]:
# Load the employee CSV into a DataFrame, treating the first line as the
# header and letting Spark infer each column's type from the data.
# NOTE(review): `spark` is not created anywhere in the visible notebook --
# presumably it is supplied by the runtime (e.g. Databricks); confirm.
emp_df = (
    spark.read
    .options(header="True", inferSchema="True")
    .csv("/FileStore/tables/Emp_pysp_details-1.csv")
)

# Quick look at the rows and at the inferred schema.
emp_df.show()
emp_df.printSchema()

# Expose the DataFrame to the SQL cells under the name `employee_table`.
emp_df.createOrReplaceTempView("employee_table")

# ALIASING

In [0]:
# Project three columns, exposing `id` under the alias `employee_id`.
# The three arguments deliberately show three ways to reference a column:
# `col(...)`, a plain name string, and indexing the DataFrame.
(
    emp_df
    .select(
        col("id").alias("employee_id"),
        "name",
        emp_df["age"],
    )
    .show()
)

# FILTERING

In [0]:
# Keep only employees whose salary is above 1.5 lakh (150,000).
(
    emp_df
    .filter(col("salary") > 150_000)
    .show()
)

In [0]:
# Keep employees with salary above 1.5 lakh (150,000) AND age below 18.
# Each comparison is parenthesised because `&` binds tighter than `>`/`<`.
(
    emp_df
    .filter(
        (col("salary") > 150_000)
        & (col("age") < 18)
    )
    .show()
)

# WHERE

In [0]:
# Same salary condition (above 1.5 lakh / 150,000), written with `where`.
(
    emp_df
    .where(col("salary") > 150_000)
    .show()
)

# LITERAL

In [0]:
# Select every existing column plus a constant column `last_name`
# whose value is the literal "kumar" on every row.
(
    emp_df
    .select(
        "*",
        lit("kumar").alias("last_name"),
    )
    .show()
)

# ADDING COLUMNS

In [0]:
# Add a constant column `Surname` holding the literal "Singh".
# `emp_df` itself is not reassigned; only the displayed result has the column.
(
    emp_df
    .withColumn("Surname", lit("Singh"))
    .show()
)

# RENAME COLUMN

In [0]:
# Show the data with column `id` renamed to `employee_id`.
# `emp_df` itself is not reassigned.
(
    emp_df
    .withColumnRenamed("id", "employee_id")
    .show()
)

# CASTING DATATYPE

In [0]:
# Print the current schema of `emp_df` as a baseline before casting columns.
emp_df.printSchema()

In [0]:
# Cast `id` to string and `salary` to long, then print the resulting schema.
# Each `withColumn` reuses the existing column name, so the column is
# replaced in the result rather than added. `emp_df` is not reassigned.
(
    emp_df
    .withColumn("id", col("id").cast(StringType()))
    .withColumn("salary", col("salary").cast(LongType()))
    .printSchema()
)

# REMOVE COLUMNS

In [0]:
# Show the data without the `id` column. `emp_df` is not reassigned.
(
    emp_df
    .drop(col("id"))
    .show()
)

# Spark SQL

In [0]:
%sql
-- Project four columns from the temp view, exposing `id` as `emp_id`.
SELECT
  id AS emp_id,
  name,
  age,
  salary
FROM employee_table

In [0]:
%sql
-- Employees with salary above 150000 and age below 18.
SELECT *
FROM employee_table
WHERE salary > 150000
  AND age < 18

In [0]:
%sql
-- Add a constant `SurName` column and a `Full_Name` column built from
-- `name`, a space, and the same surname.
-- The surname literal is repeated inside concat() instead of referring to the
-- `SurName` alias: an alias defined in the same SELECT list (a lateral column
-- alias) only resolves on newer Spark versions. Single quotes are used so the
-- value is always read as a string literal, never as an identifier.
SELECT
  *,
  'Singh' AS SurName,
  concat(name, ' ', 'Singh') AS Full_Name
FROM employee_table

In [0]:
%sql
-- Return `id` converted to a string for every row of the temp view.
SELECT
  CAST(id AS STRING)
FROM employee_table