In [0]:
# Load the employee CSV into a DataFrame: the first row supplies the column
# names, column types are inferred from the data, and the parser runs in
# PERMISSIVE mode.
employee_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("mode", "PERMISSIVE")
    .load("/FileStore/tables/employee_file.csv")
)

In [0]:
# Print the loaded rows as a text table to check the read.
employee_df.show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
# Print each column's name, inferred type and nullability.
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
# Select a single column by passing its name as a plain string.
employee_df.select("name").show()

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Same selection as the string form, this time referencing the column via col().
employee_df.select(col("name")).show()


+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



In [0]:
# Column objects support arithmetic: add 5 to every id.
# The unaliased result column is auto-named "(id + 5)".
employee_df.select(col("id") +5 ).show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



In [0]:
# Select several columns at once, each referenced through col().
employee_df.select(col("id"),col("name"),col("age")).show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



In [0]:
# Four interchangeable ways to reference a column, mixed in one select():
# name string, col(), df["..."] indexing, and df.<attribute> access.
employee_df.select("id",col("name"),employee_df["salary"],employee_df.address).show()

+---+--------+------+------------+
| id|    name|salary|     address|
+---+--------+------+------------+
|  1|  Manish| 75000|       bihar|
|  2|  Nikita|100000|uttarpradesh|
|  3|  Pritam|150000|   Bangalore|
|  4|Prantosh|200000|     Kolkata|
|  5|  Vikash|300000|        null|
+---+--------+------+------------+



Expressions: SQL-style column expressions with expr()

In [0]:
# expr() takes a SQL expression string; this produces the same "(id + 5)"
# column as the col("id") + 5 form.
employee_df.select(expr("id +5")).show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



In [0]:
# expr() also accepts SQL aliasing ("... as ...") and SQL function calls.
# concat() yields null when any input is null (see the row with a null address).
# NOTE(review): the alias "emploee_name" looks like a typo for "employee_name";
# it is left unchanged here so the cell still matches its recorded output.
employee_df.select(expr("id as employee_id"),expr("name as emploee_name"),expr("concat(name,address)")).show()

+-----------+------------+---------------------+
|employee_id|emploee_name|concat(name, address)|
+-----------+------------+---------------------+
|          1|      Manish|          Manishbihar|
|          2|      Nikita|   Nikitauttarpradesh|
|          3|      Pritam|      PritamBangalore|
|          4|    Prantosh|      PrantoshKolkata|
|          5|      Vikash|                 null|
+-----------+------------+---------------------+



Spark SQL

In [0]:
# Register the DataFrame as a temporary view so spark.sql() can query it by name.
employee_df.createOrReplaceTempView("employee_tbl")

In [0]:
# Query the temp view with SQL: return every column of every row.
spark.sql("""

    select * from employee_tbl      
          
          """).show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
# DataFrame API equivalent of "select *".
employee_df.select("*").show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [0]:
# SQL projection of three columns from the temp view.
spark.sql("""
          select id,name,age from employee_tbl
          """).show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



Aliasing the id column

In [0]:
# Column.alias() renames id to employee_id in the result; name and age are
# passed through unchanged by name.
employee_df.select(col("id").alias("employee_id"),"name","age").show()

+-----------+--------+---+
|employee_id|    name|age|
+-----------+--------+---+
|          1|  Manish| 26|
|          2|  Nikita| 23|
|          3|  Pritam| 22|
|          4|Prantosh| 17|
|          5|  Vikash| 31|
+-----------+--------+---+



Filter / Where: find the employees whose salary is greater than 1.5 lakh (150,000)

In [0]:
# Keep only the rows whose salary is strictly greater than 150000.
employee_df.filter(col("salary")>150000).show()

+---+--------+---+------+-------+--------+
| id|    name|age|salary|address| nominee|
+---+--------+---+------+-------+--------+
|  4|Prantosh| 17|200000|Kolkata|   India|
|  5|  Vikash| 31|300000|   null|nominee5|
+---+--------+---+------+-------+--------+



In [0]:
# where() is an alias of filter(); this returns the same rows as the filter() form.
employee_df.where(col("salary")>150000).show()

+---+--------+---+------+-------+--------+
| id|    name|age|salary|address| nominee|
+---+--------+---+------+-------+--------+
|  4|Prantosh| 17|200000|Kolkata|   India|
|  5|  Vikash| 31|300000|   null|nominee5|
+---+--------+---+------+-------+--------+



In [0]:
# Combine two conditions with & (logical AND on Columns). Each comparison is
# wrapped in parentheses because & binds more tightly than > and < in Python.
employee_df.filter((col("salary")>150000) & (col("age")<18)).show()

+---+--------+---+------+-------+-------+
| id|    name|age|salary|address|nominee|
+---+--------+---+------+-------+-------+
|  4|Prantosh| 17|200000|Kolkata|  India|
+---+--------+---+------+-------+-------+



In [0]:
# Keep every existing column ("*") and add a constant last_name column via lit().
employee_df.select("*", lit("kumar").alias("last_name")).show()

+---+--------+---+------+------------+--------+---------+
| id|    name|age|salary|     address| nominee|last_name|
+---+--------+---+------+------------+--------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    kumar|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    kumar|
|  3|  Pritam| 22|150000|   Bangalore|   India|    kumar|
|  4|Prantosh| 17|200000|     Kolkata|   India|    kumar|
|  5|  Vikash| 31|300000|        null|nominee5|    kumar|
+---+--------+---+------+------------+--------+---------+



In [0]:
# withColumn() appends a column named surname; lit() supplies the same
# constant value for every row.
employee_df.withColumn("surname",lit("shinde")).show()

+---+--------+---+------+------------+--------+-------+
| id|    name|age|salary|     address| nominee|surname|
+---+--------+---+------+------------+--------+-------+
|  1|  Manish| 26| 75000|       bihar|nominee1| shinde|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2| shinde|
|  3|  Pritam| 22|150000|   Bangalore|   India| shinde|
|  4|Prantosh| 17|200000|     Kolkata|   India| shinde|
|  5|  Vikash| 31|300000|        null|nominee5| shinde|
+---+--------+---+------+------------+--------+-------+



In [0]:
# Rename id to emp_id in the returned DataFrame. The result is not assigned,
# so employee_df itself keeps its original column name.
employee_df.withColumnRenamed("id","emp_id").show()

+------+--------+---+------+------------+--------+
|emp_id|    name|age|salary|     address| nominee|
+------+--------+---+------+------------+--------+
|     1|  Manish| 26| 75000|       bihar|nominee1|
|     2|  Nikita| 23|100000|uttarpradesh|nominee2|
|     3|  Pritam| 22|150000|   Bangalore|   India|
|     4|Prantosh| 17|200000|     Kolkata|   India|
|     5|  Vikash| 31|300000|        null|nominee5|
+------+--------+---+------+------------+--------+



In [0]:
# Re-check the schema of employee_df: the first column is still named id,
# because the earlier transformations returned new DataFrames.
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
# Cast id to string and salary to long, then print the schema of the result.
# Reusing an existing column name in withColumn() replaces that column.
(
    employee_df
    .withColumn("id", col("id").cast("string"))
    .withColumn("salary", col("salary").cast("long"))
    .printSchema()
)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
# Drop the id and name columns from the result.
# Both are passed as plain name strings: mixing a string with a Column object
# in a multi-column drop() raises TypeError on PySpark releases before 3.4.
employee_df.drop("id", "name").show()

+---+------+------------+--------+
|age|salary|     address| nominee|
+---+------+------------+--------+
| 26| 75000|       bihar|nominee1|
| 23|100000|uttarpradesh|nominee2|
| 22|150000|   Bangalore|   India|
| 17|200000|     Kolkata|   India|
| 31|300000|        null|nominee5|
+---+------+------------+--------+



Spark SQL

In [0]:
# SQL counterpart of the DataFrame filter: employees earning more than
# 1.5 lakh (150000) who are under 18, plus a constant last_name column.
# The string literal is single-quoted so it can never be parsed as an
# identifier when double-quoted identifiers are enabled.
spark.sql("""
   select *, 'kumar' as last_name from employee_tbl where salary > 150000 and age < 18
          """).show()

+---+--------+---+------+-------+-------+---------+
| id|    name|age|salary|address|nominee|last_name|
+---+--------+---+------+-------+-------+---------+
|  4|Prantosh| 17|200000|Kolkata|  India|    kumar|
+---+--------+---+------+-------+-------+---------+



In [0]:
# Same filter (salary above 1.5 lakh = 150000, age under 18), adding full_name.
# full_name reuses the last_name alias defined earlier in the same SELECT list
# (a lateral column alias), so this needs a runtime that resolves such aliases.
# The string literal is single-quoted so it is never parsed as an identifier.
spark.sql("""
   select *, 'kumar' as last_name, concat(name, last_name) as full_name from employee_tbl where salary > 150000 and age < 18
          """).show()

+---+--------+---+------+-------+-------+---------+-------------+
| id|    name|age|salary|address|nominee|last_name|    full_name|
+---+--------+---+------+-------+-------+---------+-------------+
|  4|Prantosh| 17|200000|Kolkata|  India|    kumar|Prantoshkumar|
+---+--------+---+------+-------+-------+---------+-------------+



In [0]:
# Same query (salary above 1.5 lakh = 150000, age under 18) with an extra
# cast of id to string; only the resulting schema is printed.
# NOTE(review): the cast is unaliased, so the result carries two columns named
# id (the integer one from "*" and the string one from the cast). Add an alias
# such as "as id_str" if the result is ever selected by column name.
spark.sql("""
   select *, 'kumar' as last_name, concat(name, last_name) as full_name, cast(id as string) from employee_tbl where salary > 150000 and age < 18
          """).printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)
 |-- last_name: string (nullable = false)
 |-- full_name: string (nullable = true)
 |-- id: string (nullable = true)

