In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
employee_df = spark.read.format("csv")\
                    .option("header","true")\
                    .option("nferschema","true")\
                    .option("mode","PERMISSIVE")\
                    .load("/FileStore/tables/employee_data-6.csv")
employee_df.show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  1|  Manish| 26| 75000|  INDIA|     m|
|  2|  Nikita| 23|100000|    USA|     f|
|  3|  Pritam| 22|150000|  INDIA|     m|
|  4|Prantosh| 17|200000|  JAPAN|     m|
|  5|  Vikash| 31|300000|    USA|     m|
|  6|   Rahul| 55|300000|  INDIA|     m|
|  7|    Raju| 67|540000|    USA|     m|
|  8| Praveen| 28| 70000|  JAPAN|     m|
|  9|     Dev| 32|150000|  JAPAN|     m|
| 10|  Sherin| 16| 25000| RUSSIA|     f|
| 11|    Ragu| 12| 35000|  INDIA|     f|
| 12|   Sweta| 43|200000|  INDIA|     f|
| 13| Raushan| 48|650000|    USA|     m|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|
| 15| Prakash| 52|750000|  INDIA|     m|
+---+--------+---+------+-------+------+



In [0]:
employee_df.printSchema

Out[7]: <bound method DataFrame.printSchema of DataFrame[id: string, name: string, age: string, salary: string, address: string, gender: string]>

In [0]:
#Creatind Temporary view as temporry view is required in Spark SQL
employee_df.createOrReplaceTempView("employee_tbl")

In [0]:
#Aliasing employee_id in place of id
employee_df.select(col("id").alias("employee_id"),"name","age").show()

+-----------+--------+---+
|employee_id|    name|age|
+-----------+--------+---+
|          1|  Manish| 26|
|          2|  Nikita| 23|
|          3|  Pritam| 22|
|          4|Prantosh| 17|
|          5|  Vikash| 31|
|          6|   Rahul| 55|
|          7|    Raju| 67|
|          8| Praveen| 28|
|          9|     Dev| 32|
|         10|  Sherin| 16|
|         11|    Ragu| 12|
|         12|   Sweta| 43|
|         13| Raushan| 48|
|         14|  Mukesh| 36|
|         15| Prakash| 52|
+-----------+--------+---+



In [0]:
#Filter salary > 1.5 lakhs, can also use where in the same way
employee_df.filter(col("salary")> 150000).show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  4|Prantosh| 17|200000|  JAPAN|     m|
|  5|  Vikash| 31|300000|    USA|     m|
|  6|   Rahul| 55|300000|  INDIA|     m|
|  7|    Raju| 67|540000|    USA|     m|
| 12|   Sweta| 43|200000|  INDIA|     f|
| 13| Raushan| 48|650000|    USA|     m|
| 15| Prakash| 52|750000|  INDIA|     m|
+---+--------+---+------+-------+------+



In [0]:
#Using Literal to show Last_name 
employee_df.select("*", lit("Jere").alias("Last_name")).show()

+---+--------+---+------+-------+------+---------+
| id|    name|age|salary|address|gender|Last_name|
+---+--------+---+------+-------+------+---------+
|  1|  Manish| 26| 75000|  INDIA|     m|     Jere|
|  2|  Nikita| 23|100000|    USA|     f|     Jere|
|  3|  Pritam| 22|150000|  INDIA|     m|     Jere|
|  4|Prantosh| 17|200000|  JAPAN|     m|     Jere|
|  5|  Vikash| 31|300000|    USA|     m|     Jere|
|  6|   Rahul| 55|300000|  INDIA|     m|     Jere|
|  7|    Raju| 67|540000|    USA|     m|     Jere|
|  8| Praveen| 28| 70000|  JAPAN|     m|     Jere|
|  9|     Dev| 32|150000|  JAPAN|     m|     Jere|
| 10|  Sherin| 16| 25000| RUSSIA|     f|     Jere|
| 11|    Ragu| 12| 35000|  INDIA|     f|     Jere|
| 12|   Sweta| 43|200000|  INDIA|     f|     Jere|
| 13| Raushan| 48|650000|    USA|     m|     Jere|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|     Jere|
| 15| Prakash| 52|750000|  INDIA|     m|     Jere|
+---+--------+---+------+-------+------+---------+



In [0]:
#Using withColumn to create a new column sur_name, It creates new if not present otherwise replaces it.
employee_df.withColumn("sur_name",lit("Messi")).show()

+---+--------+---+------+-------+------+--------+
| id|    name|age|salary|address|gender|sur_name|
+---+--------+---+------+-------+------+--------+
|  1|  Manish| 26| 75000|  INDIA|     m|   Messi|
|  2|  Nikita| 23|100000|    USA|     f|   Messi|
|  3|  Pritam| 22|150000|  INDIA|     m|   Messi|
|  4|Prantosh| 17|200000|  JAPAN|     m|   Messi|
|  5|  Vikash| 31|300000|    USA|     m|   Messi|
|  6|   Rahul| 55|300000|  INDIA|     m|   Messi|
|  7|    Raju| 67|540000|    USA|     m|   Messi|
|  8| Praveen| 28| 70000|  JAPAN|     m|   Messi|
|  9|     Dev| 32|150000|  JAPAN|     m|   Messi|
| 10|  Sherin| 16| 25000| RUSSIA|     f|   Messi|
| 11|    Ragu| 12| 35000|  INDIA|     f|   Messi|
| 12|   Sweta| 43|200000|  INDIA|     f|   Messi|
| 13| Raushan| 48|650000|    USA|     m|   Messi|
| 14|  Mukesh| 36| 95000| RUSSIA|     m|   Messi|
| 15| Prakash| 52|750000|  INDIA|     m|   Messi|
+---+--------+---+------+-------+------+--------+



In [0]:
employee_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- address: string (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
#Changing data type of int to string using withColumn
employee_df.withColumn("id",col("id").cast("string")).printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- address: string (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
#Removing columns
employee_df.drop(col("id"),"name").show()

+---+------+-------+------+
|age|salary|address|gender|
+---+------+-------+------+
| 26| 75000|  INDIA|     m|
| 23|100000|    USA|     f|
| 22|150000|  INDIA|     m|
| 17|200000|  JAPAN|     m|
| 31|300000|    USA|     m|
| 55|300000|  INDIA|     m|
| 67|540000|    USA|     m|
| 28| 70000|  JAPAN|     m|
| 32|150000|  JAPAN|     m|
| 16| 25000| RUSSIA|     f|
| 12| 35000|  INDIA|     f|
| 43|200000|  INDIA|     f|
| 48|650000|    USA|     m|
| 36| 95000| RUSSIA|     m|
| 52|750000|  INDIA|     m|
+---+------+-------+------+



Spark SQL

In [0]:
spark.sql("""
    select* from employee_tbl where salary > 150000 and age < 18      
          
          
          """).show()

+---+--------+---+------+-------+------+
| id|    name|age|salary|address|gender|
+---+--------+---+------+-------+------+
|  4|Prantosh| 17|200000|  JAPAN|     m|
+---+--------+---+------+-------+------+

