
# DataFrame transformation part 2


Topics Covered
* Aliasing
* filter/where
* literal
* adding column
* renaming column
* casting data type
* removing column

In [0]:
# Read the employee CSV: the first row is the header and column types are inferred.
employee_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("mode", "PERMISSIVE")
    .load("/FileStore/schnario/emp_data3.csv")
)
employee_df.show()

+---+--------+---+------+-----------+-----------+
| id|    name|age|salary|    address|    nominee|
+---+--------+---+------+-----------+-----------+
|  1|  Manish| 26| 75000|      Bihar|   nominee1|
|  2|  Nikita| 23|100000|Maharashtra|   nominee2|
|  3|  Pritam| 22|150000|   banglore|      India|
|  4|Prantosh| 17|200000|    kolkata|      india|
|  5|  Vikash| 31|300000|       pune|Maharashtra|
|  6|   Rahul| 55|300000|       null|       null|
+---+--------+---+------+-----------+-----------+



In [0]:
# Print each column's name, inferred data type and nullability for employee_df
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
# Aliasing: select the `id` column and expose it under the name `employee_id`
from pyspark.sql.functions import col

employee_id_col = col("id").alias("employee_id")
employee_df.select(employee_id_col).show()

+-----------+
|employee_id|
+-----------+
|          1|
|          2|
|          3|
|          4|
|          5|
|          6|
+-----------+



In [0]:
# Keep only the employees whose salary is greater than 100000,
# then display them ordered by salary (ascending).
# where() / orderBy() are aliases of filter() / sort().
df_3 = employee_df.where(col("salary") > 100000)

df_3.orderBy("salary").show()

+---+--------+---+------+--------+-----------+
| id|    name|age|salary| address|    nominee|
+---+--------+---+------+--------+-----------+
|  3|  Pritam| 22|150000|banglore|      India|
|  4|Prantosh| 17|200000| kolkata|      india|
|  5|  Vikash| 31|300000|    pune|Maharashtra|
|  6|   Rahul| 55|300000|    null|       null|
+---+--------+---+------+--------+-----------+



In [0]:
# From employee_df, show the employees whose salary is more than 100000 and whose age is above 25.
# "Above 25" is a strict comparison, so age is tested with > (not >=), matching the strict salary test.

employee_df.filter((col('salary') > 100000) & (col('age') > 25)).show()

+---+------+---+------+-------+-----------+
| id|  name|age|salary|address|    nominee|
+---+------+---+------+-------+-----------+
|  5|Vikash| 31|300000|   pune|Maharashtra|
|  6| Rahul| 55|300000|   null|       null|
+---+------+---+------+-------+-----------+



In [0]:
# lit() builds a constant-valued column; here every row gets company_name = "IBM"

from pyspark.sql.functions import lit

company_name_col = lit("IBM").alias("company_name")
employee_df.select("*", company_name_col).show()

+---+--------+---+------+-----------+-----------+------------+
| id|    name|age|salary|    address|    nominee|company_name|
+---+--------+---+------+-----------+-----------+------------+
|  1|  Manish| 26| 75000|      Bihar|   nominee1|         IBM|
|  2|  Nikita| 23|100000|Maharashtra|   nominee2|         IBM|
|  3|  Pritam| 22|150000|   banglore|      India|         IBM|
|  4|Prantosh| 17|200000|    kolkata|      india|         IBM|
|  5|  Vikash| 31|300000|       pune|Maharashtra|         IBM|
|  6|   Rahul| 55|300000|       null|       null|         IBM|
+---+--------+---+------+-----------+-----------+------------+



In [0]:
# Rename the `id` column to `employee_id`; all other columns are left as they are
employee_df.withColumnRenamed(existing="id", new="employee_id").show()

+-----------+--------+---+------+-----------+-----------+
|employee_id|    name|age|salary|    address|    nominee|
+-----------+--------+---+------+-----------+-----------+
|          1|  Manish| 26| 75000|      Bihar|   nominee1|
|          2|  Nikita| 23|100000|Maharashtra|   nominee2|
|          3|  Pritam| 22|150000|   banglore|      India|
|          4|Prantosh| 17|200000|    kolkata|      india|
|          5|  Vikash| 31|300000|       pune|Maharashtra|
|          6|   Rahul| 55|300000|       null|       null|
+-----------+--------+---+------+-----------+-----------+



In [0]:
# Cast the `id` column from integer to string and the `salary` column from
# integer to long, then print the resulting schema to confirm the new types.
(
    employee_df
    .withColumn("id", col("id").cast("string"))
    .withColumn("salary", col("salary").cast("long"))
    .printSchema()
)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [0]:
# Remove the `address` and `nominee` columns and display what remains
columns_to_drop = ("address", "nominee")
employee_df.drop(*columns_to_drop).show()

+---+--------+---+------+
| id|    name|age|salary|
+---+--------+---+------+
|  1|  Manish| 26| 75000|
|  2|  Nikita| 23|100000|
|  3|  Pritam| 22|150000|
|  4|Prantosh| 17|200000|
|  5|  Vikash| 31|300000|
|  6|   Rahul| 55|300000|
+---+--------+---+------+



In [0]:
# Register employee_df as a temporary view named "data" so it can be queried with SQL

employee_df.createOrReplaceTempView("data")

In [0]:
%sql
-- Employees older than 50, with a constant 'IBM' column appended to every row.
-- The string literal uses single quotes: double quotes denote identifiers in standard SQL.
-- NOTE(review): the alias last_name holds a company name; consider renaming it (this changes the output header).

select *, 'IBM' as last_name from data
where age > 50;

id,name,age,salary,address,nominee,last_name
6,Rahul,55,300000,,,IBM
