## Transformations in Spark (DataFrame API and Spark SQL)

## DataFrame API

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Build (or reuse, via getOrCreate) the Spark session against the "local[5]" master.
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("DataFrameAPIandSQL")
    .getOrCreate()
)
# Leave the session as the cell's last expression so the notebook displays it.
spark

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/17 22:55:31 WARN Utils: Your hostname, Shrees-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.183.253.103 instead (on interface en0)
25/11/17 22:55:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/17 22:55:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load data.csv as CSV: the first line is treated as the header, column types
# are inferred from the data, and PERMISSIVE mode is requested for parsing.
data_df = (
    spark.read
    .format("csv")
    .options(header="true", inferschema="true", mode="PERMISSIVE")
    .load("data.csv")
)

In [4]:
# Print the loaded DataFrame as a text table to check the parsed rows.
data_df.show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        NULL| nominee|
+---+--------+---+------+------------+--------+



In [5]:
# Explicit schema matching the columns of data.csv, plus one extra string column
# meant to receive the raw text of malformed rows.
#
# In PERMISSIVE mode Spark writes a malformed record into the column named by the
# `columnNameOfCorruptRecord` option, whose default is "_corrupt_record". The
# field therefore has to carry that exact name (or the reader must set the
# option to match); a field called "corrupt_record" would always stay null.
#
# NOTE(review): no reader in the visible cells passes this schema via
# `.schema(data_schema)`; the load cell infers the schema instead.
data_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("salary", IntegerType(), True),
    StructField("address", StringType(), True),
    StructField("nominee", StringType(), True),
    StructField("_corrupt_record", StringType(), True),
])

In [6]:
# Print each column's name, data type and nullability for data_df.
data_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [8]:
# String method: the column is referenced by passing its name as a plain string.
data_df.select("name").show()

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



In [9]:
# Column method: the column is referenced through a col("...") Column object.
data_df.select(col("name")).show()

+--------+
|    name|
+--------+
|  Manish|
|  Nikita|
|  Pritam|
|Prantosh|
|  Vikash|
+--------+



In [10]:
# Project several columns at once, referenced by name.
data_df.select(["id", "name", "age"]).show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



In [11]:
# Same projection as the string form, built from col(...) Column objects.
data_df.select([col(name) for name in ("id", "name", "age")]).show()

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|  Manish| 26|
|  2|  Nikita| 23|
|  3|  Pritam| 22|
|  4|Prantosh| 17|
|  5|  Vikash| 31|
+---+--------+---+



In [12]:
# Evaluate a SQL expression per row; the result column is named after the expression.
data_df.selectExpr("id + 5").show()

+--------+
|(id + 5)|
+--------+
|       6|
|       7|
|       8|
|       9|
|      10|
+--------+



## Spark SQL

In [14]:
# Register data_df under the name "data_tbl" so spark.sql queries can reference it.
data_df.createOrReplaceTempView("data_tbl")

In [15]:
# Query the registered view with plain SQL and print every row and column.
spark.sql(
    """

Select * from data_tbl

"""
).show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        NULL| nominee|
+---+--------+---+------+------------+--------+



In [16]:
# Print data_df's schema again; it is the same as before the view was registered.
data_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



## Transformations in Spark (Spark DataFrame)

In [31]:
## 1. Aliasing - Alternate / Replace name of the existing column
# Only the projected output is renamed; data_df itself still has an "id" column.
data_df.select(
    col("id").alias("employee_id"),
    col("name"),
    col("age"),
).show()

+-----------+--------+---+
|employee_id|    name|age|
+-----------+--------+---+
|          1|  Manish| 26|
|          2|  Nikita| 23|
|          3|  Pritam| 22|
|          4|Prantosh| 17|
|          5|  Vikash| 31|
+-----------+--------+---+



In [33]:
## 2. Filter / Where
# where() is an alias of filter(); keep rows whose salary is strictly above 150000.
data_df.where(col("salary") > 150000).show()

+---+--------+---+------+-------+-------+
| id|    name|age|salary|address|nominee|
+---+--------+---+------+-------+-------+
|  4|Prantosh| 17|200000|Kolkata|  India|
|  5|  Vikash| 31|300000|   NULL|nominee|
+---+--------+---+------+-------+-------+



In [41]:
## 'and' / 'or'
# Column conditions are combined with & (and) / | (or). Each comparison needs its
# own parentheses because & and | bind tighter than > and < in Python.
data_df.filter(
    (col("salary") > 150000)
    & (col("age") < 18)
).show()

+---+--------+---+------+-------+-------+
| id|    name|age|salary|address|nominee|
+---+--------+---+------+-------+-------+
|  4|Prantosh| 17|200000|Kolkata|  India|
+---+--------+---+------+-------+-------+



In [47]:
## 3. Literal
# lit(...) turns a Python value into a constant column; every row gets "Kumar".
data_df.select(
    "*",
    lit("Kumar").alias("last_name"),
).show()

+---+--------+---+------+------------+--------+---------+
| id|    name|age|salary|     address| nominee|last_name|
+---+--------+---+------+------------+--------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1|    Kumar|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|    Kumar|
|  3|  Pritam| 22|150000|   Bangalore|   India|    Kumar|
|  4|Prantosh| 17|200000|     Kolkata|   India|    Kumar|
|  5|  Vikash| 31|300000|        NULL| nominee|    Kumar|
+---+--------+---+------+------------+--------+---------+



In [49]:
## 4. With Column
# Append a constant "surname" column to the displayed result; data_df is not reassigned.
(
    data_df
    .withColumn("surname", lit("Singh"))
    .show()
)

+---+--------+---+------+------------+--------+-------+
| id|    name|age|salary|     address| nominee|surname|
+---+--------+---+------+------------+--------+-------+
|  1|  Manish| 26| 75000|       bihar|nominee1|  Singh|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|  Singh|
|  3|  Pritam| 22|150000|   Bangalore|   India|  Singh|
|  4|Prantosh| 17|200000|     Kolkata|   India|  Singh|
|  5|  Vikash| 31|300000|        NULL| nominee|  Singh|
+---+--------+---+------+------------+--------+-------+



In [53]:
# Rename "id" to "employee_id" in the displayed result; data_df is not reassigned.
data_df.withColumnRenamed(existing="id", new="employee_id").show()

+-----------+--------+---+------+------------+--------+
|employee_id|    name|age|salary|     address| nominee|
+-----------+--------+---+------+------------+--------+
|          1|  Manish| 26| 75000|       bihar|nominee1|
|          2|  Nikita| 23|100000|uttarpradesh|nominee2|
|          3|  Pritam| 22|150000|   Bangalore|   India|
|          4|Prantosh| 17|200000|     Kolkata|   India|
|          5|  Vikash| 31|300000|        NULL| nominee|
+-----------+--------+---+------+------------+--------+



In [59]:
## 5. Casting data types
# NOTE(review): no .cast(...) is applied in this cell. It only binds the renamed
# frame to new_data_df and displays it.
new_data_df = (
    data_df
    .withColumnRenamed("id", "employee_id")
)
new_data_df.show()

+-----------+--------+---+------+------------+--------+
|employee_id|    name|age|salary|     address| nominee|
+-----------+--------+---+------+------------+--------+
|          1|  Manish| 26| 75000|       bihar|nominee1|
|          2|  Nikita| 23|100000|uttarpradesh|nominee2|
|          3|  Pritam| 22|150000|   Bangalore|   India|
|          4|Prantosh| 17|200000|     Kolkata|   India|
|          5|  Vikash| 31|300000|        NULL| nominee|
+-----------+--------+---+------+------------+--------+



## Transformations in Spark (Spark SQL)

In [65]:
# (Re-)register data_df as "data_tbl"; createOrReplace makes this safe to run
# even if a view with that name already exists.
data_df.createOrReplaceTempView("data_tbl")

# Print every row and column of the view.
spark.sql(
    """

Select * from data_tbl

"""
).show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        NULL| nominee|
+---+--------+---+------+------------+--------+



In [77]:
# SQL counterpart of the DataFrame filter: salary above 150000 AND age below 18.
spark.sql(
    """

select * from data_tbl where salary > 150000 and age < 18

"""
).show()

+---+--------+---+------+-------+-------+
| id|    name|age|salary|address|nominee|
+---+--------+---+------+-------+-------+
|  4|Prantosh| 17|200000|Kolkata|  India|
+---+--------+---+------+-------+-------+



In [81]:
# SQL counterpart of select(): project only id, name, salary and address.
spark.sql(
    """

select id, name, salary, address from data_tbl

"""
).show()

+---+--------+------+------------+
| id|    name|salary|     address|
+---+--------+------+------------+
|  1|  Manish| 75000|       bihar|
|  2|  Nikita|100000|uttarpradesh|
|  3|  Pritam|150000|   Bangalore|
|  4|Prantosh|200000|     Kolkata|
|  5|  Vikash|300000|        NULL|
+---+--------+------+------------+

