
# DataFrame Transformation

In [0]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession (created on first use, reused afterwards),
# requesting a local master with one worker thread per available core.
# NOTE(review): "Transfrom" in the app name looks like a typo of "Transform";
# it is a runtime label, so it is left unchanged here.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("DataFrame_Transfrom")
    .getOrCreate()
)

In [0]:
# Load the employee CSV: the first row supplies the column names, column types
# are inferred from the data, and PERMISSIVE parse mode is requested.
employee_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferschema", "true")
    .option("mode", "PERMISSIVE")
    .load("/FileStore/schnario/emp_data3.csv")
)
# Print the loaded rows as a text table.
employee_df.show()

+---+--------+---+------+-----------+-----------+
| id|    name|age|salary|    address|    nominee|
+---+--------+---+------+-----------+-----------+
|  1|  Manish| 26| 75000|      Bihar|   nominee1|
|  2|  Nikita| 23|100000|Maharashtra|   nominee2|
|  3|  Pritam| 22|150000|   banglore|      India|
|  4|Prantosh| 17|200000|    kolkata|      india|
|  5|  Vikash| 31|300000|       pune|Maharashtra|
|  6|   Rahul| 55|300000|       null|       null|
+---+--------+---+------+-----------+-----------+



In [0]:
from pyspark.sql.types import StructType, ArrayType, StringType, StructField, IntegerType


# Explicit schema for the employee data: six data columns plus a trailing
# "_corrupt_record" string column, all declared nullable.
# NOTE(review): "_corrupt_record" is presumably meant to receive unparseable
# rows under PERMISSIVE mode — confirm against the reader that applies this schema.
emp_data = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("salary", IntegerType()),
        ("address", StringType()),
        ("nominee", StringType()),
        ("_corrupt_record", StringType()),
    )
])

In [0]:
# Spark transformation: project a single column by passing its name as a
# plain string to select(), then print the result.

employee_df.select("id").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
+---+



In [0]:
from pyspark.sql.functions import col

# Same projection as selecting by name, but the column is passed as a
# Column object built with col() instead of a bare string.
(
    employee_df
    .select(col("id"))
    .show()
)

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
+---+



In [0]:
from pyspark.sql.functions import expr

# expr() takes a SQL expression string; here every id is shifted by 5.
# The alias reuses the name "id", so the output column header is still "id"
# even though the values are no longer the original ids.
(
    employee_df
    .select(expr("id + 5").alias("id"))
    .show()
)

+---+
| id|
+---+
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
+---+



In [0]:
# concat is used inside the SQL expression string given to expr(), so the
# Python-side pyspark.sql.functions.concat import is not required.
# The result column joins name and salary with no separator.
(
    employee_df
    .select(expr("concat(name,salary)").alias("Name+salary"))
    .show()
)

+--------------+
|   Name+salary|
+--------------+
|   Manish75000|
|  Nikita100000|
|  Pritam150000|
|Prantosh200000|
|  Vikash300000|
|   Rahul300000|
+--------------+

