### StructType and StructField

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType ,ArrayType


spark = SparkSession.builder.getOrCreate()

In [3]:
data = [(1,"ranj",50000) ,(2,"bobby",80000)]

schema = StructType([\
                     StructField(name="ID" , dataType= IntegerType()),\
                     StructField(name="Name" , dataType= StringType()),\
                     StructField(name="Salary" , dataType= IntegerType())
                    ])

df = spark.createDataFrame(data,schema= schema)

df.show()


  from pandas.core import (


+---+-----+------+
| ID| Name|Salary|
+---+-----+------+
|  1| ranj| 50000|
|  2|bobby| 80000|
+---+-----+------+



In [4]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: integer (nullable = true)



### Nested Structure or Complex structure

In [5]:
 

data = [(1,('ranjitha' , 'M'),50000) ,(2,('Bhavana','N'),80000)]

struct = StructType([\
                    StructField(name="FirstName", dataType=StringType()),\
                    StructField(name="LastName" , dataType=StringType())
                    ])

schema = StructType([\
                     StructField(name="ID" , dataType= IntegerType()),\
                     StructField(name="Name" , dataType= struct),\
                     StructField(name="Salary" , dataType= IntegerType())
                    ])


df = spark.createDataFrame(data,schema= schema)

#display(df)
df.show()
df.printSchema()

+---+-------------+------+
| ID|         Name|Salary|
+---+-------------+------+
|  1|{ranjitha, M}| 50000|
|  2| {Bhavana, N}| 80000|
+---+-------------+------+

root
 |-- ID: integer (nullable = true)
 |-- Name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- Salary: integer (nullable = true)



### ArrayType Column (maptype)

In [6]:


data = [('abc',[1,2]), ('xyz',[4,7]), ('pqr',[2,6])]

schema = StructType ([\
                     StructField (name="ID" , dataType=StringType()),\
                        StructField (name="numbers" , dataType=ArrayType(IntegerType()))
                     ])

df = spark.createDataFrame(data , schema= schema)

df.show()
df.printSchema()

+---+-------+
| ID|numbers|
+---+-------+
|abc| [1, 2]|
|xyz| [4, 7]|
|pqr| [2, 6]|
+---+-------+

root
 |-- ID: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)



### Accessing the Array elements


In [7]:

df.withColumn("first_number" , df.numbers[0]).show()

df.withColumn("second_number" , df.numbers[1]).show()

+---+-------+------------+
| ID|numbers|first_number|
+---+-------+------------+
|abc| [1, 2]|           1|
|xyz| [4, 7]|           4|
|pqr| [2, 6]|           2|
+---+-------+------------+

+---+-------+-------------+
| ID|numbers|second_number|
+---+-------+-------------+
|abc| [1, 2]|            2|
|xyz| [4, 7]|            7|
|pqr| [2, 6]|            6|
+---+-------+-------------+



In [8]:
data = [(1,2),(3,4)]
schema = ("1st_num","2nd_num")

df = spark.createDataFrame(data, schema)

df.show()

+-------+-------+
|1st_num|2nd_num|
+-------+-------+
|      1|      2|
|      3|      4|
+-------+-------+



### Combines the columns to array

In [9]:

from pyspark.sql.functions import col,array

data = [(1,2),(3,4)]
schema = ("1st_num","2nd_num")

df = spark.createDataFrame(data, schema)
df1 = df.withColumn('numbers',array(col("1st_num"),(col("2nd_num"))))
df1.show()
df1.printSchema()

+-------+-------+-------+
|1st_num|2nd_num|numbers|
+-------+-------+-------+
|      1|      2| [1, 2]|
|      3|      4| [3, 4]|
+-------+-------+-------+

root
 |-- 1st_num: long (nullable = true)
 |-- 2nd_num: long (nullable = true)
 |-- numbers: array (nullable = false)
 |    |-- element: long (containsNull = true)



### EXPLODE function

In [10]:

from pyspark.sql.functions import col,explode

data1 = [(1,'bob',['python','java']),(2,'raj',['aws','dotnet'])]

schema = ['ID','Names','Skills']

df = spark.createDataFrame(data1,schema)
df.show()
df.printSchema()

df1 = df.withColumn('SKILL',explode(col('Skills')))
df1 .show()




+---+-----+--------------+
| ID|Names|        Skills|
+---+-----+--------------+
|  1|  bob|[python, java]|
|  2|  raj| [aws, dotnet]|
+---+-----+--------------+

root
 |-- ID: long (nullable = true)
 |-- Names: string (nullable = true)
 |-- Skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---+-----+--------------+------+
| ID|Names|        Skills| SKILL|
+---+-----+--------------+------+
|  1|  bob|[python, java]|python|
|  1|  bob|[python, java]|  java|
|  2|  raj| [aws, dotnet]|   aws|
|  2|  raj| [aws, dotnet]|dotnet|
+---+-----+--------------+------+



### array_contain function

In [11]:


from pyspark.sql.functions import col,array,array_contains

data1 = [(1,'bob',['python','java']),(2,'raj',['aws','dotnet'])]

schema = ['ID','Names','Skills']

df = spark.createDataFrame(data1,schema)

df = df.withColumn("hasJavaSkill",array_contains(col('Skills'),'java'))
df.show()
df.printSchema()



+---+-----+--------------+------------+
| ID|Names|        Skills|hasJavaSkill|
+---+-----+--------------+------------+
|  1|  bob|[python, java]|        true|
|  2|  raj| [aws, dotnet]|       false|
+---+-----+--------------+------------+

root
 |-- ID: long (nullable = true)
 |-- Names: string (nullable = true)
 |-- Skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hasJavaSkill: boolean (nullable = true)



### pivot function

In [12]:


data = [
    ("Alice", "male","Sales", 5000),\
    ("Bob","female", "Sales", 4000),\
    ("Cathy","female", "HR", 3500),\
    ("David", "male","HR", 4500),\
    ("Eve", "male", "IT", 6000),\
    ("Frank", "male","IT", 7000),
]

columns = ["Name", "Gender" ,"Department", "Salary"]

df = spark.createDataFrame(data , schema=columns)
df.show()

df.groupBy("Department","Gender").count().show()

df.groupBy("Department").pivot("Gender").count().show()

df.groupBy("Department").pivot("Gender",['male']).count().show()

+-----+------+----------+------+
| Name|Gender|Department|Salary|
+-----+------+----------+------+
|Alice|  male|     Sales|  5000|
|  Bob|female|     Sales|  4000|
|Cathy|female|        HR|  3500|
|David|  male|        HR|  4500|
|  Eve|  male|        IT|  6000|
|Frank|  male|        IT|  7000|
+-----+------+----------+------+

+----------+------+-----+
|Department|Gender|count|
+----------+------+-----+
|     Sales|  male|    1|
|     Sales|female|    1|
|        HR|female|    1|
|        HR|  male|    1|
|        IT|  male|    2|
+----------+------+-----+

+----------+------+----+
|Department|female|male|
+----------+------+----+
|     Sales|     1|   1|
|        HR|     1|   1|
|        IT|  NULL|   2|
+----------+------+----+

+----------+----+
|Department|male|
+----------+----+
|     Sales|   1|
|        HR|   1|
|        IT|   2|
+----------+----+



### unpivot function we use Stack()

In [13]:

data = [('IT',3,2),('Sales',3,6),('HR',4,3)]

schema = ['dep','male','female']

df = spark.createDataFrame(data,schema)
df.show()

from pyspark.sql.functions import expr
undf = df.select('dep' ,expr("stack(2 ,'abc',male,'female',Female) as (gender , count)"))
undf.show()

+-----+----+------+
|  dep|male|female|
+-----+----+------+
|   IT|   3|     2|
|Sales|   3|     6|
|   HR|   4|     3|
+-----+----+------+

+-----+------+-----+
|  dep|gender|count|
+-----+------+-----+
|   IT|   abc|    3|
|   IT|female|    2|
|Sales|   abc|    3|
|Sales|female|    6|
|   HR|   abc|    4|
|   HR|female|    3|
+-----+------+-----+



### MAP AND FLAT MAP

In [14]:
data = [('Bob','Cathy'),('David','Frank')]

rdd = spark.sparkContext.parallelize(data)

df = spark.createDataFrame(data, schema= ['1st_name', '2nd_name'])
df. show()

rdd1 = df.rdd.map(lambda x : x + (x[0] +' ' + x[1],))
df1 = rdd1.toDF( ['1st_name', '2nd_name','full_name'])


df1.show()

+--------+--------+
|1st_name|2nd_name|
+--------+--------+
|     Bob|   Cathy|
|   David|   Frank|
+--------+--------+

+--------+--------+-----------+
|1st_name|2nd_name|  full_name|
+--------+--------+-----------+
|     Bob|   Cathy|  Bob Cathy|
|   David|   Frank|David Frank|
+--------+--------+-----------+



In [None]:
#FlatMAp

data = ['Bob Cathy','David Frank']
rdd = spark.sparkContext.parallelize(data)

for item in rdd.collect():
    print(item)
    
rdd1 = rdd.map(lambda x: x.split(' '))  #map() function
for item in rdd1.collect():
    print(item)
    

rdd2 = rdd.flatMap(lambda x: x.split(' '))  #Flatmap() function
for item in rdd2.collect():
    print(item)

Bob Cathy
David Frank
['Bob', 'Cathy']
['David', 'Frank']
