In [0]:
from pyspark.sql.functions import *

In [0]:
# Column names for the sample employee records below.
schemas = ["id", "name", "sal", "dept", "gender"]

# Every field, including id and sal, is supplied as a string.
data = [
    ("1", "Ram", "5000", "it", "male"),
    ("2", "Shyam", "8000", "hr", "female"),
    ("3", "Jadu", "6000", "it", "male"),
    ("4", "Madu", "10000", "it", "female"),
]

# Build the DataFrame from the local tuples and render it.
df = spark.createDataFrame(data, schema=schemas)
df.display()

id,name,sal,dept,gender
1,Ram,5000,it,male
2,Shyam,8000,hr,female
3,Jadu,6000,it,male
4,Madu,10000,it,female


In [0]:
### Pivot
# Count rows per department, producing one output column per distinct gender.
# A dept/gender combination with no rows appears as null, not 0.
dept_groups = df.groupBy("dept")
gender_counts = dept_groups.pivot("gender").count()
gender_counts.show()

+----+------+----+
|dept|female|male|
+----+------+----+
|  hr|     1|null|
|  it|     1|   2|
+----+------+----+



In [0]:
# collect() returns the rows as a Python list, so it can be iterated directly.
# Fields are read by position: index 0 is the id, index 1 is the name.
for row in df.collect():
    emp_id, emp_name = row[0], row[1]
    print("Id " + emp_id)
    print("Name " + emp_name)

Id 1
Name Ram
Id 2
Name Shyam
Id 3
Name Jadu
Id 4
Name Madu


In [0]:
# Show the Python type that collect() hands back.
collected_rows = df.collect()
type(collected_rows)

Out[26]: list

Create an RDD

In [0]:
# Sample employee records as plain Python tuples (all fields are strings).
data = [
    ("1", "Ram", "5000", "it", "male"),
    ("2", "Shyam", "8000", "hr", "female"),
    ("3", "Jadu", "6000", "it", "male"),
    ("4", "Madu", "10000", "it", "female"),
]

# Distribute the local list as an RDD.
rdd1 = spark.sparkContext.parallelize(data)

# toDF() turns the RDD of tuples into a DataFrame with the given column names.
df1 = rdd1.toDF(["id", "name", "sal", "dept", "gender"])
df1.show()

+---+-----+-----+----+------+
| id| name|  sal|dept|gender|
+---+-----+-----+----+------+
|  1|  Ram| 5000|  it|  male|
|  2|Shyam| 8000|  hr|female|
|  3| Jadu| 6000|  it|  male|
|  4| Madu|10000|  it|female|
+---+-----+-----+----+------+



Creating an RDD from a Python collection

In [0]:
# Pull the rows of df back as a Python list, then distribute that list as an RDD.
collected = df.collect()
rdd2 = spark.sparkContext.parallelize(collected)

# Keep only the field at index 1 of every row (the name column).
rdd3 = rdd2.map(lambda row: row[1])
rdd3.collect()

Out[43]: ['Ram', 'Shyam', 'Jadu', 'Madu']

In [0]:
### Map function
# map() yields exactly one output element per input element; here every
# tuple is extended with a copy of its own first field.

data = [("ram", "saran"), ("shyam", "gupta")]
rdd1 = spark.sparkContext.parallelize(data)

rdd2 = rdd1.map(lambda pair: pair + (pair[0],))
print(rdd2.collect())

[('ram', 'saran', 'ram'), ('shyam', 'gupta', 'shyam')]


In [0]:
###  Map function
# The elements are plain strings. With map(), splitting each one yields a
# list per input element, so the collected result is a list of lists.

data = ["ram saran", "shyam gupta"]
rdd1 = spark.sparkContext.parallelize(data)

rdd2 = rdd1.map(lambda full_name: full_name.split(' '))
print(rdd2.collect())



[['ram', 'saran'], ['shyam', 'gupta']]


In [0]:
### Flat Map function
# The elements are plain strings. flatMap() flattens the per-element split
# results, so the collected result is a single flat list of words.

data = ["ram saran", "shyam gupta"]
rdd1 = spark.sparkContext.parallelize(data)

rdd2 = rdd1.flatMap(lambda full_name: full_name.split(' '))
print(rdd2.collect())



['ram', 'saran', 'shyam', 'gupta']


In [0]:
# Each record pairs a name with a Python list of integers; the list is
# inferred as an array column (see the printed schema).
schemas = ["name", "marks"]
data = [
    ("ram", [10, 20]),
    ("shyam", [60, 70]),
    ("jadu", [20, 50]),
    ("madhu", [100, 40]),
]

df = spark.createDataFrame(data, schemas)
df.show()
df.printSchema()

+-----+---------+
| name|    marks|
+-----+---------+
|  ram| [10, 20]|
|shyam| [60, 70]|
| jadu| [20, 50]|
|madhu|[100, 40]|
+-----+---------+

root
 |-- name: string (nullable = true)
 |-- marks: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [0]:
# Each record pairs a name with a Python dict; the dict is inferred as a
# map column with string keys and string values (see the printed schema).
schemas = ["name", "properties"]
data = [
    ("ram", {"hair": "black", "eye": "black"}),
    ("shyam", {"hair": "black", "eye": "black"}),
    ("jadu", {"hair": "black", "eye": "black"}),
    ("madhu", {"hair": "black", "eye": "black"}),
]

df = spark.createDataFrame(data, schemas)
df.show()
df.printSchema()

+-----+--------------------+
| name|          properties|
+-----+--------------------+
|  ram|{eye -> black, ha...|
|shyam|{eye -> black, ha...|
| jadu|{eye -> black, ha...|
|madhu|{eye -> black, ha...|
+-----+--------------------+

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [0]:
# Read the JSON file; multiLine is enabled so that a single JSON record
# may span several lines of the file.
rest_json = spark.read.json("/FileStore/tables/file1.json", multiLine=True)

In [0]:
# Step 1: explode `restaurants` into one row per entry (aliased as
# new_resturant) and select the nested res_id alongside it.
restaurants_exploded = rest_json.select(
    "*",
    explode("restaurants").alias("new_resturant"),
    "new_resturant.restaurant.R.res_id",
)

# Step 2: explode each restaurant's establishment_types array as well.
establishments_exploded = restaurants_exploded.select(
    "*",
    explode("new_resturant.restaurant.establishment_types").alias("new_establishment"),
)

# The original `restaurants` column is redundant once exploded; drop it
# before printing the resulting schema.
establishments_exploded.drop("restaurants").printSchema()

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- results_found: long (nullable = true)
 |-- results_shown: long (nullable = true)
 |-- results_start: string (nullable = true)
 |-- status: string (nullable = true)
 |-- new_resturant: struct (nullable = true)
 |    |-- restaurant: struct (nullable = true)
 |    |    |-- R: struct (nullable = true)
 |    |    |    |-- res_id: long (nullable = true)
 |    |    |-- apikey: string (nullable = true)
 |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |-- book_url: string (nullable = true)
 |    |    |-- cuisines: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- deeplink: string (nullable = true)
 |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- events_url: string (nullable = true)
 |    |    |-- featured_image: string (nullable = true)
 |    |    |-- has_online_deliver

In [0]:
# Print the signature and docstring of the JSON reader method to see which
# options (e.g. multiLine) it accepts.
json_reader = spark.read.json
help(json_reader)

Help on method json in module pyspark.sql.readwriter:

json(path: Union[str, List[str], pyspark.rdd.RDD[str]], schema: Union[pyspark.sql.types.StructType, str, NoneType] = None, primitivesAsString: Union[bool, str, NoneType] = None, prefersDecimal: Union[bool, str, NoneType] = None, allowComments: Union[bool, str, NoneType] = None, allowUnquotedFieldNames: Union[bool, str, NoneType] = None, allowSingleQuotes: Union[bool, str, NoneType] = None, allowNumericLeadingZero: Union[bool, str, NoneType] = None, allowBackslashEscapingAnyCharacter: Union[bool, str, NoneType] = None, mode: Optional[str] = None, columnNameOfCorruptRecord: Optional[str] = None, dateFormat: Optional[str] = None, timestampFormat: Optional[str] = None, multiLine: Union[bool, str, NoneType] = None, allowUnquotedControlChars: Union[bool, str, NoneType] = None, lineSep: Optional[str] = None, samplingRatio: Union[str, float, NoneType] = None, dropFieldIfAllNull: Union[bool, str, NoneType] = None, encoding: Optional[str] = 