# Chapter-2

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Structured Data
* Typically CSV data: human-readable rows that follow a fixed schema and column order

## Providing Schema

In [0]:
# Sample rows laid out as (firstname, middlename, lastname, id, gender, wages).
csv_data = [
    ("Jim", "", "Smith", "36636", "M", 3000),
    ("Mike", "Rose", "", "40288", "M", 5000),
    ("Bob", "", "Williams", "42114", "M", 6000),
    ("Marie", "Anne", "Jones", "39192", "F", 7000),
]

# Explicit schema: every column is nullable; all are strings except the integer wages.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("firstname", "middlename", "lastname", "id", "gender")
    ]
    + [StructField("wages", IntegerType(), True)]
)

df = spark.createDataFrame(csv_data, schema=schema)
df.printSchema()
df.show(truncate=False)

## Save
* overwrite – replaces the existing data at the target path.
* append – adds the new data to whatever already exists at the target path.
* ignore – silently skips the write when the target already exists.
* error – the default mode; raises an error when the target already exists.

In [0]:
# Persist the DataFrame as CSV, replacing any earlier output at this path.
df.write.csv('/tmp/ch2/csv_data', mode='overwrite')

## Infer schema

In [0]:
# Read the saved CSV back and let Spark infer the column types from the data.
# The files are read with header=False, so the first row is treated as data.
# DataFrame.show() returns None, so it must not be chained onto the assignment;
# otherwise `df` would hold None instead of the DataFrame.
df = (
    spark.read.format("csv")
    .option("header", False)
    .option("inferSchema", True)
    .load("/tmp/ch2/csv_data")
)
df.show()

## Read
* Specify header, delimiter, and schema inference options

In [0]:
# Re-read the saved files, supplying the schema up front rather than inferring it.
df_with_schema = spark.read.csv("/tmp/ch2/csv_data", schema=schema, header=False)

df_with_schema.show()
df_with_schema.printSchema()

## Providing Nested Schema

In [0]:
# Each row carries the three name parts in an inner tuple, followed by id, gender, wages.
csv_data = [
    (("Jim", "", "Smith"), "36636", "M", 3000),
    (("Mike", "Rose", ""), "40288", "M", 5000),
    (("Bob", "", "Williams"), "42114", "M", 6000),
    (("Marie", "Anne", "Jones"), "39192", "F", 7000),
]

# Inner struct describing the tuple held in the "name" column.
name_struct = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
])

schema = StructType([
    StructField("name", name_struct),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("wages", IntegerType(), True),
])

df = spark.createDataFrame(csv_data, schema=schema)
df.printSchema()
df.show(truncate=False)

# Semi-Structured Data
* Typically XML & JSON data
* Unlike structured data, not all columns/fields are present for each record and the order is less important as the data is self-describing

In [0]:
def jsonToDataFrame(json, schema=None):
  """Build a DataFrame from a JSON document held in a Python string.

  Args:
    json: JSON text. It is wrapped in a one-element RDD and passed to
      the reader's json() method.
    schema: Optional schema to apply while reading. When None, no schema
      is set on the reader.

  Returns:
    The DataFrame produced by the reader's json() call.
  """
  reader = spark.read
  # Compare against None instead of testing truthiness: StructType defines
  # __len__, so an explicitly passed empty StructType() is falsy and would
  # otherwise be silently ignored.
  if schema is not None:
    # Keep the reader returned by schema() rather than relying on the call
    # having mutated `reader` in place.
    reader = reader.schema(schema)
  return reader.json(sc.parallelize([json]))

## StructType & StructField

In [0]:
# Schema with a single nested "Person" struct holding a string Name and an integer Age.
person_struct = StructType([
    StructField("Name", StringType()),
    StructField("Age", IntegerType()),
])
schema = StructType([StructField("Person", person_struct)])

json_str = ''' 
{
  "Person": {
     "Name": "John Smith",
     "Age": 36
  }
}
'''
events = jsonToDataFrame(json_str, schema)

# Pull out one nested field using dotted-path notation.
events.select(col("Person.Name")).show()
# Expand every field of the Person struct into its own column.
events.select(col("Person.*")).show()

## Infer Schema

In [0]:
json_str = ''' 
{
  "Person": {
     "Name": "John Smith",
     "Age": 36
  }
}
'''
# No schema argument is passed here, so none is set on the reader.
events = jsonToDataFrame(json_str)

# Pull out one nested field using dotted-path notation.
name_only = events.select(col("Person.Name"))
name_only.show()

# Expand every field under the Person node into its own column.
person_fields = events.select(col("Person.*"))
person_fields.show()

# Wrap all top-level columns into one struct column named "Citizen".
as_citizen = events.select(struct("*").alias("Citizen"))
as_citizen.show()

## Multi-line

In [0]:
# Remove any previous copy, then write a JSON array whose records span several lines.
multiline_json_path = '/tmp/test_multilie.json'
multiline_json_text = '''[{
  "RecordNumber": 2,
  "Zipcode": 704,
  "ZipCodeType": "STANDARD",
  "City": "PASEO COSTA DEL SUR",
  "State": "PR"
},
{
  "RecordNumber": 10,
  "Zipcode": 709,
  "ZipCodeType": "STANDARD",
  "City": "BDA SAN LUIS",
  "State": "PR"
}]'''

dbutils.fs.rm(multiline_json_path, True)
dbutils.fs.put(multiline_json_path, multiline_json_text)

In [0]:
# Enable multi-line mode so records that span several lines are parsed as whole documents.
events = spark.read.json('/tmp/test_multilie.json', multiLine=True)
events.show()

## Arrays

In [0]:
# Person.Name is declared as an array of strings; Person.Age stays an integer.
person_struct = StructType([
    StructField("Name", ArrayType(StringType())),
    StructField("Age", IntegerType()),
])
schema = StructType([StructField("Person", person_struct)])

json_str = ''' 
{
  "Person": {
     "Name": ["John","Smith"],
     "Age": 36
  }
}
'''
events = jsonToDataFrame(json_str, schema)

# Select the whole array column.
events.select("Person.Name").show()

# Select only the element at index 0 of the array.
first_name_element = col("Person.Name").getItem(0)
events.select(first_name_element).show()

## Maps

In [0]:
# Person.Name is declared as a string-to-string map; Person.Age stays an integer.
person_struct = StructType([
    StructField("Name", MapType(StringType(), StringType())),
    StructField("Age", IntegerType()),
])
schema = StructType([StructField("Person", person_struct)])

json_str = ''' 
{
  "Person": {
     "Name": {"John":"Smith"},
     "Age": 36
  }
}
'''
events = jsonToDataFrame(json_str, schema)

# Select the map column by its dotted path.
events.select(col("Person.Name")).show()

## from_json

In [0]:
# "Person" holds JSON encoded as a string, so it is first read as a plain string column.
person_as_text = """
{
  "Person": "{\\"Address\\":{\\"Unit\\":12,\\"Location\\":{\\"Street\\":\\"New York\\"}}}"
}
"""
events = jsonToDataFrame(person_as_text)

# Schema used to parse that string: Unit is an integer, while Location is
# declared as a string, so the nested Location object is kept as JSON text.
address_struct = StructType([
    StructField("Unit", IntegerType()),
    StructField("Location", StringType()),
])
schema = StructType([StructField("Address", address_struct)])

citizen = from_json("Person", schema).alias("Citizen")
display(events.select(citizen))

Citizen
"List(List(12, {""Street"":""New York""}))"


## to_json

In [0]:
# Here "Person" is a real nested object, so it is read as a struct column.
person_document = """
{
  "Person": {
     "Name": {"John":"Smith"},
     "Age": 36
  }
}
"""
events = jsonToDataFrame(person_document)

# Serialize the Person struct back into a JSON string column named "Citizen".
citizen_json = to_json("Person").alias("Citizen")
display(events.select(citizen_json))

Citizen
"{""Age"":36,""Name"":{""John"":""Smith""}}"


## json_tuple

In [0]:
# "Person" holds JSON encoded as a string.
person_as_text = """
{
  "Person": "{\\"Address\\":{\\"Unit\\":12,\\"Location\\":{\\"Street\\":\\"New York\\"}}}"
}
"""
events = jsonToDataFrame(person_as_text)

# Extract the top-level "Address" key from the JSON string without declaring a schema.
address = json_tuple("Person", "Address").alias("Address")
display(events.select(address))

Address
"{""Unit"":12,""Location"":{""Street"":""New York""}}"


## regexp_extract

In [0]:
identity_records = """
[{ "Identity": "010-22-2345" }, 
 { "Identity": "017-26-8345" },
 { "Identity": "1-2-3" }]
"""
events = jsonToDataFrame(identity_records)

# Three dash-separated runs of digits, each captured in its own group.
identity_pattern = "([0-9]*)-([0-9]*)-([0-9]*)"

# Show capture group 1, then capture group 3, of each Identity value.
for group_index in (1, 3):
    extracted = regexp_extract("Identity", identity_pattern, group_index)
    events.select(extracted.alias("Identity")).show()

## aggregation

In [0]:
age_records = """
[{ "Name": "John", "Age": 27 }, 
 { "Name": "John", "Age": 52 }]
"""
events = jsonToDataFrame(age_records)

# Gather every Age seen for a given Name into one list column called "Ages".
ages_by_name = events.groupBy("Name").agg(collect_list("Age").alias("Ages"))
display(ages_by_name)

Name,Ages
John,"List(27, 52)"


## explode

In [0]:
preferences_document = """
{
 "John" : {
  "Preferences": ['Tennis', 'Cricket']
  }
}
"""
events = jsonToDataFrame(preferences_document)

# Turn each element of the Preferences array into its own row, in a column named "taste".
taste = explode(col("John.Preferences")).alias("taste")
display(events.select(taste))

taste
Tennis
Cricket
