In [0]:
from pyspark.sql.types import *

# Explicit schema for the weather CSV: (column name, Spark type) pairs, in the
# order the reader applies them. Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Data.Precipitation", DoubleType()),
        ("Date.Full", DateType()),
        ("Date.Month", IntegerType()),
        ("Date.Week of", IntegerType()),
        ("Date.Year", IntegerType()),
        ("Station.City", StringType()),
        ("Station.Code", StringType()),
        ("Station.Location", StringType()),
        ("Station.State", StringType()),
        ("Data.Temperature.Avg Temp", IntegerType()),
        ("Data.Temperature.Max Temp", IntegerType()),
        ("Data.Temperature.Min Temp", IntegerType()),
        ("Data.Wind.Direction", IntegerType()),
        ("Data.Wind.Speed", DoubleType()),
    )
])

# Load the comma-separated file with the schema above instead of letting the
# reader pick column names and types.
# NOTE(review): no `header` option is set, so the first line of the file is
# parsed as a data row -- confirm that weather.csv really has no header line.
df = (
    spark.read
    .schema(schema)
    .option("sep", ",")
    .csv("/FileStore/tables/weather.csv")
)

# Print the column names and types that were applied.
df.printSchema()

In [0]:
# Render whatever `df` currently refers to; when cells run top to bottom this
# is the CSV-backed DataFrame built in the preceding cell.
# NOTE(review): `.display()` is not part of the open-source PySpark DataFrame
# API -- presumably supplied by the hosting notebook runtime; confirm.
df.display()

In [0]:
from pyspark.sql.types import *

# Sample rows: a delimited name string, two arrays of languages, and two
# state codes per person.
data = [
    (
        "James,,Smith",
        ["Java", "Scala", "C++"],
        ["Spark", "Java"],
        "OH",
        "CA",
    ),
    (
        "Michael,,Rose,",
        ["Spark", "Java", "C++"],
        ["Spark", "Java"],
        "NY",
        "NJ",
    ),
    (
        "Robert,,Williams",
        ["CSharp", "VB"],
        ["Spark", "Python"],
        "UT",
        "NV",
    ),
]

# Schema matching the tuple layout above; all fields are nullable and the two
# language columns are arrays of strings.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("languages_at_school", ArrayType(StringType()), True)
    .add("languages_at_work", ArrayType(StringType()), True)
    .add("current_state", StringType(), True)
    .add("previous_state", StringType(), True)
)

# NOTE: this rebinds `df`, replacing any DataFrame previously stored under
# that name.
df = spark.createDataFrame(data=data, schema=schema)

df.printSchema()


In [0]:
# Render whatever `df` currently refers to; when cells run top to bottom this
# is the DataFrame created from the in-memory `data` list in the preceding cell.
df.display()

In [0]:
from pyspark.sql.functions import explode
# One output row per element of `languages_at_school`, paired with the
# person's name. The exploded column is left unaliased, as before.
df2 = df.select(
    df["name"],
    explode(df["languages_at_school"]),
)

In [0]:
# Render the current `df2`; when cells run top to bottom this is the exploded
# school-languages DataFrame from the preceding cell.
df2.display()

In [0]:
# Spark allows only one generator expression (such as `explode`) per select
# clause, so putting both explodes in a single `select` fails at analysis time.
# The arrays are therefore exploded in two consecutive selects: the first
# expands `languages_at_school` and carries `languages_at_work` through, the
# second expands `languages_at_work`. Each input row yields one output row per
# (school language, work language) pair.
# Both generated columns get explicit aliases; otherwise each would take the
# same default name and the result would be ambiguous.
df2 = (
    df.select(
        "name",
        explode("languages_at_school").alias("language_at_school"),
        "languages_at_work",
    )
    .select(
        "name",
        "language_at_school",
        explode("languages_at_work").alias("language_at_work"),
    )
)
df2.display()

In [0]:
from pyspark.sql.functions import split
# Split the `name` string on the two-comma pattern ",," and expose the
# resulting array under the alias "firstMiddleAndLast".
# NOTE(review): for the sample names (e.g. "James,,Smith") this pattern yields
# two parts, not three, and a trailing single comma stays attached to the last
# part -- confirm that ",," rather than "," is the intended delimiter.
df3 = df.select(
    split(df["name"], ",,").alias("firstMiddleAndLast")
)
df3.display()

In [0]:
from pyspark.sql.functions import array_contains
# Keep only the rows whose `languages_at_school` array contains "Java", then
# project the name and that array.
df4 = (
    df
    .where(array_contains(df["languages_at_school"], "Java"))
    .select("name", "languages_at_school")
)

df4.display()


In [0]:
from pyspark.sql.functions import array

# Pack the two state columns into a single array column named "state",
# current state first.
df5 = df.select(
    df["name"],
    array(df["current_state"], df["previous_state"]).alias("state"),
)

df5.display()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, MapType

# Each row is (name, properties), where properties is a one-entry dict keyed
# by 'hair'.
dataDictionary = [
    (person, {'hair': colour})
    for person, colour in (
        ('James', 'black'),
        ('Michael', 'brown'),
        ('Robert', 'red'),
        ('Washington', 'grey'),
        ('Jefferson', 'brown'),
    )
]

# `properties` is stored as a string-to-string map column; both fields are
# nullable.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("properties", MapType(StringType(), StringType()), True)
)

# NOTE: this rebinds `df`, replacing any DataFrame previously stored under
# that name; later cells that read `df.properties` depend on this one.
df = spark.createDataFrame(dataDictionary, schema)

df.display()
df.printSchema()


In [0]:
from pyspark.sql.functions import map_keys, map_values

# Array of the keys in each row's `properties` map.
df1 = df.select(
    df["name"],
    map_keys(df["properties"]).alias("keys"),
)
df1.display()

# Array of the values in each row's `properties` map.
# NOTE: this rebinds `df2`, replacing any DataFrame previously stored under
# that name.
df2 = df.select(
    df["name"],
    map_values(df["properties"]).alias("values"),
)
df2.display()

In [0]:
# One output row per key found in each row's `properties` map; the generated
# column is left unaliased, as before.
# Uses `explode` and `map_keys`, which are imported in earlier cells.
df3 = df.select(
    explode(
        map_keys(df["properties"])
    )
)
df3.display()

In [0]:
# Build an RDD from an in-memory list of two (string, int) tuples using the
# SparkContext that belongs to the active `spark` session.
rdd = spark.sparkContext.parallelize([("James", 25), ("Nitesh", 30)])
