# CREATE DATAFRAME

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the notebook-wide SparkSession: getOrCreate() hands back an existing
# session when there is one, otherwise it builds a new one under this app name.
spark = SparkSession.builder.appName("example-create-dataframe").getOrCreate()

In [2]:
# One-column DataFrame: every one-element tuple is a row, and the schema list
# supplies the single column name.
colorDf = spark.createDataFrame(
    data=[("red",), ("blue",), ("green",)],
    schema=["color"],
)
colorDf.show()

+-----+
|color|
+-----+
|  red|
| blue|
|green|
+-----+



In [6]:
# Attempt to build the DataFrame with an explicit StructType schema.
# This cell raises instead of producing a DataFrame: no name `CharType` is in
# scope here, so evaluating the first StructField fails before createDataFrame
# is ever called (the error recorded for this cell is the NameError below).
# NOTE(review): both StructFields are named "season"; the second one presumably
# should be "wind_speed_ms", as in the column-name version of this cell — confirm.
# NameError: name 'CharType' is not defined
tmp = spark.createDataFrame((("summer", 4.5), ("winter", 7.5)), 
                            StructType([StructField("season", CharType()), 
                                          StructField("season", DoubleType())]))

NameError: name 'CharType' is not defined

In [7]:
# Only column names are supplied, so the column types are inferred from the
# Python values in the tuples (see the printed schema).
tmp = spark.createDataFrame(
    data=[("summer", 4.5), ("winter", 7.5)],
    schema=["season", "wind_speed_ms"],
)
tmp.printSchema()
tmp.show()

root
 |-- season: string (nullable = true)
 |-- wind_speed_ms: double (nullable = true)

+------+-------------+
|season|wind_speed_ms|
+------+-------------+
|summer|          4.5|
|winter|          7.5|
+------+-------------+



In which order should the code blocks below be run to create a table that lists every value in column `attributes` next to the corresponding value in column `supplier` of DataFrame `itemsDf`?
>
- `1. itemsDf.createOrReplaceView("itemsDf")`
- `2. spark.sql("FROM itemsDf SELECT 'supplier', explode('Attributes')")`
- `3. spark.sql("FROM itemsDf SELECT supplier, explode(attributes)")`
- `4. itemsDf.createOrReplaceTempView("itemsDf")`

In [8]:
# Sample catalogue rows: (itemId, itemName, attributes, supplier).
# `attributes` is a Python list, which shows up as an array column in the schema.
columns = ["itemId", "itemName", "attributes", "supplier"]

data = [
    (1, "Thick Coat for Walking in the Snow", ["blue", "winter", "cozy"], "Sports Company Inc."),
    (2, "Elegant Outdoors Summer Dress", ["red", "summer"], "YetiX"),
    (3, "Outdoors Backpack", ["green", "summer"], "Sports Company Inc."),
]

itemsDf = spark.createDataFrame(data, columns)

itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: long (nullable = true)
 |-- itemName: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+--------------------+--------------------+-------------------+
|itemId|            itemName|          attributes|           supplier|
+------+--------------------+--------------------+-------------------+
|     1|Thick Coat for Wa...|[blue, winter, cozy]|Sports Company Inc.|
|     2|Elegant Outdoors ...|       [red, summer]|              YetiX|
|     3|   Outdoors Backpack|     [green, summer]|Sports Company Inc.|
+------+--------------------+--------------------+-------------------+



In [19]:
# Register the DataFrame under the name "itemsDf" so SQL text passed to
# spark.sql() can refer to it.
itemsDf.createOrReplaceTempView(name="itemsDf")

In [20]:
# explode() yields one output row per element of the `attributes` array, paired
# with that row's supplier; the unaliased exploded column is shown as `col`.
(
    spark
    .sql("FROM itemsDf SELECT supplier, explode(attributes)")
    .show()
)

+-------------------+------+
|           supplier|   col|
+-------------------+------+
|Sports Company Inc.|  blue|
|Sports Company Inc.|winter|
|Sports Company Inc.|  cozy|
|              YetiX|   red|
|              YetiX|summer|
|Sports Company Inc.| green|
|Sports Company Inc.|summer|
+-------------------+------+



In [14]:
# The two incorrect quiz options, kept to record why they fail.
# Execution stops at the first statement, so only its AttributeError appears in
# the recorded output; the spark.sql() call below is not reached in this run.
# AttributeError: 'DataFrame' object has no attribute 'createOrReplaceView'
itemsDf.createOrReplaceView("itemsDf")

# NOTE(review): the single-quoted 'supplier' and 'Attributes' are presumably
# parsed as SQL string literals rather than column references, which would
# explain why explode() cannot be resolved — confirm by running this line alone.
# AnalysisException: cannot resolve 'explode('Attributes')'
spark.sql("FROM itemsDf SELECT 'supplier', explode('Attributes')")


AttributeError: 'DataFrame' object has no attribute 'createOrReplaceView'

You are given a DataFrame that looks like the one below.
>
- `+---+-----+------+----+----------+`
- `| ID|FName| LName| DOB|Department|`
- `+---+-----+------+----+----------+`
- `|101| John|   Doe|1977|  Software|`
- `|102|David|Turner|1984|   Support|`
- `|103|Abdul| Hamid|1978|   Account|`
- `+---+-----+------+----+----------+`
>
You are given a task to transform this DataFrame to the following structure.
>
- `+---+---------------------+----------+`
- `|ID |PersonalDetails      |Department|`
- `+---+---------------------+----------+`
- `|101|[John, Doe, 1977]    |Software  |`
- `|102|[David, Turner, 1984]|Support   |`
- `|103|[Abdul, Hamid, 1978] |Account   |`
- `+---+---------------------+----------+`
>
In this structure, `PersonalDetails` is a child DataFrame nested inside a top-level parent DataFrame. This approach is known as creating a DataFrame of DataFrames.
>
Choose the correct code for doing this transformation.
>
- `df1 = df.select("ID", "struct(FName,LName,DOB) as PersonalDetails", "Department")` followed by `df1.show(truncate=0)`
- `df1 = df.selectExpr("ID", "struct(FName,LName,DOB) as PersonalDetails", "Department")` followed by `df1.show(truncate=0)`
- `df1 = df.selectExpr("ID", "Array(FName,LName,DOB) as PersonalDetails", "Department")` followed by `df1.show(truncate=0)`


In [2]:
# Flat person records: (ID, FName, LName, DOB, Department).
columns = ["ID", "FName", "LName", "DOB", "Department"]

data = [
    (101, "John", "Doe", 1977, "Software"),
    (102, "David", "Turner", 1984, "Support"),
    (103, "Abdul", "Hamid", 1978, "Account"),
]

df = spark.createDataFrame(data, columns)

df.printSchema()
df.show()

root
 |-- ID: long (nullable = true)
 |-- FName: string (nullable = true)
 |-- LName: string (nullable = true)
 |-- DOB: long (nullable = true)
 |-- Department: string (nullable = true)

+---+-----+------+----+----------+
| ID|FName| LName| DOB|Department|
+---+-----+------+----+----------+
|101| John|   Doe|1977|  Software|
|102|David|Turner|1984|   Support|
|103|Abdul| Hamid|1978|   Account|
+---+-----+------+----+----------+



In [3]:
# selectExpr() takes SQL expression strings, so struct(...) is evaluated as a
# function call that packs the three name/date columns into one struct column.
df1 = df.selectExpr(
    "ID",
    "struct(FName,LName,DOB) as PersonalDetails",
    "Department",
)
df1.printSchema()
df1.show(truncate=0)


root
 |-- ID: long (nullable = true)
 |-- PersonalDetails: struct (nullable = false)
 |    |-- FName: string (nullable = true)
 |    |-- LName: string (nullable = true)
 |    |-- DOB: long (nullable = true)
 |-- Department: string (nullable = true)

+---+---------------------+----------+
|ID |PersonalDetails      |Department|
+---+---------------------+----------+
|101|[John, Doe, 1977]    |Software  |
|102|[David, Turner, 1984]|Support   |
|103|[Abdul, Hamid, 1978] |Account   |
+---+---------------------+----------+



In [8]:
# Pull individual fields back out of the PersonalDetails struct and name the
# resulting columns with alias(). The first lookup is spelled "Fname" while the
# struct field is "FName"; the recorded run resolved it regardless.
df1.select(
    "ID",
    df1["PersonalDetails"]["Fname"].alias("Fname"),
    df1["PersonalDetails"]["LName"].alias("LName"),
    "Department",
).show()

+---+-----+------+----------+
| ID|Fname| LName|Department|
+---+-----+------+----------+
|101| John|   Doe|  Software|
|102|David|Turner|   Support|
|103|Abdul| Hamid|   Account|
+---+-----+------+----------+



In [12]:
# Same projection, but with Array(...) instead of struct(...). This rebinds df1.
# The printed schema shows an array of strings, so the DOB values end up as
# strings alongside the names; show() prints it the same way as the struct here.
df1 = df.selectExpr(
    "ID",
    "Array(FName,LName,DOB) as PersonalDetails",
    "Department",
)
df1.printSchema()
df1.show(truncate=0)

root
 |-- ID: long (nullable = true)
 |-- PersonalDetails: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- Department: string (nullable = true)

+---+---------------------+----------+
|ID |PersonalDetails      |Department|
+---+---------------------+----------+
|101|[John, Doe, 1977]    |Software  |
|102|[David, Turner, 1984]|Support   |
|103|[Abdul, Hamid, 1978] |Account   |
+---+---------------------+----------+



In [2]:
# Build a DataFrame from a flat list of ints by passing a single data type as
# the schema; the resulting lone column is displayed as `value`.
years = [2001, 2002, 2010, 2020, 2021]
df = spark.createDataFrame(data=years, schema=IntegerType())
df.show()

+-----+
|value|
+-----+
| 2001|
| 2002|
| 2010|
| 2020|
| 2021|
+-----+



In [4]:
# Store rows: (storeId, openDate). openDate holds integer values that the
# following cells pass to from_unixtime() and cast to a timestamp.
columns = ["storeId", "openDate"]

data = [
    (0, 1100746394),
    (1, 1474410343),
    (2, 1116610009),
    (3, 1180035265),
    (4, 1408024997),
]

df = spark.createDataFrame(data, columns)

df.printSchema()
df.show()

root
 |-- storeId: long (nullable = true)
 |-- openDate: long (nullable = true)

+-------+----------+
|storeId|  openDate|
+-------+----------+
|      0|1100746394|
|      1|1474410343|
|      2|1116610009|
|      3|1180035265|
|      4|1408024997|
+-------+----------+



In [7]:
# Render openDate as text using a date pattern: full weekday name, abbreviated
# month, day of month, four-digit year, then 12-hour time with an AM/PM marker.
simpleDateFormat = "EEEE, MMM d, yyyy h:mm a"
df.withColumn(
    "openDateString",
    from_unixtime(col("openDate"), format=simpleDateFormat),
).show(truncate=False)

+-------+----------+-------------------------------+
|storeId|openDate  |openDateString                 |
+-------+----------+-------------------------------+
|0      |1100746394|Thursday, Nov 18, 2004 12:53 AM|
|1      |1474410343|Tuesday, Sep 20, 2016 7:25 PM  |
|2      |1116610009|Friday, May 20, 2005 2:26 PM   |
|3      |1180035265|Thursday, May 24, 2007 4:34 PM |
|4      |1408024997|Thursday, Aug 14, 2014 11:03 AM|
+-------+----------+-------------------------------+



In [9]:
# Cast the numeric openDate to a timestamp column, then derive the calendar
# month number from that timestamp.
(
    df
    .withColumn("openTimestamp", col("openDate").cast("Timestamp"))
    .withColumn("month", month(col("openTimestamp")))
    .show()
)

+-------+----------+-------------------+-----+
|storeId|  openDate|      openTimestamp|month|
+-------+----------+-------------------+-----+
|      0|1100746394|2004-11-18 00:53:14|   11|
|      1|1474410343|2016-09-20 19:25:43|    9|
|      2|1116610009|2005-05-20 14:26:49|    5|
|      3|1180035265|2007-05-24 16:34:25|    5|
|      4|1408024997|2014-08-14 11:03:17|    8|
+-------+----------+-------------------+-----+

