In [9]:
!pip install pyspark



In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import datetime

In [11]:
# Create Spark Session
spark = SparkSession.builder\
        .appName("Spark")\
        .enableHiveSupport()\
        .getOrCreate()

In [12]:
# Hardcoded data
data = [
    ["Product A", 1001, datetime.strptime("2023-07-20", "%Y-%m-%d"), datetime.strptime("2023-07-20 10:15:30", "%Y-%m-%d %H:%M:%S"), 29.99],
    ["Product B", 1002, datetime.strptime("2023-07-19", "%Y-%m-%d"), datetime.strptime("2023-07-19 14:20:45", "%Y-%m-%d %H:%M:%S"), 49.99],
    ["Product C", 1003, datetime.strptime("2023-07-18", "%Y-%m-%d"), datetime.strptime("2023-07-18 09:30:15", "%Y-%m-%d %H:%M:%S"), 39.99],
    ["Product D", 1004, datetime.strptime("2023-07-17", "%Y-%m-%d"), datetime.strptime("2023-07-17 16:45:00", "%Y-%m-%d %H:%M:%S"), 19.99]
]

In [13]:
# Define Schema
schema = StructType([
    StructField("Product",StringType(),True),
    StructField("ID",IntegerType(),True),
    StructField("Date",DateType(),True),
    StructField("Timestamp",TimestampType(),True),
    StructField("Price",FloatType(),True)
])

In [14]:
# Create Dataframe
df = spark.createDataFrame(data,schema)

# Print schema
df.printSchema()

# Print data
df.show()

root
 |-- Product: string (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Price: float (nullable = true)

+---------+----+----------+-------------------+-----+
|  Product|  ID|      Date|          Timestamp|Price|
+---------+----+----------+-------------------+-----+
|Product A|1001|2023-07-20|2023-07-20 10:15:30|29.99|
|Product B|1002|2023-07-19|2023-07-19 14:20:45|49.99|
|Product C|1003|2023-07-18|2023-07-18 09:30:15|39.99|
|Product D|1004|2023-07-17|2023-07-17 16:45:00|19.99|
+---------+----+----------+-------------------+-----+



In [15]:
# First read example should not infer schema, igonre header row, provide explicit column name and datatype

# Define schema
schema = StructType([
    StructField("order_id",StringType(),True),
    StructField("order_item_id",IntegerType(),True),
    StructField("product_id",StringType(),True),
    StructField("seller_id",StringType(),True),
    StructField("shopping_limit_date",TimestampType(),True),
    StructField("price",DoubleType(),True),
    StructField("freight_value",DoubleType(),True)
])

path = "/content/order_items_dataset.csv"
df = spark.read.format("csv").option("header","true").option("inferSchema","false").schema(schema).load(path)

df.printSchema()

df.show(5)

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shopping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shopping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48

In [16]:
# Second read example should infer schema, ignore header row
df2 = spark.read.format("csv").option("header","true").option("inferSchema","true").load(path)

# print schema and sample data
df2.printSchema()
df2.show(5)

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48

In [17]:
# Number of partitions
print(f"Number of partitions: {df2.rdd.getNumPartitions()}")

df3 = df2.repartition(10)

# Number of partitions after repartition
print(f"Number of partitions after repartition: {df3.rdd.getNumPartitions()}")

Number of partitions: 2
Number of partitions after repartition: 10


In [18]:
# Select columns in different options
from pyspark.sql.functions import *

df3.select("order_id").show(5)
df3.select("order_id","shipping_limit_date").show(5)
df3.select(col("order_id"),col("shipping_limit_date")).show(5)
df3.select(col("order_id").alias("ord_id"),col("shipping_limit_date").alias("limit_date")).show(5)

+--------------------+
|            order_id|
+--------------------+
|6299bb8e855289b41...|
|71fbb9971d84bf97a...|
|74322a01b770c2ea3...|
|a23fc2b3af4f1a48e...|
|747af114bbea56ac1...|
+--------------------+
only showing top 5 rows

+--------------------+-------------------+
|            order_id|shipping_limit_date|
+--------------------+-------------------+
|3bbf8f927f288e4a1...|2017-11-09 14:25:38|
|50c40cfcbb6ce3fca...|2018-06-14 09:52:04|
|51c3d73e0e9052253...|2018-02-22 19:15:27|
|183ee0e3ebd4c1c99...|2018-02-07 20:14:08|
|3a1400b5d4dd3082a...|2018-03-27 17:28:20|
+--------------------+-------------------+
only showing top 5 rows

+--------------------+-------------------+
|            order_id|shipping_limit_date|
+--------------------+-------------------+
|3bbf8f927f288e4a1...|2017-11-09 14:25:38|
|50c40cfcbb6ce3fca...|2018-06-14 09:52:04|
|51c3d73e0e9052253...|2018-02-22 19:15:27|
|183ee0e3ebd4c1c99...|2018-02-07 20:14:08|
|3a1400b5d4dd3082a...|2018-03-27 17:28:20|
+-----------

In [19]:
# Derive new column using withColumn
df4 = df3.withColumn("year",year(col("shipping_limit_date")))\
          .withColumn("month",month(col("shipping_limit_date")))

df4.select("order_id","shipping_limit_date","year","month").show(5)

+--------------------+-------------------+----+-----+
|            order_id|shipping_limit_date|year|month|
+--------------------+-------------------+----+-----+
|3bbf8f927f288e4a1...|2017-11-09 14:25:38|2017|   11|
|50c40cfcbb6ce3fca...|2018-06-14 09:52:04|2018|    6|
|51c3d73e0e9052253...|2018-02-22 19:15:27|2018|    2|
|183ee0e3ebd4c1c99...|2018-02-07 20:14:08|2018|    2|
|3a1400b5d4dd3082a...|2018-03-27 17:28:20|2018|    3|
+--------------------+-------------------+----+-----+
only showing top 5 rows



In [20]:
# Rename existing column using withColumnRenamed
df5 = df4.withColumnRenamed("shipping_limit_date","shipping_limit_datetime")
df5.select("order_id","shipping_limit_datetime").show(5)

+--------------------+-----------------------+
|            order_id|shipping_limit_datetime|
+--------------------+-----------------------+
|3bbf8f927f288e4a1...|    2017-11-09 14:25:38|
|50c40cfcbb6ce3fca...|    2018-06-14 09:52:04|
|51c3d73e0e9052253...|    2018-02-22 19:15:27|
|183ee0e3ebd4c1c99...|    2018-02-07 20:14:08|
|3a1400b5d4dd3082a...|    2018-03-27 17:28:20|
+--------------------+-----------------------+
only showing top 5 rows



In [21]:
# Filter condition
df5.filter(col("order_id") == '00010242fe8c5a6d1ba2dd792cb16214').show()

order_li = ['00010242fe8c5a6d1ba2dd792cb16214','00018f77f2f0320c557190d7a144bdd3']
df5.filter(col("order_id").isin(order_li)).show(5)

df5.filter((col("price")<50) & (col("freight_value") < 10)).show(5)

# SQL Type expression
df5.filter("price < 50 and freight_value < 10").show(5)


+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|    2017-09-19 09:45:35| 58.9|        13.29|2017|    9|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+----

In [22]:
# Example for drop duplicates on multiple columns

df5.drop("month").show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+
|1e1bb536916a99649...|            2|0288f8dd74b931b4e...|1da3aeb70d7989d1e...|    2017-09-05 12:10:11| 49.99|        21.15|2017|
|62a0e822dd605871a...|            1|31dbb0d1815bdc83c...|6da1992f915d77be9...|    2017-06-08 11:50:18|  29.0|        15.79|2017|
|025c72e88fbf2358b...|            2|bef21943bc2335188...|e49c26c3edfa46d22...|    2017-03-21 21:24:27|  19.9|         20.8|2017|
|23d16dddab46fd3d0...|            1|cca8e09ba6f2d35e4...|43f8c9950d11ecd03...|    2018-01-31 22:17:51|109.99|        14.52|2018|
|71c0d1686c9b55563...|            2|eb6c2ecde53034fc9...|1025f0e2d44d7041d...|    2017-12-01 19:3

In [23]:
# drop duplicates row based on multiple columns
df5.dropDuplicates(['order_id','order_item_id']).show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|    2017-05-03 11:05:13| 239.9|        19.93|2017|    5|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|    2017-02-13 13:57:51| 199.9|        18.14|2017|    2|
|0005a1a1728c9d785...|            1|310ae3c140ff94b03...|a416b6a846a117243...|    2018-03-26 18:31:29|145.95|        11.65|2018|    3|
|00061f2a7bc09da83...|            1|d63c1011f49d98b97...|cc419e0650a3c5ba7...|    2018-03-29 22:28:09| 59.99|         8.88|2018|    3|
|00063b381e2406b52...|            1|f177554ea93259a5b..

In [24]:
# get distinct rows
df5.distinct().show(5)

df5.dropDuplicates().show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|9459fe4630e1165c1...|            2|8db00478f9801fc2f...|8a32e327fe2c1b351...|    2017-11-28 10:51:06|12.99|         9.34|2017|   11|
|3fb11bd2ea68c2502...|            1|f908d3bf313a1308b...|25e6ffe976bd75618...|    2017-12-07 13:11:22| 35.0|        11.85|2017|   12|
|18ed848509774f56c...|            1|309dd69eb83cea38c...|0b35c634521043bf4...|    2018-02-09 13:30:39|49.99|         15.1|2018|    2|
|0f3f4f23f10a3e4ef...|            1|bb42f37fc3d9130e4...|da8622b14eb17ae28...|    2018-04-12 19:09:20| 44.9|        18.23|2018|    4|
|9b482241f75217c3e...|            1|74706845a60b5ad45...|f5a59

In [26]:
# arrange data using order by

df5.orderBy(col("price").desc()).show(5)

df5.orderBy(col("price").asc(),col("freight_value").desc()).show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+
|0812eb902a67711a1...|            1|489ae2aa008f02150...|e3b4998c7a498169d...|    2017-02-16 20:37:36|6735.0|       194.31|2017|    2|
|fefacc66af859508b...|            1|69c590f7ffc7bf8db...|80ceebb4ee9b31afb...|    2018-08-02 04:05:13|6729.0|       193.21|2018|    8|
|f5136e38d1a14a4db...|            1|1bdf5e6731585cf01...|ee27a8f15b1dded4d...|    2017-06-15 02:45:17|6499.0|       227.66|2017|    6|
|a96610ab360d42a2e...|            1|a6492cc69376c469a...|59417c56835dd8e2e...|    2017-04-18 13:25:18|4799.0|       151.34|2017|    4|
|199af31afc78c699f...|            1|c3ed642d592594bb6..

In [29]:
# GroupBy Operations

# On single column
df5.groupBy("year").agg(count('*').alias("toal_count"),\
                        avg("price").alias("avg_price"),\
                        sum("price").alias("sum_price"),\
                        min("price").alias("min_price"),\
                        max("price").alias("max_price")).show(5)

# On multiple columns
df5.groupBy("year","month").agg(count('*').alias("total_count"),\
                                avg("price").alias("avg_price"),\
                                sum("price").alias("sum_price"),\
                                min("price").alias("min_price"),\
                                max("price").alias("max_price")
                                ).orderBy(col("year").asc(),col("month").asc()).show(5)

+----+----------+------------------+-----------------+---------+---------+
|year|toal_count|         avg_price|        sum_price|min_price|max_price|
+----+----------+------------------+-----------------+---------+---------+
|2018|     62511|120.08515685239729|7506643.240000207|     0.85|   6729.0|
|2016|       370|134.55654054054054|         49785.92|      6.0|   1399.0|
|2017|     49765|121.26732804179925| 6034868.58000014|      1.2|   6735.0|
|2020|         4|             86.49|           345.96|    69.99|    99.99|
+----+----------+------------------+-----------------+---------+---------+

+----+-----+-----------+------------------+------------------+---------+---------+
|year|month|total_count|         avg_price|         sum_price|min_price|max_price|
+----+-----+-----------+------------------+------------------+---------+---------+
|2016|    9|          4| 48.61750000000001|194.47000000000003|    44.99|     59.5|
|2016|   10|        365|135.83712328767123|49580.549999999996|     

In [32]:
# fill missing data with default value
df5.fillna({"price":0,"freight_value":0}).orderBy(col("freight_value").asc()).show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+
|97d6279a91ffe03ed...|            1|422879e10f4668299...|1f50f920176fa81da...|    2018-05-11 17:30:06| 53.9|          0.0|2018|    5|
|4296bf3920be4eaa2...|            2|53b36df67ebb7c415...|7d13fca1522535862...|    2018-05-14 20:55:00| 99.9|          0.0|2018|    5|
|2cacbc4da6d979f44...|            1|53b36df67ebb7c415...|7d13fca1522535862...|    2018-05-11 20:30:39| 99.9|          0.0|2018|    5|
|185a17e610fa1a503...|            1|aca2eb7d00ea1a7b8...|955fee9216a65b617...|    2018-04-25 13:10:39| 69.9|          0.0|2018|    4|
|04105b54650921ca3...|            1|aca2eb7d00ea1a7b8...|955fe

In [33]:
accum = spark.sparkContext.accumulator(0)

df5.foreach(lambda row: accum.add(row['price']))
print(accum.value) # Accessed by driver

13591643.699999437


In [36]:
# Case-When statement in

df5.withColumn("price_category", when(col("price") >= 100,"High")\
               .when((col("price") < 100) & (col("price") >= 50), "Medium")\
                .otherwise("Low")).show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+--------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|price_category|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+--------------+
|1e1bb536916a99649...|            2|0288f8dd74b931b4e...|1da3aeb70d7989d1e...|    2017-09-05 12:10:11| 49.99|        21.15|2017|    9|           Low|
|62a0e822dd605871a...|            1|31dbb0d1815bdc83c...|6da1992f915d77be9...|    2017-06-08 11:50:18|  29.0|        15.79|2017|    6|           Low|
|025c72e88fbf2358b...|            2|bef21943bc2335188...|e49c26c3edfa46d22...|    2017-03-21 21:24:27|  19.9|         20.8|2017|    3|           Low|
|23d16dddab46fd3d0...|            1|cca8e09ba6f2d35e4...|43f8c9950d11ecd03...|    2018-01-31 22:17:5

In [38]:
# Window functions

from pyspark.sql.window import Window

windowSpec1 = Window.partitionBy("year").orderBy(col("price").asc())
df5.withColumn("dense_rank",dense_rank().over(windowSpec1)).show(5)

windowSpec2 = Window.partitionBy("year").orderBy(col("shipping_limit_datetime").asc())
df5.withColumn("running_sum",sum("price").over(windowSpec2)).show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+----------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime|price|freight_value|year|month|dense_rank|
+--------------------+-------------+--------------------+--------------------+-----------------------+-----+-------------+----+-----+----------+
|3ee6513ae7ea23bdf...|            1|8a3254bee785a526d...|96804ea39d96eb908...|    2018-05-04 03:55:26| 0.85|        18.23|2018|    5|         1|
|6e864b3f0ec710311...|            1|8a3254bee785a526d...|96804ea39d96eb908...|    2018-05-02 20:30:34| 0.85|        18.23|2018|    5|         1|
|c5bdd8ef3c0ec4202...|            2|8a3254bee785a526d...|96804ea39d96eb908...|    2018-05-07 02:55:22| 0.85|         22.3|2018|    5|         1|
|f1d5c2e6867fa93ce...|            1|46fce52cef5caa7cc...|2d2322d8421188677...|    2018-08-28 21:30:15|  2.2|         7.39|2018|   

In [40]:
# Sellers dataset
s_df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("/content/sellers_dataset.csv")

# Print schema and data
s_df.printSchema()
s_df.show(5)

root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: integer (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)

+--------------------+----------------------+-----------------+------------+
|           seller_id|seller_zip_code_prefix|      seller_city|seller_state|
+--------------------+----------------------+-----------------+------------+
|3442f8959a84dea7e...|                 13023|         campinas|          SP|
|d1b65fc7debc3361e...|                 13844|       mogi guacu|          SP|
|ce3ad9de960102d06...|                 20031|   rio de janeiro|          RJ|
|c0f3eea2e14555b6f...|                  4195|        sao paulo|          SP|
|51a04a8a6bdcb23de...|                 12914|braganca paulista|          SP|
+--------------------+----------------------+-----------------+------------+
only showing top 5 rows



In [41]:
# Perform join

result1 = df5.join(broadcast(s_df),df5.seller_id == s_df.seller_id,"inner").drop(s_df.seller_id)
result1.show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+----------------------+-----------+------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|seller_zip_code_prefix|seller_city|seller_state|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+----------------------+-----------+------------+
|1e1bb536916a99649...|            2|0288f8dd74b931b4e...|1da3aeb70d7989d1e...|    2017-09-05 12:10:11| 49.99|        21.15|2017|    9|                  4265|  sao paulo|          SP|
|62a0e822dd605871a...|            1|31dbb0d1815bdc83c...|6da1992f915d77be9...|    2017-06-08 11:50:18|  29.0|        15.79|2017|    6|                  1026|  sao paulo|          SP|
|025c72e88fbf2358b...|            2|bef21943bc2335188...|e49c26c3edfa46d22...|    201

In [42]:
# Perform join with alias names of dataframes

result2 = df5.alias('oid').join(s_df.alias('sid'),col("oid.seller_id")==col("sid.seller_id"),"inner").drop(col("sid.seller_id"))
result2.show(5)

+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+----------------------+-----------+------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_datetime| price|freight_value|year|month|seller_zip_code_prefix|seller_city|seller_state|
+--------------------+-------------+--------------------+--------------------+-----------------------+------+-------------+----+-----+----------------------+-----------+------------+
|1e1bb536916a99649...|            2|0288f8dd74b931b4e...|1da3aeb70d7989d1e...|    2017-09-05 12:10:11| 49.99|        21.15|2017|    9|                  4265|  sao paulo|          SP|
|62a0e822dd605871a...|            1|31dbb0d1815bdc83c...|6da1992f915d77be9...|    2017-06-08 11:50:18|  29.0|        15.79|2017|    6|                  1026|  sao paulo|          SP|
|025c72e88fbf2358b...|            2|bef21943bc2335188...|e49c26c3edfa46d22...|    201

In [47]:
# Work with spark SQL
df5.createOrReplaceTempView("ORDER_ITEM")
s_df.createOrReplaceTempView("SELLERS")


joinDF2 = spark.sql("select * from ORDER_ITEM oid INNER JOIN SELLERS sid ON oid.seller_id == sid.seller_id")

In [48]:
# Write data without any partition key

result1.write.format("csv").option("header","true").option("delimiter",",").save("/content/output/")
print("write Successfull")

write Successfull


In [49]:
# Write data with partition key

result1.write.partitionBy("year").format("csv").option("header","true").option("delimiter",",").save("/content/output_withPartition/")
print("Write Successfull")

Write Successfull


In [51]:
# Write data in single file
result1.coalesce(1).write.format("csv").option("header","true").option("delimiter",",").save("/content/output_singleFile/")
print("Write Successfull")

Write Successfull


In [54]:
# Create database

spark.sql("CREATE SCHEMA IF NOT EXISTS tables_by_spark")

DataFrame[]

In [55]:
# Write data in Hive directly

spark.sql("""set hive.exec.dynamic.partition.mode=nonstrict""")
spark.sql("USE tables_by_spark")

# Create a partitioned Hive Table
spark.sql("""
      CREATE TABLE IF NOT EXISTS order_sellers_data(
        order_id STRING,
        order_item_id INT,
        product_id STRING,
        price DOUBLE,
        freight_value DOUBLE,
        seller_city STRING
      ) PARTITIONED BY (year INT)
""")

# Write Dataframe into Hive Table
result1.select("order_id","order_item_id","product_id","price","freight_value","seller_city","year").write.mode("append").insertInto("order_sellers_data")

In [62]:
spark.sql("SELECT * FROM order_sellers_data LIMIT 5").show()

+--------------------+-------------+--------------------+-----+-------------+-----------+----+
|            order_id|order_item_id|          product_id|price|freight_value|seller_city|year|
+--------------------+-------------+--------------------+-----+-------------+-----------+----+
|13bdf405f961a6dee...|            1|96ea060e41bdecc64...|69.99|        14.66|  jacutinga|2020|
|9c94a4ea2f7876660...|            1|282b126b2354516c5...|75.99|         14.7|  jacutinga|2020|
|c2bb89b5c1dd978d5...|            2|87b92e06b320e803d...|99.99|        61.44|  jacutinga|2020|
|c2bb89b5c1dd978d5...|            1|87b92e06b320e803d...|99.99|        61.44|  jacutinga|2020|
|7559c51df991c6861...|            1|21fb5057dd6a737df...| 49.0|        20.49|   sorocaba|2016|
+--------------------+-------------+--------------------+-----+-------------+-----------+----+

