In [5]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
import pandas as pd
from io import StringIO
from pyspark.sql.types import StringType

In [2]:
# Create the Spark session used by every later cell, or reuse one that is
# already running in this kernel. The application name is "deo".
spark = (
    SparkSession.builder
    .appName("deo")
    .getOrCreate()
)

In [3]:
# Bare expression: shows the session object as this cell's output.
spark

In [7]:
# Inline sample of order records, held as CSV text (header row plus five orders).
orders_csv_text = """
OrderID,CustomerName,ProductCategory,Amount,OrderDate,DeliveryStatus,Discount,City,PaymentMode,CustomerSince
1001,Ali,Electronics,60000,2023-01-12,Delivered,0.10,Hyderabad,Credit Card,2020-05-01
1002,Neha,Fashion,2500,2023-02-20,Pending,0.05,Mumbai,UPI,2019-07-12
1003,Ravi,Books,1200,2023-03-15,Cancelled,0.00,Delhi,Cash,2021-01-20
1004,Sneha,Toys,1800,2023-01-25,Delivered,0.15,Bangalore,Wallet,2018-11-05
1005,Amit,Groceries,3500,2023-02-02,Returned,0.05,Chennai,Credit Card,2020-02-28
"""

# Parse the text with pandas first, then convert the pandas frame to a Spark
# DataFrame. `csv_data` ends up bound to the pandas frame, `df` to the Spark one.
csv_data = pd.read_csv(StringIO(orders_csv_text))
df = spark.createDataFrame(csv_data)

# Inspect the column types Spark derived from the pandas frame, then the rows.
# The date columns are not parsed here, so they come through as strings.
df.printSchema()
df.show()

root
 |-- OrderID: long (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- ProductCategory: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- DeliveryStatus: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- City: string (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- CustomerSince: string (nullable = true)

+-------+------------+---------------+------+----------+--------------+--------+---------+-----------+-------------+
|OrderID|CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|     City|PaymentMode|CustomerSince|
+-------+------------+---------------+------+----------+--------------+--------+---------+-----------+-------------+
|   1001|         Ali|    Electronics| 60000|2023-01-12|     Delivered|     0.1|Hyderabad|Credit Card|   2020-05-01|
|   1002|        Neha|        Fashion|  2500|2023-02-20|       Pending|    0.05|   Mumbai|        UPI|   2019-0

In [8]:
# Save the orders DataFrame in Parquet format, replacing any earlier output.
# Note: the target directory is literally named "orders.csv" although the
# data written into it is Parquet, not CSV.
df.write.parquet("orders.csv", mode="overwrite")

In [9]:
# Read the orders back from the Parquet directory produced by the write cell.
# The write targets the relative path "orders.csv", so the read uses exactly
# the same relative path instead of a hardcoded "/content/..." location that
# only matches when the notebook's working directory happens to be /content.
df_parquet = spark.read.parquet("orders.csv")
df_parquet.show()

+-------+------------+---------------+------+----------+--------------+--------+---------+-----------+-------------+
|OrderID|CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|     City|PaymentMode|CustomerSince|
+-------+------------+---------------+------+----------+--------------+--------+---------+-----------+-------------+
|   1003|        Ravi|          Books|  1200|2023-03-15|     Cancelled|     0.0|    Delhi|       Cash|   2021-01-20|
|   1004|       Sneha|           Toys|  1800|2023-01-25|     Delivered|    0.15|Bangalore|     Wallet|   2018-11-05|
|   1005|        Amit|      Groceries|  3500|2023-02-02|      Returned|    0.05|  Chennai|Credit Card|   2020-02-28|
|   1001|         Ali|    Electronics| 60000|2023-01-12|     Delivered|     0.1|Hyderabad|Credit Card|   2020-05-01|
|   1002|        Neha|        Fashion|  2500|2023-02-20|       Pending|    0.05|   Mumbai|        UPI|   2019-07-12|
+-------+------------+---------------+------+----------+--------

In [17]:
# Two sample orders as a JSON array; each order carries a nested "customer"
# object and an "items" list of product/price objects.
complex_json = '''
[
  {
    "orderId": 101,
    "customer": {"name": "Ali", "city": "Hyderabad"},
    "items": [{"product": "Laptop", "price": 60000}, {"product": "Mouse", "price": 800}]
  },
  {
    "orderId": 102,
    "customer": {"name": "Neha", "city": "Mumbai"},
    "items": [{"product": "Chair", "price": 2000}]
  }
]
'''

# Write the JSON text to "complex.json" in the current working directory.
with open("complex.json", "w") as json_file:
    json_file.write(complex_json)


In [20]:
# Load the nested orders JSON. The "multiline" option is required because the
# file holds a single JSON array spread over many lines rather than one JSON
# record per line.
# The file is written to the relative path "complex.json", so it is read from
# the same relative path instead of a hardcoded "/content/..." location.
# NOTE(review): this assumes Spark runs against the local filesystem with the
# same working directory as the Python kernel — confirm for cluster setups.
df_complex = spark.read.option("multiline", "true").json("complex.json")

In [21]:
# Store the nested orders DataFrame as Parquet under "complex_json",
# replacing any output left there by a previous run.
df_complex.write.parquet("complex_json", mode="overwrite")

In [22]:
# Reload the nested orders from the "complex_json" Parquet directory. The
# write uses that relative path, so the read uses it too rather than a
# hardcoded "/content/..." location (the raw-string prefix was also needless).
# NOTE: this rebinds `df`, which earlier in the notebook held the flat orders
# DataFrame built from the CSV sample.
df = spark.read.parquet("complex_json")

# Show that the struct and array columns survive the Parquet round trip.
df.printSchema()
df.show()

root
 |-- customer: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- price: long (nullable = true)
 |    |    |-- product: string (nullable = true)
 |-- orderId: long (nullable = true)

+----------------+--------------------+-------+
|        customer|               items|orderId|
+----------------+--------------------+-------+
|{Hyderabad, Ali}|[{60000, Laptop},...|    101|
|  {Mumbai, Neha}|     [{2000, Chair}]|    102|
+----------------+--------------------+-------+



In [23]:
# Create the sales_db database only if it does not exist yet, so re-running
# this cell is harmless.
spark.sql("CREATE DATABASE IF NOT EXISTS sales_db")
# Make sales_db the current database for the SQL statements that follow.
spark.sql("USE sales_db")

DataFrame[]

In [25]:
# DDL for the Parquet-backed `orders` table. IF NOT EXISTS makes the
# statement a no-op when the table is already present.
create_orders_ddl = """
  CREATE TABLE IF NOT EXISTS orders(
    OrderID INT,
    CustomerName STRING,
    ProductCategory STRING,
    Amount DOUBLE
  ) USING PARQUET
"""

spark.sql(create_orders_ddl)

DataFrame[]

In [26]:
# Populate the `orders` table with two sample rows.
# INSERT OVERWRITE replaces the table contents instead of appending, so
# re-running this cell (or the whole notebook against a table that already
# exists) does not pile up duplicate rows the way a plain INSERT INTO would.
# The integer Amount literals are stored in the table's DOUBLE column.
spark.sql("""
  INSERT OVERWRITE TABLE orders
  VALUES
    (101, 'AlI', 'Electronics', 60000),
    (102, 'Neha', 'Fashion', 2500)
""")

DataFrame[]

In [28]:
# Print every column and row of the `orders` table in the current database.
spark.table("orders").show()

+-------+------------+---------------+-------+
|OrderID|CustomerName|ProductCategory| Amount|
+-------+------------+---------------+-------+
|    101|         AlI|    Electronics|60000.0|
|    102|        Neha|        Fashion| 2500.0|
+-------+------------+---------------+-------+

