In [14]:

# Intentionally messy order records used as input for the cleaning steps.
# Tuple layout: order_id, customer_id, customer_name, city, product,
# category, price, order_date, order_status.
raw_orders = [
    ("ORD001", "C001", "Ravi", " Delhi ", "Laptop", "Electronics", "45000", "2024-01-05", "Completed"),  # padded city
    ("ORD002", "C002", "Sneha", "Mumbai", " Mobile ", "Electronics", "32000", "05/01/2024", "Completed"),  # padded product, slash date
    ("ORD003", "C003", "Aman", "Bangalore", "Laptop", "Electronics", "55000", "2024/01/06", "Completed"),  # slash date
    ("ORD004", "C004", "Pooja", "Delhi", "Tablet", " Electronics ", "", "2024-01-07", "Cancelled"),  # empty price, cancelled
    ("ORD005", "C005", "Neha", "Chennai", "Laptop", "Electronics", "48000", "invalid_date", "Completed"),  # unparsable date
    ("ORD006", "C006", "Rahul", "Mumbai", "Mobile", "Electronics", None, "2024-01-08", "Completed"),  # missing price
    ("ORD007", "C007", "Kiran", "Bangalore", "Tablet", "Electronics", "30000", "2024-01-08", "Completed"),
    ("ORD008", "C008", "Amit", "Delhi", "Laptop", "electronics", "45000", "2024-01-09", "Completed"),  # lower-case category
    ("ORD009", "C009", "Priya", " Pune", "Mobile", "Electronics", "28000", "09-01-2024", "Completed"),  # padded city, dashed date
    ("ORD010", "C010", "Suresh", "Mumbai", "Laptop", "Electronics", "55000", "2024-01-10", "Completed"),
    ("ORD010", "C010", "Suresh", "Mumbai", "Laptop", "Electronics", "55000", "2024-01-10", "Completed"),  # duplicate
    ("ORD011", "C011", "Meena", "Chennai", "Tablet", "Electronics", "31000", "2024-01-11", "Completed"),
    ("ORD012", "C012", "Arjun", "Delhi", "Mobile", "Electronics", "27000", "2024/01/11", "Completed"),  # slash date
    ("ORD013", "C013", "Nikhil", "Bangalore", "Laptop", "Electronics", "60000", "2024-01-12", "Completed"),
    ("ORD014", "C014", "Rohit", "Mumbai", "Mobile", "Electronics", "invalid_price", "2024-01-12", "Completed"),  # non-numeric price
    ("ORD015", "C015", "Anita", "Delhi", "Tablet", "Electronics", "29000", "2024-01-13", "Completed"),
    ("ORD016", "C016", "Vikas", "Chennai", "Laptop", "Electronics", "52000", "2024-01-13", "Completed"),
    ("ORD017", "C017", "Sunita", "Mumbai", "Mobile", "Electronics", "33000", "2024-01-14", "Completed"),
    ("ORD018", "C018", "Deepak", "Bangalore", "Laptop", "Electronics", "58000", "2024-01-14", "Completed"),
    ("ORD019", "C019", "Pallavi", "Delhi", "Mobile", "Electronics", "26000", "2024-01-15", "Completed"),
    ("ORD020", "C020", "Manish", "Mumbai", "Tablet", "Electronics", "34000", "2024-01-15", "Completed"),
]


In [15]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell in this notebook.
#
# ANSI mode is switched off here, at session creation. The notebook's last
# cell also sets spark.sql.ansi.enabled to "false", but it sits after the
# transformations, so a top-to-bottom run would otherwise execute them with
# whatever default the installed Spark version uses. With ANSI mode on,
# functions such as to_date raise on strings that do not match the pattern
# instead of returning null.
spark = (
    SparkSession.builder
    .config("spark.sql.ansi.enabled", "false")
    .getOrCreate()
)

In [16]:
# Column names for the order tuples, listed in positional order.
columns = [
    "order_id",
    "customer_id",
    "customer_name",
    "city",
    "product",
    "category",
    "price",
    "order_date",
    "order_status",
]

In [71]:
# Build the working DataFrame from the raw tuples; column types are inferred
# from the data because only column names are supplied as the schema.
df = spark.createDataFrame(data=raw_orders, schema=columns)

In [None]:
# Preview the freshly created DataFrame (first 20 rows, long values truncated).
df.show(n=20, truncate=True)

In [21]:
# Print the DataFrame's column names, types and nullability as a tree.
# NOTE(review): the saved output below lists `price` as integer although the
# raw tuples hold prices as strings; presumably this cell was last executed
# after the price-cast cell. Confirm by re-running the notebook in order.
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_status: string (nullable = true)



In [73]:


# Imported in this cell so it runs on a fresh kernel: these names were
# previously only imported by a later cell, which made a top-to-bottom run
# fail with NameError here.
from pyspark.sql.functions import col, lower, when

# Normalise `price` to an integer column.
# - Known placeholder strings (compared case-insensitively) become null.
# - Everything else goes through try_cast, which yields null instead of
#   raising when the value is not a valid integer (e.g. "invalid_price").
df = df.withColumn(
    "price",
    when(lower(col("price")).isin(["unknown", "na", "null", ""]), None)
    .otherwise(col("price").try_cast("int")),
)





In [None]:
from pyspark.sql.functions import when, col, lower

# Derive the tax-inclusive price by scaling `price` with a fixed multiplier.
tax_multiplier = 1.18
df = df.withColumn("price_with_tax", col("price") * tax_multiplier)
df.show(n=20, truncate=True)

In [None]:
# Bucket each order by price:
#   price >= 50000 -> "High"
#   price >= 30000 -> "Medium"
#   any other non-null price -> "Low"
# Rows whose price is null get a null category. A comparison against null is
# never true, so a plain otherwise("Low") fallback would label orders with a
# missing or invalid price as "Low".
df = df.withColumn(
    "price_category",
    when(col("price") >= 50000, "High")
    .when(col("price") >= 30000, "Medium")
    .when(col("price").isNotNull(), "Low"),  # no otherwise(): unmatched rows stay null
)
df.show()

In [76]:

from pyspark.sql import functions as F

# Clean the free-text columns: strip surrounding whitespace, then capitalise
# the first letter of each word (e.g. " Delhi " -> "Delhi",
# "electronics" -> "Electronics").
for text_col in ("city", "product", "category"):
    df = df.withColumn(text_col, F.initcap(F.trim(F.col(text_col))))


In [None]:

from pyspark.sql import functions as F

# Date layouts that occur in the raw data, tried in this order; the first
# layout that parses successfully wins.
order_date_formats = ["yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd", "dd-MM-yyyy"]

# Convert `order_date` from text to a proper date column.
# try_to_timestamp returns null for input that does not match the pattern,
# regardless of the ANSI setting, so a non-matching layout simply falls
# through to the next candidate. Values matching none of the layouts
# (e.g. "invalid_date") end up null.
# The format is wrapped in F.lit because the parameter is column-typed.
trimmed_order_date = F.trim(F.col("order_date"))
df = df.withColumn(
    "order_date",
    F.coalesce(
        *[
            F.try_to_timestamp(trimmed_order_date, F.lit(fmt)).cast("date")
            for fmt in order_date_formats
        ]
    ),
)

df.show()







In [None]:
# Re-display the DataFrame in its current state (first 20 rows, truncated).
df.show(n=20, truncate=True)

In [None]:
from pyspark.sql import functions as F

# Step 1: keep a single row per order_id and show the result.
df = df.dropDuplicates(subset=["order_id"])
df.show(n=20, truncate=True)

# Step 2: keep only orders whose status is exactly 'Completed' and show again.
is_completed = F.col("order_status") == F.lit("Completed")
df = df.where(is_completed)
df.show(n=20, truncate=True)


In [None]:
from pyspark.sql import functions as F

# Split the parsed order date into separate year and month columns.
df = (
    df
    .withColumn("order_year", F.year(F.col("order_date")))
    .withColumn("order_month", F.month(F.col("order_date")))
)

df.show(n=20, truncate=True)


In [83]:
# Total revenue per city: sum of `price` over the rows of each city.
city_revenue = (
    df
    .groupBy(F.col("city"))
    .agg(F.sum(F.col("price")).alias("total_revenue"))
)
city_revenue.show(n=20, truncate=True)


+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|       203000|
|  Chennai|       131000|
|   Mumbai|       154000|
|     Pune|        28000|
|    Delhi|       172000|
+---------+-------------+



In [84]:
# Total revenue per product: sum of `price` over the rows of each product.
product_revenue = (
    df
    .groupBy(F.col("product"))
    .agg(F.sum(F.col("price")).alias("total_revenue"))
)

product_revenue.show(n=20, truncate=True)

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       418000|
| Mobile|       146000|
| Tablet|       124000|
+-------+-------------+



In [85]:

# The three cities with the highest total revenue, largest first.
top_cities = city_revenue.sort(F.desc("total_revenue")).limit(3)

top_cities.show(n=20, truncate=True)


+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|       203000|
|    Delhi|       172000|
|   Mumbai|       154000|
+---------+-------------+



In [87]:

# Minimum average price a product must exceed to be listed.
threshold = 30000

# Average `price` per product.
product_avg_price = (
    df
    .groupBy(F.col("product"))
    .agg(F.avg(F.col("price")).alias("avg_price"))
)

# Keep only products whose average price is strictly above the threshold.
products_gt_threshold = product_avg_price.where(F.col("avg_price") > F.lit(threshold))

products_gt_threshold.show(n=20, truncate=True)


+-------+---------+
|product|avg_price|
+-------+---------+
| Laptop|  52250.0|
| Tablet|  31000.0|
+-------+---------+



In [89]:
# Persist the cleaned orders as Parquet.
# "overwrite" replaces any previous output: with the default save mode the
# write raises once the path exists, so re-running the notebook would fail.
df.write.mode("overwrite").parquet("orders_cleaned.parquet")

In [90]:

# Read the Parquet output back to check the stored schema and a few rows.
df_parquet = spark.read.format("parquet").load("orders_cleaned.parquet")

df_parquet.printSchema()
df_parquet.show(n=5)


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_status: string (nullable = true)
 |-- price_with_tax: double (nullable = true)
 |-- price_category: string (nullable = true)
 |-- order_year: integer (nullable = true)
 |-- order_month: integer (nullable = true)

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+----------+-----------+
|order_id|customer_id|customer_name|     city|product|   category|price|order_date|order_status|price_with_tax|price_category|order_year|order_month|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+----------+-----------+


In [91]:

# Persist the cleaned orders as ORC.
# "overwrite" replaces any previous output: with the default save mode the
# write raises once the path exists, so re-running the notebook would fail.
df.write.format("orc").mode("overwrite").save("cleaned_orders_data.orc")


In [None]:

# Persist the cleaned orders as Avro.
# "overwrite" replaces any previous output: with the default save mode the
# write raises once the path exists, so re-running the notebook would fail.
# NOTE(review): the "avro" data source is provided by the external spark-avro
# module; presumably the session must be started with that package on the
# classpath for this cell to succeed. Confirm for this environment.
df.write.format("avro").mode("overwrite").save("cleaned_orders_data.avro")


In [93]:

# Report how many partitions back the cleaned DataFrame.
orders_rdd = df.rdd
num_partitions = orders_rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")


Number of partitions: 1


In [None]:

# Print the extended query plans (logical and physical) for the DataFrame.
df.explain(extended=True)


In [63]:

# Show the Spark version alongside the ANSI setting. Only the last expression
# of a cell is echoed, so the version has to be printed explicitly; as a bare
# expression on a non-final line its value was silently discarded.
print(f"Spark version: {spark.version}")
spark.conf.get("spark.sql.ansi.enabled")  # 'true' may cause stricter behavior


'false'

In [62]:
# Turn ANSI SQL mode off for the current session.
# NOTE(review): this cell is the last one in the notebook, yet its execution
# count (62) is lower than those of the transformation cells, so it appears to
# have been run before them. A top-to-bottom run applies the setting only
# after all other cells have finished.
spark.conf.set(key="spark.sql.ansi.enabled", value="false")
