# Master DataFrame Assignment – Retail Sales Superstore Dataset (All-In-One)

Dataset Creation

In [4]:
# Assemble the five-order sample dataset (header row first) and persist it
# as superstore.csv in the working directory.
data = "\n".join([
    "OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit",
    "CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000",
    "CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800",
    "CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150",
    "CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500",
    "CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000",
])

with open("superstore.csv", mode="w") as csv_file:
    csv_file.write(data)


# PART 1: Pandas DataFrame Operations

1. Load the CSV using pandas

In [None]:
import pandas as pd

# Read the sample dataset written earlier in the notebook into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="superstore.csv")

2. Print schema, head, shape, dtypes

In [None]:
# Structural overview of the loaded frame: sample rows, shape, column dtypes.
print("First 5 rows:")
print(df.head())

print("\n Shape of DataFrame:",df.shape)

# "\n" (backslash) prints a blank line before the heading; the previous "/n"
# was emitted literally as the two characters "/n".
print("\nData Types:")
print(df.dtypes)


First 5 rows:
   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  

 Shape of DataFrame: (5, 12)
/nData Types:
OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          objec

3. Select Customer, Product, Profit columns

In [None]:
# Project the frame down to the three columns of interest (all rows kept).
selected_columns = df.loc[:, ["Customer", "Product", "Profit"]]
print(selected_columns)

  Customer   Product  Profit
0     Ravi    Laptop    5000
1    Priya   Printer    1800
2     Amit  Notebook     150
3    Anita     Table   -1500
4    Divya     Phone    3000


4. Filter where Profit > 2000 and Discount = 0

In [None]:
# Keep orders that earned more than 2000 in profit and carried no discount.
filtered = df[df["Profit"].gt(2000) & df["Discount"].eq(0.0)]
print(filtered)

   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


5. Sort by Profit descending

In [None]:
# Order rows from the most profitable order to the least.
sorted_df = df.sort_values("Profit", ascending=False)
print(sorted_df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
4       Technology       Phones         2      20000      0.00    3000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  


6. GroupBy Category → Total Profit, Avg Discount

In [None]:
# Per-category summary: total profit and mean discount.
df.groupby("Category").agg(
    Profit=("Profit", "sum"),
    Discount=("Discount", "mean"),
)

Unnamed: 0_level_0,Profit,Discount
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Furniture,-1500,0.2
Office Supplies,150,0.05
Technology,9800,0.083333


7. Add a column `TotalPrice = Quantity * UnitPrice`

In [None]:
# TotalPrice is units ordered times unit price (Discount is not applied here).
df["TotalPrice"] = df["Quantity"].mul(df["UnitPrice"])
print(df.loc[:, ["OrderID", "TotalPrice"]])

   OrderID  TotalPrice
0  CA-1001       55000
1  CA-1002       24000
2  CA-1003         600
3  CA-1004       18000
4  CA-1005       40000


8. Drop the SubCategory column

In [None]:
# Remove the SubCategory column. errors="ignore" keeps this cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
df = df.drop(columns=["SubCategory"], errors="ignore")
print(df.columns)

Index(['OrderID', 'OrderDate', 'Customer', 'Segment', 'Region', 'Product',
       'Category', 'Quantity', 'UnitPrice', 'Discount', 'Profit',
       'TotalPrice'],
      dtype='object')


9. Fill nulls in Discount with `0.10`

In [None]:
# Replace missing Discount values with 0.10; non-null values are left as they are.
df["Discount"] = df["Discount"].where(df["Discount"].notna(), other=0.10)
print(df["Discount"])

0    0.10
1    0.15
2    0.05
3    0.20
4    0.00
Name: Discount, dtype: float64


10. Apply a function to classify orders

In [None]:
def classify(row):
    """Label an order by its profit.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing a 'Profit' entry.

    Returns:
        'High' when Profit exceeds 4000, 'Medium' when it is positive but not
        above 4000, and 'Low' otherwise (zero or negative).
    """
    profit = row['Profit']
    if profit > 4000:
        return 'High'
    return 'Medium' if profit > 0 else 'Low'

# Run classify once per row (axis="columns" passes each row as a Series).
df["CategoryLevel"] = df.apply(classify, axis="columns")
print(df.loc[:, ["OrderID", "Profit", "CategoryLevel"]])

   OrderID  Profit CategoryLevel
0  CA-1001    5000          High
1  CA-1002    1800        Medium
2  CA-1003     150        Medium
3  CA-1004   -1500           Low
4  CA-1005    3000        Medium


# PART 2: PySpark DataFrame Operations

PySpark Setup

In [14]:
# Download Spark from the correct URL
!wget https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz



--2025-07-31 09:00:39--  https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400395283 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.0-bin-hadoop3.tgz’


2025-07-31 09:04:46 (1.54 MB/s) - ‘spark-3.5.0-bin-hadoop3.tgz’ saved [400395283/400395283]



In [15]:
# Extract Spark
!tar -xzf spark-3.5.0-bin-hadoop3.tgz

In [16]:

!apt-get install openjdk-11-jdk -y
!pip install -q findspark

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jre
  x11-utils
Suggested packages:
  libxt-doc openjdk-11-demo openjdk-11-source visualvm mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-11-jdk
  openjdk-11-jre x11-utils
0 upgraded, 10 newly installed, 0 to remove and 35 not upgraded.
Need to get 5,367 kB of archives.
After this operation, 15.2 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-core all 2.37-2build1 [1,041 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 fonts-dejavu-extra all 2.37-2build1 [2,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu jam

In [17]:
# Configure the Java and Spark locations, then start (or reuse) a Spark session.
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.0-bin-hadoop3",
    }
)

# findspark.init() runs after the environment variables are set and before
# pyspark is imported; keep this ordering.
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("RetailSalesSuperstore")
    .getOrCreate()
)

1: Load CSV using PySpark

In [18]:
# Load the CSV with Spark: first line is the header, column types are inferred.
df = spark.read.options(header=True, inferSchema=True).csv("superstore.csv")

2: Show Schema and First 5 Rows

In [None]:
# Print the inferred schema, then display up to five rows.
df.printSchema()
df.show(n=5)

root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  

3: Select columns and Rename Customer → Client

In [None]:
# Rename Customer to Client on the working frame, then show three columns.
df = df.withColumnRenamed(existing="Customer", new="Client")
df.select(["Client", "Product", "Profit"]).show()

+------+--------+------+
|Client| Product|Profit|
+------+--------+------+
|  Ravi|  Laptop|  5000|
| Priya| Printer|  1800|
|  Amit|Notebook|   150|
| Anita|   Table| -1500|
| Divya|   Phone|  3000|
+------+--------+------+



4: Filter where Segment = 'Consumer' and Profit < 1000

In [None]:
# Consumer-segment orders whose profit is below 1000.
filtered_df = df.where(
    (df["Segment"] == "Consumer") & (df["Profit"] < 1000)
)
filtered_df.show()

+-------+----------+------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Client| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|  Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



5: Group by Region → Show average Profit

In [None]:
# Average profit per region.
df.groupBy("Region").agg({"Profit": "avg"}).show()

+------+-----------+
|Region|avg(Profit)|
+------+-----------+
| South|     4000.0|
|  East|      150.0|
|  West|    -1500.0|
| North|     1800.0|
+------+-----------+



 6: Add TotalPrice = Quantity * UnitPrice

In [None]:
from pyspark.sql.functions import col

# TotalPrice is units ordered times unit price (Discount is not applied here).
df = df.withColumn("TotalPrice", df["Quantity"] * df["UnitPrice"])
df.select(["OrderID", "TotalPrice"]).show()

+-------+----------+
|OrderID|TotalPrice|
+-------+----------+
|CA-1001|     55000|
|CA-1002|     24000|
|CA-1003|       600|
|CA-1004|     18000|
|CA-1005|     40000|
+-------+----------+



 7: Classify Profit as High / Medium / Loss


In [None]:
from pyspark.sql.functions import when

# Bucket each order by profit:
#   Profit > 2000  -> "High"
#   Profit <= 0    -> "Loss"
#   anything else  -> "Medium"
# The column is referenced as "Profit" in both conditions; the earlier
# "PRofit" spelling only resolved because Spark matches column names
# case-insensitively by default.
df = df.withColumn(
    "ProfitCategory",
    when(col("Profit")>2000, "High")
    .when(col("Profit")<=0, "Loss")
    .otherwise("Medium")
)
df.select("OrderID","Profit", "ProfitCategory").show()

+-------+------+--------------+
|OrderID|Profit|ProfitCategory|
+-------+------+--------------+
|CA-1001|  5000|          High|
|CA-1002|  1800|        Medium|
|CA-1003|   150|        Medium|
|CA-1004| -1500|          Loss|
|CA-1005|  3000|          High|
+-------+------+--------------+



 8: Drop the SubCategory column

In [None]:
# Remove the SubCategory column from the Spark frame.
df = df.drop(*["SubCategory"])

 9: Handle nulls in Discount → Fill with 0.10

In [None]:
# Replace null Discount values with 0.10; other columns are untouched.
df = df.na.fill(0.10, subset=["Discount"])

10: Convert OrderDate to DateType & Extract Year, Month

In [None]:
from pyspark.sql.functions import to_date, year, month

# Parse OrderDate with an explicit yyyy-MM-dd pattern, then derive Year and
# Month from the converted column.
df = (
    df
    .withColumn("OrderDate", to_date(col("OrderDate"), "yyyy-MM-dd"))
    .withColumn("Year", year(col("OrderDate")))
    .withColumn("Month", month(col("OrderDate")))
)
df.select(["OrderID", "OrderDate", "Year", "Month"]).show()

+-------+----------+----+-----+
|OrderID| OrderDate|Year|Month|
+-------+----------+----+-----+
|CA-1001|2023-01-15|2023|    1|
|CA-1002|2023-02-20|2023|    2|
|CA-1003|2023-01-25|2023|    1|
|CA-1004|2023-03-01|2023|    3|
|CA-1005|2023-02-05|2023|    2|
+-------+----------+----+-----+



# PART 3: Dask DataFrame Operations (Pandas Alternative)

# Step 1: Dask Installation

In [2]:
!pip install dask



# Step 2: Load CSV using Dask


In [5]:
import dask.dataframe as dd

# Open the CSV as a Dask DataFrame and preview its first five rows.
df_dask = dd.read_csv(urlpath='superstore.csv')
df_dask.head(n=5)

Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
0,CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.1,5000
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
2,CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
3,CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.2,-1500
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


1. Compute average discount by category

In [6]:
# Mean discount per category; .compute() materialises the result.
avg_discount = (
    df_dask
    .groupby("Category")["Discount"]
    .mean()
    .compute()
)
print(avg_discount)

Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64


2. Filter: Orders with Quantity > 1 and Profit > 1000

In [9]:
# Orders with more than one unit sold and a profit above 1000.
filtered = df_dask[df_dask["Quantity"].gt(1) & df_dask["Profit"].gt(1000)]
filtered.compute()

Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


3. Save filtered data to a new CSV

In [10]:
# Materialise the filtered rows and write them to a single CSV without the index.
(
    filtered
    .compute()
    .to_csv(path_or_buf="filtered_high_profit_orders.csv", index=False)
)

# PART 4: JSON Handling

Step 1: Create a nested JSON file

In [11]:
# Preview of the nested order records; evaluating the literal only displays it.
# The orders.json file itself is written by the following cell.
[
    {"OrderID": "CA-1001",
     "Customer": {"Name": "Ravi", "Segment": "Consumer"},
     "Details": {"Region": "South", "Profit": 5000}},
    {"OrderID": "CA-1002",
     "Customer": {"Name": "Priya", "Segment": "Corporate"},
     "Details": {"Region": "North", "Profit": 1800}},
]


[{'OrderID': 'CA-1001',
  'Customer': {'Name': 'Ravi', 'Segment': 'Consumer'},
  'Details': {'Region': 'South', 'Profit': 5000}},
 {'OrderID': 'CA-1002',
  'Customer': {'Name': 'Priya', 'Segment': 'Corporate'},
  'Details': {'Region': 'North', 'Profit': 1800}}]

In [12]:
# Nested order records as JSON text: a top-level array whose objects hold
# Customer and Details sub-objects. Written verbatim to orders.json.
json_data = """
[
  {
    "OrderID": "CA-1001",
    "Customer": {"Name": "Ravi", "Segment": "Consumer"},
    "Details": {"Region": "South", "Profit": 5000}
  },
  {
    "OrderID": "CA-1002",
    "Customer": {"Name": "Priya", "Segment": "Corporate"},
    "Details": {"Region": "North", "Profit": 1800}
  }
]
"""

with open("orders.json", mode="w") as json_file:
    json_file.write(json_data)


Step 2: Load JSON using PySpark

In [19]:
# The file holds one JSON array spread over several lines, so multiLine is enabled.
df_json = spark.read.option("multiLine", True).json("orders.json")
df_json.printSchema()

root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)



Step 3: Select nested fields

In [20]:
# Dotted paths reach into the Customer and Details structs.
df_json.select(["OrderID", "Customer.Name", "Details.Profit"]).show()


+-------+-----+------+
|OrderID| Name|Profit|
+-------+-----+------+
|CA-1001| Ravi|  5000|
|CA-1002|Priya|  1800|
+-------+-----+------+

