# Master DataFrame Assignment – Retail Sales Superstore Dataset (All-In-One)

##  Sample Dataset (create CSV)

In [4]:
# Save superstore.csv
# The sample dataset is listed row by row: one header line followed by five orders.
csv_rows = [
    "OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit",
    "CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000",
    "CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800",
    "CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150",
    "CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500",
    "CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000",
]
# Every row, including the last one, is terminated by a newline.
data = "".join(row + "\n" for row in csv_rows)

# Write the text to superstore.csv in the current working directory.
with open("superstore.csv", "w") as csv_file:
    csv_file.write(data)

print("CSV file 'superstore.csv' created successfully.")

CSV file 'superstore.csv' created successfully.


### Download the 'superstore.csv' file

In [5]:
# Uses the `files` helper from the google.colab package.
# NOTE(review): this import is expected to fail outside a Google Colab runtime — confirm.
from google.colab import files
# Hand superstore.csv (written to the working directory earlier in the notebook) to the download helper.
files.download('superstore.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## TASKS ACROSS Pandas, PySpark, and Dask

## PART 1: Pandas DataFrame Operations

#### 1. Load the CSV using `pandas`.

In [6]:
import pandas as pd

# Read the sample CSV from the working directory into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="superstore.csv")

#### 2. Print schema, head, shape, dtypes.

In [8]:
# For detailed schema-like view in Pandas
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   OrderID      5 non-null      object 
 1   OrderDate    5 non-null      object 
 2   Customer     5 non-null      object 
 3   Segment      5 non-null      object 
 4   Region       5 non-null      object 
 5   Product      5 non-null      object 
 6   Category     5 non-null      object 
 7   SubCategory  5 non-null      object 
 8   Quantity     5 non-null      int64  
 9   UnitPrice    5 non-null      int64  
 10  Discount     5 non-null      float64
 11  Profit       5 non-null      int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 612.0+ bytes
None


In [9]:
# Preview the first five rows (five is also head()'s default).
print(df.head(n=5))

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  


In [10]:
# shape is a (rows, columns) tuple; unpack it for readability and print it back as a tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(5, 12)


In [11]:
# One dtype per column, as inferred by read_csv.
column_dtypes = df.dtypes
print(column_dtypes)

OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          object
Product         object
Category        object
SubCategory     object
Quantity         int64
UnitPrice        int64
Discount       float64
Profit           int64
dtype: object


#### 3. Select `Customer`, `Product`, `Profit` columns.

In [14]:
# Keep every row, but only the three columns of interest.
print(df.loc[:, ["Customer", "Product", "Profit"]])

  Customer   Product  Profit
0     Ravi    Laptop    5000
1    Priya   Printer    1800
2     Amit  Notebook     150
3    Anita     Table   -1500
4    Divya     Phone    3000


#### 4. Filter orders where `Profit > 2000` and `Discount = 0`.

In [15]:
# Build each condition as its own boolean mask, then keep rows satisfying both.
high_profit = df["Profit"].gt(2000)
no_discount = df["Discount"].eq(0)
print(df.loc[high_profit & no_discount])

   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


#### 5. Sort by Profit descending.

In [16]:
# Highest profit first; sort_values returns a new frame and leaves df untouched.
by_profit_desc = df.sort_values("Profit", ascending=False)
print(by_profit_desc)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
4       Technology       Phones         2      20000      0.00    3000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  


#### 6. GroupBy Category → Total Profit, Avg Discount.

In [17]:
# Per-category summary: total profit and mean discount (named aggregation form).
category_summary = df.groupby("Category").agg(
    Profit=("Profit", "sum"),
    Discount=("Discount", "mean"),
)
print(category_summary)

                 Profit  Discount
Category                         
Furniture         -1500  0.200000
Office Supplies     150  0.050000
Technology         9800  0.083333


#### 7. Add a column `TotalPrice = Quantity * UnitPrice`.

In [18]:
# TotalPrice is the undiscounted line total: Quantity times UnitPrice.
df = df.assign(TotalPrice=df["Quantity"].mul(df["UnitPrice"]))
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  \
0       Technology    Computers         1      55000      0.10    5000   
1       Technology  Peripherals         2      12000      0.15    1800   
2  Office Supplies        Paper         3        200      0.05     150   
3        Furniture       Tables         1      18000      0.20   -1500   
4       Technology       Phones         2      20000      0.00    3000   

   TotalPrice  
0       55000  
1       24000  
2         600  
3       18000  
4       40000  


#### 8. Drop the SubCategory column.

In [19]:
# Rebind df instead of mutating it in place, and tolerate the column already
# being gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=["SubCategory"], errors="ignore")
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  Quantity  UnitPrice  Discount  Profit  TotalPrice  
0       Technology         1      55000      0.10    5000       55000  
1       Technology         2      12000      0.15    1800       24000  
2  Office Supplies         3        200      0.05     150         600  
3        Furniture         1      18000      0.20   -1500       18000  
4       Technology         2      20000      0.00    3000       40000  


#### 9. Fill nulls in Discount with 0.10.

In [21]:
# Replace missing Discount values with 0.10; all other columns are left as they are.
df = df.fillna({"Discount": 0.10})
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  Quantity  UnitPrice  Discount  Profit  TotalPrice  
0       Technology         1      55000      0.10    5000       55000  
1       Technology         2      12000      0.15    1800       24000  
2  Office Supplies         3        200      0.05     150         600  
3        Furniture         1      18000      0.20   -1500       18000  
4       Technology         2      20000      0.00    3000       40000  


#### 10. Apply a function to categorize orders (`Profit > 4000` → `'High'`, `Profit > 0` → `'Medium'`, otherwise `'Low'`):

In [23]:
def classify(row):
    """Return the order class for one row based on its 'Profit' value.

    Args:
        row: any mapping-like object supporting ``row['Profit']``
            (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns:
        'High' when Profit is above 4000, 'Medium' when it is above 0,
        and 'Low' otherwise (zero or negative profit).
    """
    profit = row['Profit']
    if profit > 4000:
        return 'High'
    if profit > 0:
        return 'Medium'
    return 'Low'

# Run classify once per row (axis="columns" is the same as axis=1) and store the labels.
order_classes = df.apply(classify, axis="columns")
df["OrderClass"] = order_classes
print(df)

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  Quantity  UnitPrice  Discount  Profit  TotalPrice  \
0       Technology         1      55000      0.10    5000       55000   
1       Technology         2      12000      0.15    1800       24000   
2  Office Supplies         3        200      0.05     150         600   
3        Furniture         1      18000      0.20   -1500       18000   
4       Technology         2      20000      0.00    3000       40000   

  OrderClass  
0       High  
1     Medium  
2     Medium  
3        Low  
4     Medium  


## PART 2: PySpark DataFrame Operations

#### 1. Load the same CSV using PySpark.

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, year, month

# Reuse the active Spark session if there is one; otherwise start one named "RetailSales".
session_builder = SparkSession.builder.appName("RetailSales")
spark = session_builder.getOrCreate()

# header=True takes column names from the first line; inferSchema=True lets Spark derive the column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_spark = csv_reader.csv("superstore.csv")

#### 2. Show schema and first 5 rows.

In [25]:
# Column names and inferred types first, then a preview capped at five rows.
df_spark.printSchema()
df_spark.show(n=5)

root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  

#### 3. Select columns, rename `Customer` → `Client`.

In [26]:
# Rename first, then project, so the displayed column is labelled "Client".
# df_spark itself is not modified; the rename lives only in this temporary view.
client_view = df_spark.withColumnRenamed("Customer", "Client")
client_view.select("Client", "Product", "Profit").show()

+------+--------+------+
|Client| Product|Profit|
+------+--------+------+
|  Ravi|  Laptop|  5000|
| Priya| Printer|  1800|
|  Amit|Notebook|   150|
| Anita|   Table| -1500|
| Divya|   Phone|  3000|
+------+--------+------+



#### 4. Filter `Segment = 'Consumer'` and `Profit < 1000`.

In [27]:
# Name each predicate, then keep only rows matching both (where() is an alias of filter()).
is_consumer = col("Segment") == "Consumer"
low_profit = col("Profit") < 1000
df_spark.where(is_consumer & low_profit).show()

+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



#### 5. GroupBy `Region` and show average profit.

In [28]:
# Mean Profit per Region; the result column is named "avg(Profit)".
profit_by_region = df_spark.groupBy("Region").avg("Profit")
profit_by_region.show()

+------+-----------+
|Region|avg(Profit)|
+------+-----------+
| South|     4000.0|
|  East|      150.0|
|  West|    -1500.0|
| North|     1800.0|
+------+-----------+



#### 6. Use `withColumn` to create `TotalPrice = Quantity * UnitPrice`.

In [30]:
# Column expression for the undiscounted line total.
total_price = col("Quantity") * col("UnitPrice")
df_spark = df_spark.withColumn("TotalPrice", total_price)
df_spark.show()

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|TotalPrice|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|     55000|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|     24000|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|       600|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|     18000|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|   

#### 7. Use `when().otherwise()` to classify `Profit` as:

- `Profit > 2000` → `'High'`
- `Profit <= 0` → `'Loss'`
- otherwise → `'Medium'`

In [31]:
# Conditions are evaluated in order: above 2000 is "High", zero or below is "Loss",
# and everything in between falls through to "Medium".
profit_class = (
    when(col("Profit") > 2000, "High")
    .when(col("Profit") <= 0, "Loss")
    .otherwise("Medium")
)
df_spark = df_spark.withColumn("ProfitClass", profit_class)
df_spark.show()

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+-----------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|TotalPrice|ProfitClass|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+----------+-----------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|     55000|       High|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|Peripherals|       2|    12000|    0.15|  1800|     24000|     Medium|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|       600|     Medium|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|     Tables|       1|    18000|     0.2| -1500|     18000|     

#### 8. Use `drop()` to remove `SubCategory`.

In [33]:
# Keep every column except SubCategory (a no-op if it has already been removed).
remaining_columns = [name for name in df_spark.columns if name != "SubCategory"]
df_spark = df_spark.select(remaining_columns)
df_spark.show()

+-------+----------+--------+-----------+------+--------+---------------+--------+---------+--------+------+----------+-----------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|Quantity|UnitPrice|Discount|Profit|TotalPrice|ProfitClass|
+-------+----------+--------+-----------+------+--------+---------------+--------+---------+--------+------+----------+-----------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|       1|    55000|     0.1|  5000|     55000|       High|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|       2|    12000|    0.15|  1800|     24000|     Medium|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|       3|      200|    0.05|   150|       600|     Medium|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|       1|    18000|     0.2| -1500|     18000|       Loss|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|   

#### 9. Handle nulls in `Discount` using `fillna(0.10)`.

In [35]:
# Replace nulls in Discount with 0.10; na.fill is the same operation as fillna.
df_spark = df_spark.na.fill(0.10, subset=["Discount"])
df_spark.show()

+-------+----------+--------+-----------+------+--------+---------------+--------+---------+--------+------+----------+-----------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|Quantity|UnitPrice|Discount|Profit|TotalPrice|ProfitClass|
+-------+----------+--------+-----------+------+--------+---------------+--------+---------+--------+------+----------+-----------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|       1|    55000|     0.1|  5000|     55000|       High|
|CA-1002|2023-02-20|   Priya|  Corporate| North| Printer|     Technology|       2|    12000|    0.15|  1800|     24000|     Medium|
|CA-1003|2023-01-25|    Amit|   Consumer|  East|Notebook|Office Supplies|       3|      200|    0.05|   150|       600|     Medium|
|CA-1004|2023-03-01|   Anita|Home Office|  West|   Table|      Furniture|       1|    18000|     0.2| -1500|     18000|       Loss|
|CA-1005|2023-02-05|   Divya|   Consumer| South|   Phone|     Technology|   

#### 10. Convert `OrderDate` to date type and extract `year`, `month`.

In [36]:
from pyspark.sql.functions import to_date

# Cast OrderDate to a date, then derive Year and Month from it in one chained expression.
df_spark = (
    df_spark
    .withColumn("OrderDate", to_date("OrderDate"))
    .withColumn("Year", year("OrderDate"))
    .withColumn("Month", month("OrderDate"))
)
df_spark.select("OrderID", "OrderDate", "Year", "Month").show()

+-------+----------+----+-----+
|OrderID| OrderDate|Year|Month|
+-------+----------+----+-----+
|CA-1001|2023-01-15|2023|    1|
|CA-1002|2023-02-20|2023|    2|
|CA-1003|2023-01-25|2023|    1|
|CA-1004|2023-03-01|2023|    3|
|CA-1005|2023-02-05|2023|    2|
+-------+----------+----+-----+



## PART 3: Dask DataFrame Operations (Pandas Alternative)

#### 1. Install Dask:

In [37]:
!pip install dask



#### 2. Load the same `superstore.csv`:

In [38]:
import dask.dataframe as dd
# Build a Dask DataFrame over the same CSV file that the pandas and Spark sections load.
csv_path = 'superstore.csv'
df_dask = dd.read_csv(csv_path)

#### 3. Do the following:

In [39]:
# Compute average discount by category.
# .compute() turns the Dask expression into a concrete result so it can be printed.
print(df_dask.groupby("Category")["Discount"].mean().compute())

# Filter orders with more than 1 quantity and high profit.
# "High profit" here means Profit above 2000.
filtered = df_dask[(df_dask["Quantity"] > 1) & (df_dask["Profit"] > 2000)]

# Save filtered data to new CSV.
# single_file=True writes one CSV at the given path instead of one file per partition.
# index=False keeps the positional index out of the file; without it the CSV gains
# an extra unnamed first column.
filtered.to_csv("filtered_high_profit_orders.csv", single_file=True, index=False)

Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64


['/content/filtered_high_profit_orders.csv']

In [40]:
# Download "filtered_high_profit_orders.csv" to the local machine.
# Uses the `files` helper from the google.colab package.
# NOTE(review): this import is expected to fail outside a Google Colab runtime — confirm.
from google.colab import files
files.download("filtered_high_profit_orders.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## PART 4: JSON Handling (Complex Nested)

#### 1. Create a nested JSON file:

In [41]:
# Step 1: Define the nested JSON data
import json

# Each order carries a flat OrderID plus two nested objects: "Customer" and "Details".
# Keyword order is preserved, so the keys appear as OrderID, Customer, Details.
orders = [
    dict(
        OrderID="CA-1001",
        Customer=dict(Name="Ravi", Segment="Consumer"),
        Details=dict(Region="South", Profit=5000),
    ),
    dict(
        OrderID="CA-1002",
        Customer=dict(Name="Priya", Segment="Corporate"),
        Details=dict(Region="North", Profit=1800),
    ),
]

In [42]:
# Step 2: Write it to a JSON file
# The orders are serialised with a 4-space indent and written to orders.json.
with open("orders.json", "w") as json_file:
    json_file.write(json.dumps(orders, indent=4))

In [43]:
# Step 3: Download the JSON file to local computer
# Uses the `files` helper from the google.colab package.
# NOTE(review): this import is expected to fail outside a Google Colab runtime — confirm.
from google.colab import files
files.download("orders.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### 2. Load it using PySpark:

In [44]:
# multiLine=True: orders.json is a single indented JSON array spread over many
# lines, not one JSON record per line.
df_json = spark.read.json('orders.json', multiLine=True)
df_json.printSchema()
# Dotted paths reach into the nested structs. show() has to be called: a bare
# `.show` only references the method and displays no rows.
df_json.select("OrderID", "Customer.Name", "Details.Profit").show()

root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)

