# Step 1: Sample Dataset (create CSV)

In [1]:
# Five sample orders in CSV form; the first line is the header row.
superstore_data = """OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000
CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500
CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000
"""

# Persist the text verbatim (end="" stops print from appending a second newline)
# so the later cells can load it from disk.
with open("superstore.csv", mode="w") as csv_file:
    print(superstore_data, end="", file=csv_file)


# PART 1: Pandas DataFrame Operations
1. Load the CSV using pandas.

In [2]:
import pandas as pd

# Load the sample orders written above into a pandas DataFrame.
# No parse_dates is given, so OrderDate stays a plain string (object) column.
df = pd.read_csv("superstore.csv")

2. Print schema, head, shape, dtypes.

In [3]:
# First rows, (rows, columns) shape, and per-column dtypes, one after another.
print(df.head(), df.shape, df.dtypes, sep="\n")

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  
4       Technology       Phones         2      20000      0.00    3000  
(5, 12)
OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          object
Product         object
Category        object
Su

3. Select the `Customer`, `Product`, and `Profit` columns.

In [4]:
# Keep every row, but only the three columns of interest.
print(df.loc[:, ['Customer', 'Product', 'Profit']])

  Customer   Product  Profit
0     Ravi    Laptop    5000
1    Priya   Printer    1800
2     Amit  Notebook     150
3    Anita     Table   -1500
4    Divya     Phone    3000


4. Filter orders where `Profit > 2000` and `Discount == 0`.

In [5]:
# Rows that earned more than 2000 profit while being sold at no discount.
filtered = df.loc[df['Profit'].gt(2000) & df['Discount'].eq(0.0)]
print(filtered)

   OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


5. Sort by Profit descending.

In [6]:
# Most profitable orders first; df itself is left in its original order.
print(df.sort_values(['Profit'], ascending=[False]))

   OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
4       Technology       Phones         2      20000      0.00    3000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  


6. GroupBy Category → Total Profit, Avg Discount.

In [7]:
# Per-category totals: summed Profit and mean Discount (named aggregation keeps
# the output column names Profit and Discount).
print(
    df.groupby('Category').agg(
        Profit=('Profit', 'sum'),
        Discount=('Discount', 'mean'),
    )
)

                 Profit  Discount
Category                         
Furniture         -1500  0.200000
Office Supplies     150  0.050000
Technology         9800  0.083333


7. Add a column TotalPrice = Quantity * UnitPrice .

In [8]:
# Order value as units times unit price (Discount is not applied here).
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])
print(df.loc[:, ['OrderID', 'TotalPrice']])

   OrderID  TotalPrice
0  CA-1001       55000
1  CA-1002       24000
2  CA-1003         600
3  CA-1004       18000
4  CA-1005       40000


8. Drop the SubCategory column.

In [9]:
# Remove SubCategory by rebinding df to the result instead of mutating in place.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# would raise KeyError because the column is already gone.
df = df.drop(columns='SubCategory', errors='ignore')

9. Fill nulls in Discount with 0.10.

In [11]:
# Keep existing Discount values; substitute 0.10 only where the value is missing.
df['Discount'] = df['Discount'].where(df['Discount'].notna(), other=0.10)


10. Apply a function to categorize orders by `Profit`: `High` (Profit > 4000), `Medium` (0 < Profit ≤ 4000), `Low` (Profit ≤ 0).

In [12]:
def classify(row):
    """Label an order's profitability.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the key 'Profit' (for example a DataFrame row).

    Returns
    -------
    str
        'High' when Profit is above 4000, 'Medium' when it is positive but not
        above 4000, and 'Low' otherwise (zero or negative).
    """
    profit = row['Profit']
    if profit > 4000:
        return 'High'
    return 'Medium' if profit > 0 else 'Low'

# Run classify once per row (axis='columns' passes each row as a Series).
df['ProfitCategory'] = df.apply(classify, axis='columns')
print(df.loc[:, ['OrderID', 'ProfitCategory']])

   OrderID ProfitCategory
0  CA-1001           High
1  CA-1002         Medium
2  CA-1003         Medium
3  CA-1004            Low
4  CA-1005         Medium


# PART 2: PySpark DataFrame Operations
1. Load the same CSV using PySpark.

In [13]:
from pyspark.sql import SparkSession
# Create Spark session

# Build the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("PySparkBasics1")
    .getOrCreate()
)

# Display the session object to confirm Spark is running

spark

In [14]:
# Same five sample orders as in Part 1, written to the same file name, so the
# PySpark section has its input on disk even if run on its own.
superstore_data = """OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000
CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500
CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000
"""

# end="" stops print from appending an extra trailing newline to the file.
with open("superstore.csv", mode="w") as csv_file:
    print(superstore_data, end="", file=csv_file)


2. Show schema and first 5 rows.

In [15]:
# Treat the first line as column names and let Spark infer each column's type.
df_spark = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("superstore.csv")
)
df_spark.printSchema()
df_spark.show(n=5)


root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  

3. Select columns and rename `Customer` → `Client`.

In [19]:

from pyspark.sql.functions import col, when, year, month
# Project the three columns, then expose Customer under the name Client.
(
    df_spark
    .select("Customer", "Product", "Profit")
    .withColumnRenamed("Customer", "Client")
    .show()
)

+------+--------+------+
|Client| Product|Profit|
+------+--------+------+
|  Ravi|  Laptop|  5000|
| Priya| Printer|  1800|
|  Amit|Notebook|   150|
| Anita|   Table| -1500|
| Divya|   Phone|  3000|
+------+--------+------+



4. Filter rows where `Segment == 'Consumer'` and `Profit < 1000`.

In [20]:
# Two chained where() calls combine as a logical AND of both conditions.
(
    df_spark
    .where(col("Segment") == "Consumer")
    .where(col("Profit") < 1000)
    .show()
)


+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



5. GroupBy Region and show average profit.

In [21]:
# Mean Profit per Region; the dict-style agg names its result "avg(Profit)",
# which is then renamed to the friendlier AvgProfit.
(
    df_spark
    .groupBy("Region")
    .agg({"Profit": "avg"})
    .withColumnRenamed("avg(Profit)", "AvgProfit")
    .show()
)


+------+---------+
|Region|AvgProfit|
+------+---------+
| South|   4000.0|
|  East|    150.0|
|  West|  -1500.0|
| North|   1800.0|
+------+---------+



6. Use withColumn to create TotalPrice = Quantity * UnitPrice .

In [22]:
# Order value as units times unit price (Discount is not applied here).
df_spark = df_spark.withColumn(
    "TotalPrice",
    df_spark["Quantity"] * df_spark["UnitPrice"],
)


7. Use `when().otherwise()` to classify `Profit` as: `High` (Profit > 2000), `Loss` (Profit <= 0), or `Medium` (everything else).

In [23]:
# Branches are tested in order: above 2000 is High, zero or negative is Loss,
# and anything left over is Medium.
df_spark = df_spark.withColumn(
    "ProfitLevel",
    when(df_spark["Profit"] > 2000, "High")
    .when(df_spark["Profit"] <= 0, "Loss")
    .otherwise("Medium"),
)
df_spark.select(["OrderID", "Profit", "ProfitLevel"]).show()


+-------+------+-----------+
|OrderID|Profit|ProfitLevel|
+-------+------+-----------+
|CA-1001|  5000|       High|
|CA-1002|  1800|     Medium|
|CA-1003|   150|     Medium|
|CA-1004| -1500|       Loss|
|CA-1005|  3000|       High|
+-------+------+-----------+



8. Use drop() to remove SubCategory .

In [24]:
# Rebind df_spark to a new DataFrame that no longer carries SubCategory.
df_spark = df_spark.drop("SubCategory")


9. Handle nulls in Discount using fillna(0.10) .

In [25]:
# Replace missing Discount values with 0.10; other columns are left untouched.
df_spark = df_spark.na.fill(0.10, subset=["Discount"])


10. Convert `OrderDate` to date type and extract `year` and `month`.

In [26]:
from pyspark.sql.functions import to_date

# Parse OrderDate with an explicit pattern, then derive the calendar year and
# month from the parsed column in the same chain.
df_spark = (
    df_spark
    .withColumn("OrderDate", to_date(col("OrderDate"), "yyyy-MM-dd"))
    .withColumn("OrderYear", year(col("OrderDate")))
    .withColumn("OrderMonth", month(col("OrderDate")))
)
df_spark.select(["OrderDate", "OrderYear", "OrderMonth"]).show()


+----------+---------+----------+
| OrderDate|OrderYear|OrderMonth|
+----------+---------+----------+
|2023-01-15|     2023|         1|
|2023-02-20|     2023|         2|
|2023-01-25|     2023|         1|
|2023-03-01|     2023|         3|
|2023-02-05|     2023|         2|
+----------+---------+----------+



# PART 3: Dask DataFrame Operations (Pandas Alternative)
1. Install Dask:

In [27]:
!pip install dask
import dask.dataframe as dd



2. Load the same superstore.csv :

In [28]:
# Open the CSV written earlier in the notebook as a Dask DataFrame; the cells
# below call .compute() to materialise results from it.
df_dask = dd.read_csv("superstore.csv")

3. Do the following:

Compute average discount by category.

Filter orders with `Quantity > 1` and high profit (here, `Profit > 1000`).

Save filtered data to new CSV.

In [29]:
# 1. Mean Discount per Category (compute() materialises the Dask result)
print(
    df_dask
    .groupby("Category")["Discount"]
    .mean()
    .compute()
)

# 2. Orders of more than one unit that also earned more than 1000 profit
filtered = df_dask[df_dask["Quantity"].gt(1) & df_dask["Profit"].gt(1000)]
print(filtered.compute())

# 3. Write the filtered rows to one CSV file, without the index column
filtered.to_csv(
    "filtered_dask_orders.csv",
    index=False,
    single_file=True,
)

Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64
   OrderID   OrderDate Customer    Segment Region  Product    Category  \
1  CA-1002  2023-02-20    Priya  Corporate  North  Printer  Technology   
4  CA-1005  2023-02-05    Divya   Consumer  South    Phone  Technology   

   SubCategory  Quantity  UnitPrice  Discount  Profit  
1  Peripherals         2      12000      0.15    1800  
4       Phones         2      20000      0.00    3000  


['/content/filtered_dask_orders.csv']

# PART 4: JSON Handling (Complex Nested)
1. Create a nested JSON file:

In [30]:
import json

# Two sample orders; Customer and Details are nested objects so the file
# exercises struct handling when it is read back.
data_json = [
    {
        "OrderID": order_id,
        "Customer": {"Name": name, "Segment": segment},
        "Details": {"Region": region, "Profit": profit},
    }
    for order_id, name, segment, region, profit in (
        ("CA-1001", "Ravi", "Consumer", "South", 5000),
        ("CA-1002", "Priya", "Corporate", "North", 1800),
    )
]

# Serialise as a single pretty-printed JSON array (2-space indent).
with open("orders.json", mode="w") as json_file:
    json_file.write(json.dumps(data_json, indent=2))


2. Load it using PySpark:

In [31]:
# The file holds one pretty-printed JSON array spanning several lines, so the
# reader is put in multiLine mode.
df_json = spark.read.option("multiLine", True).json("orders.json")
df_json.printSchema()

# Dotted paths reach into the nested Customer and Details structs.
df_json.select(
    col("OrderID"),
    col("Customer.Name"),
    col("Details.Profit"),
).show()


root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)

+-------+-----+------+
|OrderID| Name|Profit|
+-------+-----+------+
|CA-1001| Ravi|  5000|
|CA-1002|Priya|  1800|
+-------+-----+------+

