In [26]:
# Install Dask library for scalable big data processing
!pip install dask[dataframe]

Defaulting to user installation because normal site-packages is not writeable


In [27]:
## Introduction

# Big data analytics focuses on analyzing large datasets using scalable tools. 
# In this notebook, a 10 lakh record Amazon sales dataset is analyzed using Dask to demonstrate parallel. 
# Scalable data processing through efficient aggregation and analysis operations.

In [28]:
# Import Dask DataFrame for big data processing
import dask.dataframe as dd

In [29]:
# Load dataset safely by assuming missing values
df = dd.read_csv(r"Amazon_Sales_10_Lakh_Rows.csv", assume_missing=True)

In [30]:
# Display first few records - DATASET PREVIEW
df.head(5)

Unnamed: 0,index,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,B2B,fulfilled-by
0,1.0,405-8078784-5731545,4/30/2022,Cancelled,Merchant,Amazon.in,Standard,T-shirt,S,On the Way,0.0,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,False,Easy Ship
1,2.0,171-9198151-1101146,4/30/2022,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,Shirt,3XL,Shipped,1.0,INR,406.0,BENGALURU,KARNATAKA,560085.0,IN,False,Easy Ship
2,3.0,404-0687676-7273146,4/30/2022,Shipped,Amazon,Amazon.in,Expedited,Shirt,XL,Shipped,1.0,INR,329.0,NAVI MUMBAI,MAHARASHTRA,410210.0,IN,True,
3,4.0,403-9615377-8133951,4/30/2022,Cancelled,Merchant,Amazon.in,Standard,Blazzer,L,On the Way,0.0,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,False,Easy Ship
4,5.0,407-1069790-7240320,4/30/2022,Shipped,Amazon,Amazon.in,Expedited,Trousers,3XL,Shipped,1.0,INR,574.0,CHENNAI,TAMIL NADU,600073.0,IN,False,


In [31]:
# Display last few records
df.tail(5)

Unnamed: 0,index,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,B2B,fulfilled-by
499712,999996.0,407-0973793-3892365,6/24/2022,Shipped - Picked Up,Merchant,Amazon.in,Standard,T-shirt,XS,Shipped,1.0,INR,764.0,Kolkata,WEST BENGAL,700104.0,IN,False,Easy Ship
499713,999997.0,404-5253538-0138728,6/24/2022,Cancelled,Amazon,Amazon.in,Expedited,Shirt,M,Cancelled,0.0,,,KANGRA,HIMACHAL PRADESH,176092.0,IN,False,
499714,999998.0,171-3801612-1983557,6/24/2022,Cancelled,Amazon,Amazon.in,Expedited,Shirt,L,Cancelled,0.0,,,dibrugarh,ASSAM,786001.0,IN,False,
499715,999999.0,408-6207072-7206730,6/24/2022,Cancelled,Merchant,Amazon.in,Standard,Shirt,XL,On the Way,0.0,INR,700.95,EAST GODAVARI,ANDHRA PRADESH,533003.0,IN,False,Easy Ship
499716,1000000.0,404-7151110-6924336,6/24/2022,Cancelled,Amazon,Amazon.in,Expedited,T-shirt,XL,Cancelled,0.0,,,NEW DELHI,DELHI,110018.0,IN,False,


In [32]:
# Get total number of rows
total_rows = df.shape[0].compute()

# Get total number of columns
total_columns = len(df.columns)

total_rows, total_columns

(1000000, 19)

In [33]:
# Display all column names
df.columns

Index(['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel',
       'ship-service-level', 'Category', 'Size', 'Courier Status', 'Qty',
       'currency', 'Amount', 'ship-city', 'ship-state', 'ship-postal-code',
       'ship-country', 'B2B', 'fulfilled-by'],
      dtype='object')

In [34]:
# Check data types of each column
df.dtypes

index                         float64
Order ID              string[pyarrow]
Date                  string[pyarrow]
Status                string[pyarrow]
Fulfilment            string[pyarrow]
Sales Channel         string[pyarrow]
ship-service-level    string[pyarrow]
Category              string[pyarrow]
Size                  string[pyarrow]
Courier Status        string[pyarrow]
Qty                           float64
currency              string[pyarrow]
Amount                        float64
ship-city             string[pyarrow]
ship-state            string[pyarrow]
ship-postal-code              float64
ship-country          string[pyarrow]
B2B                              bool
fulfilled-by          string[pyarrow]
dtype: object

In [35]:
# Count missing values per column
null_values = df.isnull().sum()
null_values.compute()

index                      0
Order ID                   0
Date                       0
Status                     0
Fulfilment                 0
Sales Channel              0
ship-service-level         0
Category                   0
Size                       0
Courier Status             0
Qty                        0
currency               60430
Amount                 60430
ship-city                272
ship-state               272
ship-postal-code         272
ship-country             272
B2B                        0
fulfilled-by          694254
dtype: int64

In [36]:
# Replace missing values for analysis consistency
df = df.fillna({
    "Amount": 0,
    "Category": "Unknown",
    "Status": "Unknown"
})

In [37]:
# Again Count missing values per column for Amount analysis
null_values = df.isnull().sum()
null_values.compute()

index                      0
Order ID                   0
Date                       0
Status                     0
Fulfilment                 0
Sales Channel              0
ship-service-level         0
Category                   0
Size                       0
Courier Status             0
Qty                        0
currency               60430
Amount                     0
ship-city                272
ship-state               272
ship-postal-code         272
ship-country             272
B2B                        0
fulfilled-by          694254
dtype: int64

In [38]:
# Generate summary statistics for numerical columns
df.describe().compute()

Unnamed: 0,index,Qty,Amount,ship-postal-code
count,1000000.0,1000000.0,1000000.0,999728.0
mean,500000.5,0.904326,609.03187,463854.474179
std,288675.278932,0.313654,313.101963,191577.62707
min,1.0,0.0,0.0,110001.0
25%,125071.5,1.0,411.43,382421.0
50%,375212.5,1.0,582.0,500032.0
75%,625213.0,1.0,771.0,600024.0
max,1000000.0,15.0,5584.0,989898.0


In [39]:
# Check number of partitions (parallelism)
df.npartitions

2

In [40]:
# Increase number of partitions to scale computation
df_scaled = df.repartition(npartitions=16)

In [41]:
# Verify number of partitions after scaling
df_scaled.npartitions

16

In [42]:
# Calculate total sales amount
total_sales = df["Amount"].sum()
total_sales.compute()

609031870.1800001

In [43]:
# Count unique values in important columns
df["Category"].nunique().compute(), df["Status"].nunique().compute()

(9, 13)

In [44]:
# Perform aggregation on scaled dataset
scaled_category_sales = df_scaled.groupby("Category")["Amount"].sum()
scaled_category_sales.compute()

Category
Perfume     6.097325e+06
T-shirt     3.044432e+08
Trousers    4.148701e+07
Blazzer     8.652609e+07
Shirt       1.647410e+08
Shoes       9.735052e+05
Socks       1.179316e+06
Wallet      3.577076e+06
Watch       7.320000e+03
Name: Amount, dtype: float64

In [45]:
# Calculate total sales per category
category_sales = df.groupby("Category")["Amount"].sum()
category_sales.compute().sort_values(ascending=False)

Category
T-shirt     3.044432e+08
Shirt       1.647410e+08
Blazzer     8.652609e+07
Trousers    4.148701e+07
Perfume     6.097325e+06
Wallet      3.577076e+06
Socks       1.179316e+06
Shoes       9.735052e+05
Watch       7.320000e+03
Name: Amount, dtype: float64

In [46]:
# Count number of orders by status
status_count = df.groupby("Status").size()
status_count.compute().sort_values(ascending=False)

Status
Shipped                          602173
Shipped - Delivered to Buyer     223866
Cancelled                        142077
Shipped - Returned to Seller      15280
Shipped - Picked Up                7619
Pending                            5247
Pending - Waiting for Pick Up      2248
Shipped - Returning to Seller      1048
Shipped - Out for Delivery          254
Shipped - Rejected by Buyer          85
Shipping                             56
Shipped - Lost in Transit            40
Shipped - Damaged                     7
dtype: int64

In [47]:
# Compute average sales for each category
avg_sales = df.groupby("Category")["Amount"].mean()
avg_sales.compute()

Category
Blazzer     722.791446
Perfume     682.485405
Shirt       426.734896
Shoes       755.827003
Socks       342.724871
T-shirt     779.484395
Trousers    501.444541
Wallet      493.594046
Watch       305.000000
Name: Amount, dtype: float64

In [48]:
# Identify top 5 categories based on total sales
top_products = df.groupby("Category")["Amount"].sum()
top_products.compute().round(0).astype(int).sort_values(ascending=False).head(5)

Category
T-shirt     304443220
Shirt       164741007
Blazzer      86526087
Trousers     41487014
Perfume       6097325
Name: Amount, dtype: int32

In [49]:
# Trigger full parallel execution across partitions
df_scaled.map_partitions(lambda x: x).compute()

Unnamed: 0,index,Order ID,Date,Status,Fulfilment,Sales Channel,ship-service-level,Category,Size,Courier Status,Qty,currency,Amount,ship-city,ship-state,ship-postal-code,ship-country,B2B,fulfilled-by
0,1.0,405-8078784-5731545,4/30/2022,Cancelled,Merchant,Amazon.in,Standard,T-shirt,S,On the Way,0.0,INR,647.62,MUMBAI,MAHARASHTRA,400081.0,IN,False,Easy Ship
1,2.0,171-9198151-1101146,4/30/2022,Shipped - Delivered to Buyer,Merchant,Amazon.in,Standard,Shirt,3XL,Shipped,1.0,INR,406.00,BENGALURU,KARNATAKA,560085.0,IN,False,Easy Ship
2,3.0,404-0687676-7273146,4/30/2022,Shipped,Amazon,Amazon.in,Expedited,Shirt,XL,Shipped,1.0,INR,329.00,NAVI MUMBAI,MAHARASHTRA,410210.0,IN,True,
3,4.0,403-9615377-8133951,4/30/2022,Cancelled,Merchant,Amazon.in,Standard,Blazzer,L,On the Way,0.0,INR,753.33,PUDUCHERRY,PUDUCHERRY,605008.0,IN,False,Easy Ship
4,5.0,407-1069790-7240320,4/30/2022,Shipped,Amazon,Amazon.in,Expedited,Trousers,3XL,Shipped,1.0,INR,574.00,CHENNAI,TAMIL NADU,600073.0,IN,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499712,999996.0,407-0973793-3892365,6/24/2022,Shipped - Picked Up,Merchant,Amazon.in,Standard,T-shirt,XS,Shipped,1.0,INR,764.00,Kolkata,WEST BENGAL,700104.0,IN,False,Easy Ship
499713,999997.0,404-5253538-0138728,6/24/2022,Cancelled,Amazon,Amazon.in,Expedited,Shirt,M,Cancelled,0.0,,0.00,KANGRA,HIMACHAL PRADESH,176092.0,IN,False,
499714,999998.0,171-3801612-1983557,6/24/2022,Cancelled,Amazon,Amazon.in,Expedited,Shirt,L,Cancelled,0.0,,0.00,dibrugarh,ASSAM,786001.0,IN,False,
499715,999999.0,408-6207072-7206730,6/24/2022,Cancelled,Merchant,Amazon.in,Standard,Shirt,XL,On the Way,0.0,INR,700.95,EAST GODAVARI,ANDHRA PRADESH,533003.0,IN,False,Easy Ship


In [50]:
# Scalability Check:

# - Dataset repartitioned into multiple chunks
# - Computation executed in parallel
# - Memory persistence improves performance
# - Demonstrates scalable big data processing using Dask