In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('Varun').getOrCreate()

In [8]:
spark

In [4]:
import random
import datetime
import csv

categories = ["Electronics", "Furniture", "Clothing", "Appliances", "Books"]
products = {
    "Electronics": ["Laptop", "Smartphone", "Tablet", "Camera"],
    "Furniture": ["Chair", "Table", "Sofa", "Bed"],
    "Clothing": ["Shirt", "Jeans", "Jacket", "Sweater"],
    "Appliances": ["Blender", "Refrigerator", "Microwave", "Toaster"],
    "Books": ["Fiction", "Non-Fiction", "Biography", "Comics"],
}

rows = []
for i in range(1, 1001):
    category = random.choice(categories)
    product = random.choice(products[category])
    price = round(random.uniform(10, 1000), 2)
    quantity = random.randint(1, 10)
    order_date = datetime.datetime.now() - datetime.timedelta(days=random.randint(0, 365))
    rows.append([i, product, category, price, quantity, order_date.strftime("%Y-%m-%d")])

# Saving to CSV
with open("ecommerce_data.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["OrderID", "Product", "Category", "Price", "Quantity", "OrderDate"])
    writer.writerows(rows)

print("E-Commerce dataset saved as 'ecommerce_data.csv'")


E-Commerce dataset saved as 'ecommerce_data.csv'


In [9]:
df=spark.read.csv('ecommerce_data.csv',header=True,inferSchema=True)

In [6]:
df.show()

+-------+----------+-----------+------+--------+----------+
|OrderID|   Product|   Category| Price|Quantity| OrderDate|
+-------+----------+-----------+------+--------+----------+
|      1|     Jeans|   Clothing|817.51|       4|2023-12-25|
|      2|    Jacket|   Clothing|240.93|       5|2024-09-28|
|      3|Smartphone|Electronics|989.39|       2|2024-05-04|
|      4|Smartphone|Electronics|708.13|       7|2024-11-10|
|      5|     Chair|  Furniture|907.95|       5|2024-04-03|
|      6|    Laptop|Electronics|375.26|       7|2024-11-04|
|      7|     Chair|  Furniture|102.13|      10|2024-10-26|
|      8| Microwave| Appliances|881.43|       9|2024-12-22|
|      9|    Comics|      Books|672.55|      10|2024-04-22|
|     10| Microwave| Appliances|675.32|       6|2024-02-15|
|     11|   Blender| Appliances|277.57|       3|2024-02-26|
|     12|     Chair|  Furniture|281.11|       6|2024-11-11|
|     13|       Bed|  Furniture|278.03|       7|2024-12-16|
|     14|   Sweater|   Clothing|152.02| 

In [7]:
df.head()

Row(OrderID=1, Product='Jeans', Category='Clothing', Price=817.51, Quantity=4, OrderDate=datetime.date(2023, 12, 25))

In [8]:
df.collect()

[Row(OrderID=1, Product='Jeans', Category='Clothing', Price=817.51, Quantity=4, OrderDate=datetime.date(2023, 12, 25)),
 Row(OrderID=2, Product='Jacket', Category='Clothing', Price=240.93, Quantity=5, OrderDate=datetime.date(2024, 9, 28)),
 Row(OrderID=3, Product='Smartphone', Category='Electronics', Price=989.39, Quantity=2, OrderDate=datetime.date(2024, 5, 4)),
 Row(OrderID=4, Product='Smartphone', Category='Electronics', Price=708.13, Quantity=7, OrderDate=datetime.date(2024, 11, 10)),
 Row(OrderID=5, Product='Chair', Category='Furniture', Price=907.95, Quantity=5, OrderDate=datetime.date(2024, 4, 3)),
 Row(OrderID=6, Product='Laptop', Category='Electronics', Price=375.26, Quantity=7, OrderDate=datetime.date(2024, 11, 4)),
 Row(OrderID=7, Product='Chair', Category='Furniture', Price=102.13, Quantity=10, OrderDate=datetime.date(2024, 10, 26)),
 Row(OrderID=8, Product='Microwave', Category='Appliances', Price=881.43, Quantity=9, OrderDate=datetime.date(2024, 12, 22)),
 Row(OrderID=9, 

In [9]:
df.take(5)

[Row(OrderID=1, Product='Laptop', Category='Electronics', Price=569.87, Quantity=3, OrderDate=datetime.date(2024, 2, 16)),
 Row(OrderID=2, Product='Smartphone', Category='Electronics', Price=825.38, Quantity=7, OrderDate=datetime.date(2024, 9, 10)),
 Row(OrderID=3, Product='Jacket', Category='Clothing', Price=97.82, Quantity=1, OrderDate=datetime.date(2024, 8, 21)),
 Row(OrderID=4, Product='Refrigerator', Category='Appliances', Price=798.19, Quantity=10, OrderDate=datetime.date(2024, 2, 3)),
 Row(OrderID=5, Product='Tablet', Category='Electronics', Price=684.55, Quantity=7, OrderDate=datetime.date(2024, 1, 1))]

In [10]:
df.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



In [9]:
#all orders with a price greater than 100.
df[df['Price']>100].show()

+-------+----------+-----------+------+--------+----------+
|OrderID|   Product|   Category| Price|Quantity| OrderDate|
+-------+----------+-----------+------+--------+----------+
|      1|     Jeans|   Clothing|817.51|       4|2023-12-25|
|      2|    Jacket|   Clothing|240.93|       5|2024-09-28|
|      3|Smartphone|Electronics|989.39|       2|2024-05-04|
|      4|Smartphone|Electronics|708.13|       7|2024-11-10|
|      5|     Chair|  Furniture|907.95|       5|2024-04-03|
|      6|    Laptop|Electronics|375.26|       7|2024-11-04|
|      7|     Chair|  Furniture|102.13|      10|2024-10-26|
|      8| Microwave| Appliances|881.43|       9|2024-12-22|
|      9|    Comics|      Books|672.55|      10|2024-04-22|
|     10| Microwave| Appliances|675.32|       6|2024-02-15|
|     11|   Blender| Appliances|277.57|       3|2024-02-26|
|     12|     Chair|  Furniture|281.11|       6|2024-11-11|
|     13|       Bed|  Furniture|278.03|       7|2024-12-16|
|     14|   Sweater|   Clothing|152.02| 

In [10]:
#sorting
df.orderBy('Price',ascending=False).show()

+-------+------------+-----------+------+--------+----------+
|OrderID|     Product|   Category| Price|Quantity| OrderDate|
+-------+------------+-----------+------+--------+----------+
|    499|  Smartphone|Electronics|997.76|       3|2024-05-07|
|    874|Refrigerator| Appliances|996.87|       8|2024-11-09|
|     78|     Toaster| Appliances|995.74|       9|2024-11-14|
|    994|      Tablet|Electronics|993.48|       7|2024-10-05|
|    529|      Jacket|   Clothing|992.11|       6|2024-10-16|
|    622|      Laptop|Electronics|991.85|       5|2024-11-16|
|     21|      Jacket|   Clothing|991.49|       3|2024-12-11|
|    662| Non-Fiction|      Books| 991.4|       7|2024-10-09|
|    738|      Camera|Electronics|989.52|       5|2024-11-05|
|      3|  Smartphone|Electronics|989.39|       2|2024-05-04|
|    129|     Toaster| Appliances|987.91|       8|2024-01-13|
|    222|       Chair|  Furniture| 987.8|       1|2024-08-07|
|    686|   Biography|      Books|987.14|       6|2024-02-20|
|    523

In [14]:
#total sales
from pyspark.sql.functions import round
df=df.withColumn('Total',round(df['Price']*df['Quantity'],2))

In [15]:
df.show()

+-------+----------+-----------+------+--------+----------+-------+
|OrderID|   Product|   Category| Price|Quantity| OrderDate|  Total|
+-------+----------+-----------+------+--------+----------+-------+
|      1|     Jeans|   Clothing|817.51|       4|2023-12-25|3270.04|
|      2|    Jacket|   Clothing|240.93|       5|2024-09-28|1204.65|
|      3|Smartphone|Electronics|989.39|       2|2024-05-04|1978.78|
|      4|Smartphone|Electronics|708.13|       7|2024-11-10|4956.91|
|      5|     Chair|  Furniture|907.95|       5|2024-04-03|4539.75|
|      6|    Laptop|Electronics|375.26|       7|2024-11-04|2626.82|
|      7|     Chair|  Furniture|102.13|      10|2024-10-26| 1021.3|
|      8| Microwave| Appliances|881.43|       9|2024-12-22|7932.87|
|      9|    Comics|      Books|672.55|      10|2024-04-22| 6725.5|
|     10| Microwave| Appliances|675.32|       6|2024-02-15|4051.92|
|     11|   Blender| Appliances|277.57|       3|2024-02-26| 832.71|
|     12|     Chair|  Furniture|281.11|       6|

In [10]:
#average order price
import pyspark.sql.functions  as F
df.agg(F.avg('Price')).show()

+-----------------+
|       avg(Price)|
+-----------------+
|514.5838100000004|
+-----------------+



In [20]:
#total revenue
df.agg(round(F.sum('Total'),2)).show()

+--------------------+
|round(sum(Total), 2)|
+--------------------+
|          2926225.15|
+--------------------+



In [21]:
#count of each category sold
df.groupBy('Category').agg(F.count('Quantity')).show()

+-----------+---------------+
|   Category|count(Quantity)|
+-----------+---------------+
|Electronics|            197|
|   Clothing|            204|
|      Books|            203|
|  Furniture|            203|
| Appliances|            193|
+-----------+---------------+



In [31]:
#Extracting year, month, and day from a orderdate and count occurrences for each month.
from pyspark.sql.functions import year,month,day
df=df.withColumn('Year',year('OrderDate')).withColumn('Month',month('OrderDate')).withColumn('Day',day('OrderDate'))

In [32]:
df.show()

+-------+----------+-----------+------+--------+----------+-------+----+-----+---+
|OrderID|   Product|   Category| Price|Quantity| OrderDate|  Total|Year|Month|Day|
+-------+----------+-----------+------+--------+----------+-------+----+-----+---+
|      1|     Jeans|   Clothing|817.51|       4|2023-12-25|3270.04|2023|   12| 25|
|      2|    Jacket|   Clothing|240.93|       5|2024-09-28|1204.65|2024|    9| 28|
|      3|Smartphone|Electronics|989.39|       2|2024-05-04|1978.78|2024|    5|  4|
|      4|Smartphone|Electronics|708.13|       7|2024-11-10|4956.91|2024|   11| 10|
|      5|     Chair|  Furniture|907.95|       5|2024-04-03|4539.75|2024|    4|  3|
|      6|    Laptop|Electronics|375.26|       7|2024-11-04|2626.82|2024|   11|  4|
|      7|     Chair|  Furniture|102.13|      10|2024-10-26| 1021.3|2024|   10| 26|
|      8| Microwave| Appliances|881.43|       9|2024-12-22|7932.87|2024|   12| 22|
|      9|    Comics|      Books|672.55|      10|2024-04-22| 6725.5|2024|    4| 22|
|   

In [30]:
#counting occurrences for each month.
df.groupby('month').agg(F.count('Month')).show()

+-----+------------+
|month|count(Month)|
+-----+------------+
|   12|          81|
|    1|          79|
|    6|          84|
|    3|          83|
|    5|          76|
|    9|          86|
|    4|          78|
|    8|          83|
|    7|          84|
|   10|          86|
|   11|         102|
|    2|          78|
+-----+------------+



In [38]:
#For each category, rank the products by their sales revenue.
from pyspark.sql import Window
wind=Window.partitionBy('Category').orderBy(F.col('Total').desc())

In [39]:
df.withColumn('Rank',F.rank().over(wind)).show()

+-------+------------+----------+------+--------+----------+-------+----+-----+---+----+
|OrderID|     Product|  Category| Price|Quantity| OrderDate|  Total|Year|Month|Day|Rank|
+-------+------------+----------+------+--------+----------+-------+----+-----+---+----+
|    974|Refrigerator|Appliances|972.35|      10|2024-06-19| 9723.5|2024|    6| 19|   1|
|    780|Refrigerator|Appliances|947.03|      10|2024-07-26| 9470.3|2024|    7| 26|   2|
|    823|Refrigerator|Appliances|909.97|      10|2024-12-06| 9099.7|2024|   12|  6|   3|
|     78|     Toaster|Appliances|995.74|       9|2024-11-14|8961.66|2024|   11| 14|   4|
|    451|Refrigerator|Appliances|877.33|      10|2024-05-12| 8773.3|2024|    5| 12|   5|
|    116|     Blender|Appliances|839.46|      10|2024-10-10| 8394.6|2024|   10| 10|   6|
|    929|Refrigerator|Appliances|817.99|      10|2024-05-09| 8179.9|2024|    5|  9|   7|
|    390|   Microwave|Appliances|887.48|       9|2024-08-06|7987.32|2024|    8|  6|   8|
|    874|Refrigerator

In [44]:
#drop row with na if any
df.na.drop(how='any',subset=['Product','Total']).show()

+-------+----------+-----------+------+--------+----------+-------+----+-----+---+
|OrderID|   Product|   Category| Price|Quantity| OrderDate|  Total|Year|Month|Day|
+-------+----------+-----------+------+--------+----------+-------+----+-----+---+
|      1|     Jeans|   Clothing|817.51|       4|2023-12-25|3270.04|2023|   12| 25|
|      2|    Jacket|   Clothing|240.93|       5|2024-09-28|1204.65|2024|    9| 28|
|      3|Smartphone|Electronics|989.39|       2|2024-05-04|1978.78|2024|    5|  4|
|      4|Smartphone|Electronics|708.13|       7|2024-11-10|4956.91|2024|   11| 10|
|      5|     Chair|  Furniture|907.95|       5|2024-04-03|4539.75|2024|    4|  3|
|      6|    Laptop|Electronics|375.26|       7|2024-11-04|2626.82|2024|   11|  4|
|      7|     Chair|  Furniture|102.13|      10|2024-10-26| 1021.3|2024|   10| 26|
|      8| Microwave| Appliances|881.43|       9|2024-12-22|7932.87|2024|   12| 22|
|      9|    Comics|      Books|672.55|      10|2024-04-22| 6725.5|2024|    4| 22|
|   

In [43]:
df.show()

+-------+----------+-----------+------+--------+----------+-------+----+-----+---+
|OrderID|   Product|   Category| Price|Quantity| OrderDate|  Total|Year|Month|Day|
+-------+----------+-----------+------+--------+----------+-------+----+-----+---+
|      1|     Jeans|   Clothing|817.51|       4|2023-12-25|3270.04|2023|   12| 25|
|      2|    Jacket|   Clothing|240.93|       5|2024-09-28|1204.65|2024|    9| 28|
|      3|Smartphone|Electronics|989.39|       2|2024-05-04|1978.78|2024|    5|  4|
|      4|Smartphone|Electronics|708.13|       7|2024-11-10|4956.91|2024|   11| 10|
|      5|     Chair|  Furniture|907.95|       5|2024-04-03|4539.75|2024|    4|  3|
|      6|    Laptop|Electronics|375.26|       7|2024-11-04|2626.82|2024|   11|  4|
|      7|     Chair|  Furniture|102.13|      10|2024-10-26| 1021.3|2024|   10| 26|
|      8| Microwave| Appliances|881.43|       9|2024-12-22|7932.87|2024|   12| 22|
|      9|    Comics|      Books|672.55|      10|2024-04-22| 6725.5|2024|    4| 22|
|   

In [48]:
#Creating pivot table showing monthly sales for each product.
df.groupby('Month').pivot('Product').sum('Total').show()

+-----+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|Month|               Bed|         Biography|           Blender|            Camera|             Chair|            Comics|           Fiction|            Jacket|             Jeans|            Laptop|         Microwave|       Non-Fiction|      Refrigerator|             Shirt|        Smartphone|              Sofa|           Sweater|             Table|            Tablet|           Toaster|
+-----+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------

# User Activity Log

In [51]:
activities = ["Login", "Logout", "Purchase", "ViewProduct", "AddToCart", "Search"]
rows = []
for i in range(1, 10001): 
    user_id = random.randint(1, 1000)
    activity = random.choice(activities)
    timestamp = datetime.datetime.now() - datetime.timedelta(seconds=random.randint(0, 86400 * 30))
    rows.append([user_id, activity, timestamp.strftime("%Y-%m-%d %H:%M:%S")])

with open("user_activity_log.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["UserID", "Activity", "Timestamp"])
    writer.writerows(rows)

print("User activity log saved as 'user_activity_log.csv'")

User activity log saved as 'user_activity_log.csv'


In [16]:
df2=spark.read.csv('user_activity_log.csv',header=True,inferSchema=True)

In [5]:
df2.show()

+------+-----------+-------------------+
|UserID|   Activity|          Timestamp|
+------+-----------+-------------------+
|   485|ViewProduct|2024-12-13 12:49:38|
|   626|ViewProduct|2024-12-06 18:06:12|
|   807|     Search|2024-12-20 15:30:21|
|    50|     Search|2024-12-12 18:49:43|
|   370|ViewProduct|2024-12-08 11:37:11|
|   936|   Purchase|2024-11-25 10:16:43|
|    18|ViewProduct|2024-11-24 21:37:40|
|    17|ViewProduct|2024-12-02 16:17:16|
|   388|  AddToCart|2024-11-28 03:40:03|
|   179|     Logout|2024-12-03 09:08:41|
|   328|      Login|2024-11-29 04:35:56|
|   528|   Purchase|2024-12-19 00:12:44|
|   276|      Login|2024-12-02 04:00:53|
|   526|      Login|2024-11-24 04:28:10|
|   375|     Logout|2024-11-24 18:25:11|
|   213|   Purchase|2024-12-02 07:57:25|
|   609|  AddToCart|2024-12-01 16:55:30|
|   456|  AddToCart|2024-12-08 16:54:21|
|   830|     Logout|2024-12-13 02:16:10|
|   635|ViewProduct|2024-12-03 15:05:27|
+------+-----------+-------------------+
only showing top

In [12]:
#the total number of activities logged.
df2.select(F.count('Activity')).show()

+---------------+
|count(Activity)|
+---------------+
|          10000|
+---------------+



In [61]:
df2.count()

10000

In [138]:
#Count the number of occurrences for each activity.
df2.groupBy('Activity').agg(F.count(df2.Activity)).show()

+-----------+---------------+
|   Activity|count(Activity)|
+-----------+---------------+
|   Purchase|           1611|
|ViewProduct|           1677|
|     Logout|           1706|
|      Login|           1646|
|  AddToCart|           1619|
|     Search|           1741|
+-----------+---------------+



In [139]:
#specifically for login
df2[df2.Activity=='Login'].count()

1646

In [140]:
#the most frequently performed activity.
df2.orderBy('Timestamp',ascending=False).limit(1).show()

+------+--------+-------------------+
|UserID|Activity|          Timestamp|
+------+--------+-------------------+
|   611|  Logout|2024-12-23 11:15:25|
+------+--------+-------------------+



In [13]:
#the user who performed the highest number of activities.
x=df2.groupBy('UserID').agg(F.count(df2.Activity).alias('allactivity'))
x.orderBy('allactivity',ascending=False ).limit(1).show()

+------+-----------+
|UserID|allactivity|
+------+-----------+
|   635|         20|
+------+-----------+



In [150]:
#unique users
df2.select('UserID').distinct().count()

1000

In [129]:
#Identify users who performed more than 18 activities.
x[x['allactivity']>18].show()

+------+-----------+
|UserID|allactivity|
+------+-----------+
|    76|         19|
|   269|         19|
|   635|         20|
|   779|         20|
|   940|         20|
|   965|         20|
|   716|         19|
|    18|         19|
+------+-----------+



In [20]:
#the most active hour of the day based on timestamps.
from pyspark.sql.functions import hour
df2=df2.withColumn('Hour',hour('Timestamp'))
df2.show()

+------+-----------+-------------------+----+
|UserID|   Activity|          Timestamp|Hour|
+------+-----------+-------------------+----+
|   485|ViewProduct|2024-12-13 12:49:38|  12|
|   626|ViewProduct|2024-12-06 18:06:12|  18|
|   807|     Search|2024-12-20 15:30:21|  15|
|    50|     Search|2024-12-12 18:49:43|  18|
|   370|ViewProduct|2024-12-08 11:37:11|  11|
|   936|   Purchase|2024-11-25 10:16:43|  10|
|    18|ViewProduct|2024-11-24 21:37:40|  21|
|    17|ViewProduct|2024-12-02 16:17:16|  16|
|   388|  AddToCart|2024-11-28 03:40:03|   3|
|   179|     Logout|2024-12-03 09:08:41|   9|
|   328|      Login|2024-11-29 04:35:56|   4|
|   528|   Purchase|2024-12-19 00:12:44|   0|
|   276|      Login|2024-12-02 04:00:53|   4|
|   526|      Login|2024-11-24 04:28:10|   4|
|   375|     Logout|2024-11-24 18:25:11|  18|
|   213|   Purchase|2024-12-02 07:57:25|   7|
|   609|  AddToCart|2024-12-01 16:55:30|  16|
|   456|  AddToCart|2024-12-08 16:54:21|  16|
|   830|     Logout|2024-12-13 02:

In [25]:
y=df2.groupBy('Hour').agg(F.count('Activity'))
y.orderBy('count(Activity)',ascending=False).limit(1).show()

+----+---------------+
|Hour|count(Activity)|
+----+---------------+
|   0|            457|
+----+---------------+

