<a href="https://colab.research.google.com/github/RajuKGosala-45/E-Commerce-dataset-PySpark-Practices/blob/main/E_Commerce_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install PySpark in Google Colab

In [None]:
!apt-get update -qq
!apt-get install -y openjdk-17-jdk-headless -qq

!pip install pyspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["PATH"] += ":/usr/lib/jvm/java-17-openjdk-amd64/bin"



W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
(Reading database ... 121713 files and directories currently installed.)
Preparing to unpack .../openjdk-17-jdk-headless_17.0.17+10-1~22.04_amd64.deb ...
Unpacking openjdk-17-jdk-headless:amd64 (17.0.17+10-1~22.04) over (17.0.16+8~us1-0ubuntu1~22.04.1) ...
Preparing to unpack .../openjdk-17-jre-headless_17.0.17+10-1~22.04_amd64.deb ...
Unpacking openjdk-17-jre-headless:amd64 (17.0.17+10-1~22.04) over (17.0.16+8~us1-0ubuntu1~22.04.1) ...
Setting up openjdk-17-jre-headless:amd64 (17.0.17+10-1~22.04) ...
Installing new version of config file /etc/java-17-openjdk/security/default.policy ...
Installing new version of config file /etc/java-17-openjdk/security/java.security ...
Setting up openjdk-17-jdk-headless:amd64 (17.0.17+10-1~22.04) ...


# Set up SparkSession and Load Data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Create (or reuse) the Spark session.
spark = (
    SparkSession.builder
    .appName("E-Commerce Bussiness")
    .getOrCreate()
)

# Dataset name -> path of its CSV file.
Files = dict([
    ("events", "/content/events.csv"),
    ("order_items", "/content/order_items.csv"),
    ("orders", "/content/orders.csv"),
    ("products", "/content/products.csv"),
    ("reviews", "/content/reviews.csv"),
    ("users", "/content/users.csv"),
])

# Read every CSV (first row as header, column types inferred) into a dict of
# DataFrames keyed by dataset name.
df = {
    name: spark.read.options(header=True, inferSchema=True).csv(path)
    for name, path in Files.items()
}

# Print Schema and Sample

In [None]:
# Show the schema and the first five rows of every dataset.
# The loop variable is deliberately not called `df`: using `df` as the loop
# target would rebind the dataset dict to the last DataFrame and break any
# later `df.items()` call.
for name, dataset_df in df.items():
  print(f"\n Dataset: {name.upper()}")
  dataset_df.printSchema()
  dataset_df.show(5)


 Dataset: EVENTS-----------------
root
 |-- event_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)

+---------+-------+----------+----------+--------------------+
| event_id|user_id|product_id|event_type|     event_timestamp|
+---------+-------+----------+----------+--------------------+
|E00000001|U009798|   P001393|      cart|2025-07-08 14:28:...|
|E00000002|U005881|   P000669|      view|2025-10-19 23:00:...|
|E00000003|U006348|   P001404|      view|2025-05-09 07:02:...|
|E00000004|U002664|   P000400|      cart|2025-07-19 22:47:...|
|E00000005|U005776|   P000392|      view|2024-10-24 10:20:...|
+---------+-------+----------+----------+--------------------+
only showing top 5 rows


 Dataset: ORDER_ITEMS-----------------
root
 |-- order_item_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product_id: string 

# Missing Values Summary

In [None]:

# Re-read the CSVs so that `df` is the {dataset name: DataFrame} mapping in this cell.
df = {name: spark.read.csv(path, header=True, inferSchema=True) for name, path in Files.items()}

# For every dataset, print the percentage of missing values per column.
# A value counts as missing when it is NULL, an empty string (string columns)
# or NaN (numeric columns).
for name, current_df in df.items():
  print(f"\n Missing Values Summary For: {name.upper()}")

  # Count the rows once per dataset; calling count() inside the column loop
  # would trigger a separate Spark job for every column.
  total_rows = current_df.count()
  if total_rows == 0:
    # Nothing to divide by: a percentage is undefined for an empty dataset.
    print("No rows - skipping.")
    continue

  null_expressions = []
  for field in current_df.schema:
    column_name = field.name

    # Start with the isNull() check for all column types
    condition = col(column_name).isNull()

    # Add check for empty strings if the column is of StringType
    if isinstance(field.dataType, StringType):
      condition = condition | (col(column_name) == "")

    # Add check for NaN if the column is of NumericType
    # NumericType includes DoubleType, FloatType, IntegerType, etc.
    if isinstance(field.dataType, NumericType):
      condition = condition | isnan(col(column_name))

    # Emit the constant 1 (not the column value) for matching rows: count()
    # skips NULLs, so returning the column itself would never count the rows
    # whose value is NULL.
    null_expressions.append(
        (count(when(condition, 1)) / total_rows * 100).alias(column_name)
    )

  null_df = current_df.select(null_expressions)
  null_df.show()


 Missing Values Summary For: EVENTS
+--------+-------+----------+----------+---------------+
|event_id|user_id|product_id|event_type|event_timestamp|
+--------+-------+----------+----------+---------------+
|     0.0|    0.0|       0.0|       0.0|            0.0|
+--------+-------+----------+----------+---------------+


 Missing Values Summary For: ORDER_ITEMS
+-------------+--------+----------+-------+--------+----------+----------+
|order_item_id|order_id|product_id|user_id|quantity|item_price|item_total|
+-------------+--------+----------+-------+--------+----------+----------+
|          0.0|     0.0|       0.0|    0.0|     0.0|       0.0|       0.0|
+-------------+--------+----------+-------+--------+----------+----------+


 Missing Values Summary For: ORDERS
+--------+-------+----------+------------+------------+
|order_id|user_id|order_date|order_status|total_amount|
+--------+-------+----------+------------+------------+
|     0.0|    0.0|       0.0|         0.0|         0.0

# Duplicate Check Using Primary Keys

In [None]:
# Dataset name -> column checked for duplicate values.
Primary_keys ={
    "users":"user_id",
    "orders":"order_id",
    "order_items":"order_item_id",
    "products":"product_id",
    "reviews":"review_id",
    "events":"event_id"
}

# Count, per dataset, how many key values occur more than once.
# The loop variable is deliberately not called `df`: using `df` as the loop
# target would rebind the dataset dict to the last DataFrame and break any
# later `df.items()` call.
for name, dataset_df in df.items():
  key = Primary_keys.get(name)
  if key:  # datasets without a configured key are skipped
    dup_count = dataset_df.groupBy(key).count().filter("count > 1").count()
    print(f"Duplicate {key} in {name}: {dup_count}")

Duplicate event_id in events: 0
Duplicate order_item_id in order_items: 0
Duplicate order_id in orders: 0
Duplicate product_id in products: 0
Duplicate review_id in reviews: 0
Duplicate user_id in users: 0


# Summary Statistics for Numerical Columns


In [None]:
# This cell needs `df` to be the {dataset name: DataFrame} mapping. Re-read the
# CSVs if the name currently refers to something else (for example a single
# DataFrame left over from a loop that used `df` as its loop variable).
if not isinstance(df, dict):
  df = {name: spark.read.csv(path, header=True, inferSchema=True) for name, path in Files.items()}

# Print describe() statistics for the numeric columns of every dataset.
# The loop variable is deliberately not called `df` so the mapping stays intact.
for name, dataset_df in df.items():
  print(f"\n Summary Stats for: {name.upper()}")
  # Select by schema type rather than by dtype string so that every numeric
  # column (smallint, decimal, ... as well as int/bigint/double/float) is included.
  numeric_cols = [f.name for f in dataset_df.schema.fields if isinstance(f.dataType, NumericType)]
  if numeric_cols:
    dataset_df.select(numeric_cols).describe().show()
print("\n Day 51 Completed - Data Profilling Done ")


 Summary Stats for: EVENTS

 Summary Stats for: ORDER_ITEMS
+-------+------------------+------------------+-----------------+
|summary|          quantity|        item_price|       item_total|
+-------+------------------+------------------+-----------------+
|  count|             43525|             43525|            43525|
|   mean|1.3973118897185526|196.63697622056128|273.8350132108008|
| stddev|0.6607172757815206|  297.259223858562|471.5851039154428|
|    min|                 1|              1.11|             1.11|
|    max|                 3|           2338.13|          7014.39|
+-------+------------------+------------------+-----------------+


 Summary Stats for: ORDERS
+-------+-----------------+
|summary|     total_amount|
+-------+-----------------+
|  count|            20000|
|   mean|595.9334475000019|
| stddev|776.0633359009934|
|    min|             1.11|
|    max|          7950.74|
+-------+-----------------+


 Summary Stats for: PRODUCTS
+-------+------------------+-----