In [0]:
!pip install kaggle

In [0]:
import getpass
import os

# The Kaggle CLI picks up KAGGLE_USERNAME / KAGGLE_KEY from the environment.
# The API key is a secret, so it is requested at run time instead of being
# stored in the notebook, where it would travel with every commit and export.
# NOTE(review): any key that was ever saved in this cell should be treated as
# leaked and regenerated from the Kaggle account settings.
os.environ["KAGGLE_USERNAME"] = "sumeshmajee"

# Reuse a key that is already present in the environment (e.g. injected by the
# cluster configuration); otherwise prompt for it without echoing the input.
if not os.environ.get("KAGGLE_KEY"):
    os.environ["KAGGLE_KEY"] = getpass.getpass("Kaggle API key: ")

print("Kaggle credentials configured!")

In [0]:
# Make sure the `ecommerce` schema exists in the `workspace` catalog.
# IF NOT EXISTS keeps this cell safe to run more than once.
create_schema_stmt = """
CREATE SCHEMA IF NOT EXISTS workspace.ecommerce
"""
spark.sql(create_schema_stmt)

In [0]:
# Make sure the `ecommerce_data` volume exists inside workspace.ecommerce.
# IF NOT EXISTS keeps this cell safe to run more than once.
create_volume_stmt = """
CREATE VOLUME IF NOT EXISTS workspace.ecommerce.ecommerce_data
"""
spark.sql(create_volume_stmt)

In [0]:
%sh
# Download the dataset archive into the Unity Catalog volume.
# Stop immediately if the volume directory is unavailable, so the archive is
# never written to whatever directory the shell happened to start in.
cd /Volumes/workspace/ecommerce/ecommerce_data || exit 1
kaggle datasets download -d mkechinov/ecommerce-behavior-data-from-multi-category-store

In [0]:
%sh
# Extract the downloaded archive in place (-o overwrites files from a previous
# run without prompting), then list the directory to confirm the result.
# Stop immediately if the volume directory is unavailable.
cd /Volumes/workspace/ecommerce/ecommerce_data || exit 1
unzip -o ecommerce-behavior-data-from-multi-category-store.zip
ls -lh

In [0]:
%sh
# Delete the archive once it has been extracted (-f: no error if it is already
# gone), then list what is left in the volume.
# Stop immediately if the volume directory is unavailable, so `rm` can never
# run in an unintended directory.
cd /Volumes/workspace/ecommerce/ecommerce_data || exit 1
rm -f ecommerce-behavior-data-from-multi-category-store.zip
ls -lh

In [0]:
# Restart the notebook's Python process. All Python state defined in earlier
# cells (variables, imports, os.environ changes) is discarded.
dbutils.library.restartPython()

In [0]:
# November 2019 events. The file's first row holds the column names (a later
# cell reads this same file with header=True and selects columns by name), so
# parse it as a header instead of letting it become a data row with `_c0`,
# `_c1`, ... string columns. inferSchema adds one extra pass over the file to
# derive column types.
df_n = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# October 2019 events. Read the first row as a header so it is not counted as
# an event and the columns get real names; inferSchema adds one extra pass over
# the file to derive column types instead of leaving every column a string.
# NOTE(review): assumes the October file has the same header layout as the
# November file - confirm.
df = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Summarise the October frame: total row count first, then its schema under a
# 60-character banner.
total_events = df.count()
banner = "=" * 60

print(f"October 2019 - Total Events: {total_events:,}")
print("\n" + banner)
print("SCHEMA:")
print(banner)
df.printSchema()

In [0]:
# Print a banner, then the first five rows with full (untruncated) cell values.
separator = "=" * 60
for line in ("\n" + separator, "SAMPLE DATA (First 5 rows):", separator):
    print(line)
df.show(5, truncate=False)

Day 1


In [0]:
# Build a tiny in-memory DataFrame of (product, price) pairs.
# NOTE: this rebinds `df`, which an earlier cell used for the October 2019 data.
data = [("iPhone", 999), ("Samsung", 799), ("MacBook", 1299)]
df = spark.createDataFrame(data, schema=["product", "price"])
df.show()

# Keep only the products priced above 1000.
df.where(df["price"] > 1000).show()


In [0]:
import os
import getpass

# 1. Enter your token securely when prompted (It won't be saved in the file)
print("Paste your GitHub Token below and hit Enter:")
GITHUB_TOKEN = getpass.getpass()

# --- Everything else stays the same ---
USERNAME = "SRMajee"
REPO_NAME = "DataBricks_Challenge"
auth_url = f"https://{GITHUB_TOKEN}@github.com/{USERNAME}/{REPO_NAME}.git"

# 2. Update Remote
!git -c safe.directory='*' remote set-url origin {auth_url}

# 3. Add files (Now the file doesn't contain the secret!)
!git -c safe.directory='*' add .

# 4. Commit
!git -c safe.directory='*' commit -m "Secure push using getpass"

# 5. Push
!git -c safe.directory='*' push origin main

Day 2

In [0]:
# Read the November 2019 CSV into `events`, taking column names from the header
# row and letting Spark infer the column types.
events = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)


In [0]:

# Basic operations

# Peek at three columns of the first 10 events.
events.select("event_time", "product_id", "price").show(10)

# A bare expression in the middle of a cell is evaluated but never rendered,
# so capture the count and print it explicitly.
expensive_event_count = events.filter("price > 100").count()
print(f"Events with price > 100: {expensive_event_count:,}")

# Number of events per distinct timestamp.
# NOTE(review): grouping by the raw timestamp gives one group per instant;
# possibly `event_type` was intended - confirm.
events.groupBy("event_time").count().show()

# Five most frequent brands by event count; shown so the result is not
# silently dropped.
top_brands = events.groupBy("brand").count().orderBy("count", ascending=False).limit(5)
top_brands.show()


In [0]:
import os
import getpass

# 1. UNDO the last commit (so we can save it again with the private email)
!git -c safe.directory='*' reset --soft HEAD~1

# 2. SET PRIVATE IDENTITY (Using your GitHub no-reply alias)
# This format (username@users.noreply.github.com) hides your real email
os.environ["GIT_AUTHOR_NAME"] = "Sumesh Majee"
os.environ["GIT_AUTHOR_EMAIL"] = "SRMajee@users.noreply.github.com"
os.environ["GIT_COMMITTER_NAME"] = "Sumesh Majee"
os.environ["GIT_COMMITTER_EMAIL"] = "SRMajee@users.noreply.github.com"

# 3. ASK FOR TOKEN
print("Paste your GitHub Token below and hit Enter:")
GITHUB_TOKEN = getpass.getpass()

# 4. PREPARE URL
USERNAME = "SRMajee"
REPO_NAME = "DataBricks_Challenge"
auth_url = f"https://{GITHUB_TOKEN}@github.com/{USERNAME}/{REPO_NAME}.git"

# 5. COMMIT & PUSH
!git -c safe.directory='*' remote set-url origin {auth_url}
!git -c safe.directory='*' commit -m "Day 2"
!git -c safe.directory='*' push origin main

Day 3

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Top 5 products by revenue: sum the price of purchase events per product.
# Group by product_id once; listing the same column twice produces two
# identically named columns in the result.
revenue = (
    events.filter(F.col("event_type") == "purchase")
    .groupBy("product_id")
    .agg(F.sum("price").alias("revenue"))
    .orderBy(F.desc("revenue"))
    .limit(5)
)

# Running event count per user, ordered by event time.
# With an ORDER BY and no explicit frame, Spark uses a range frame from the
# start of the partition to the current row, so events of one user that share
# the same event_time all receive the same cumulative value.
window = Window.partitionBy("user_id").orderBy("event_time")
events_with_cumulative = events.withColumn(
    "cumulative_events",
    F.count("*").over(window)
)

# Conversion rate by category, using conditional aggregation instead of a pivot.
conversion = (
    events.groupBy("category_code")
    .agg(
        F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("purchase"),
        F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0)).alias("view")
    )
    .withColumn(
        "conversion_rate",
        # Categories with no views get NULL instead of dividing by zero, which
        # raises an error when ANSI SQL mode is enabled.
        F.when(F.col("view") > 0, F.col("purchase") / F.col("view") * 100)
    )
)

display(revenue)
display(events_with_cumulative)
display(conversion)

In [0]:
import os
import getpass

# 1. UNDO the last commit (so we can try the process again from a clean state)
# We do this to ensure the commit and pull happen in the right order.
!git -c safe.directory='*' reset --soft HEAD~1

# 2. SET PRIVATE IDENTITY
os.environ["GIT_AUTHOR_NAME"] = "Sumesh Majee"
os.environ["GIT_AUTHOR_EMAIL"] = "SRMajee@users.noreply.github.com"
os.environ["GIT_COMMITTER_NAME"] = "Sumesh Majee"
os.environ["GIT_COMMITTER_EMAIL"] = "SRMajee@users.noreply.github.com"

# 3. ASK FOR TOKEN
print("Paste your GitHub Token below and hit Enter:")
GITHUB_TOKEN = getpass.getpass()

# 4. PREPARE URL
USERNAME = "SRMajee"
REPO_NAME = "DataBricks_Challenge"
auth_url = f"https://{GITHUB_TOKEN}@github.com/{USERNAME}/{REPO_NAME}.git"

# 5. COMMIT, PULL (WITH FIX), THEN PUSH
!git -c safe.directory='*' remote set-url origin {auth_url}
!git -c safe.directory='*' commit -m "Day 3"

# FIX IS HERE: We added '--no-rebase' to tell Git to use the default merge strategy
!git -c safe.directory='*' pull origin main --no-rebase --no-edit

# NOW PUSH
!git -c safe.directory='*' push origin main

Day 4

In [0]:
# Use a Unity Catalog volume directory path, not a file
volume_path = "/Volumes/workspace/ecommerce/ecommerce_data/events_delta"

# Write DataFrame as Delta files to the directory, replacing earlier output.
events.write.format("delta").mode("overwrite").save(volume_path)

# Register the data as a table in Unity Catalog. Without an explicit mode the
# writer fails when the table already exists, so overwrite to keep the cell
# re-runnable.
events.write.format("delta").mode("overwrite").saveAsTable("workspace.ecommerce.events_table")

# SQL approach to create a managed Delta table. CREATE OR REPLACE (rather than
# a plain CREATE) lets the statement succeed on repeated runs.
spark.sql("""
    CREATE OR REPLACE TABLE workspace.ecommerce.events_delta
    USING DELTA
    AS SELECT * FROM workspace.ecommerce.events_table
""")

# Test schema enforcement: appending rows whose columns (x, y, z) do not match
# the data already written at volume_path is expected to be rejected.
try:
    wrong_schema = spark.createDataFrame([("a","b","c")], ["x","y","z"])
    wrong_schema.write.format("delta").mode("append").save(volume_path)
except Exception as e:
    # The failure is the demonstration, so report it instead of re-raising.
    print(f"Schema enforcement: {e}")
else:
    # Reaching this branch means the mismatched rows were written after all.
    print("WARNING: mismatched schema was appended; schema enforcement did not trigger")

In [0]:
import os
import getpass

# # 1. UNDO the last commit (so we can try the process again from a clean state)
# # We do this to ensure the commit and pull happen in the right order.
# !git -c safe.directory='*' reset --soft HEAD~1

# 2. SET PRIVATE IDENTITY
os.environ["GIT_AUTHOR_NAME"] = "Sumesh Majee"
os.environ["GIT_AUTHOR_EMAIL"] = "SRMajee@users.noreply.github.com"
os.environ["GIT_COMMITTER_NAME"] = "Sumesh Majee"
os.environ["GIT_COMMITTER_EMAIL"] = "SRMajee@users.noreply.github.com"

# 3. ASK FOR TOKEN
print("Paste your GitHub Token below and hit Enter:")
GITHUB_TOKEN = getpass.getpass()

# 4. PREPARE URL
USERNAME = "SRMajee"
REPO_NAME = "DataBricks_Challenge"
auth_url = f"https://{GITHUB_TOKEN}@github.com/{USERNAME}/{REPO_NAME}.git"

# 5. ADD, COMMIT, PULL, THEN PUSH
!git -c safe.directory='*' remote set-url origin {auth_url}

# --- FIX: ADD FILES FIRST ---
!git -c safe.directory='*' add . 
# ----------------------------

!git -c safe.directory='*' commit -m "Day 4"

# Pull just in case there are remote changes
!git -c safe.directory='*' pull origin main --no-rebase --no-edit

# NOW PUSH
!git -c safe.directory='*' push origin main