In [0]:
# =====================================================
# USE CASE 5: Customer Journey & Influence Mapping (GraphX Alternative)
# Project: Viewer Churn Prediction for OTT Platforms
# Platform: Databricks Free Edition
# Author: Erugurala Teja (24MBMB19)
# =====================================================

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, lit, when, row_number, rand
from pyspark.sql.window import Window

In [0]:
# Step 1️⃣: Initialize Spark Session
spark = SparkSession.builder.appName("OTT_Influence_Graph_PySpark").getOrCreate()


In [0]:
# Step 2️⃣: Load OTT Reviews Dataset
input_path = "/Volumes/workspace/default/dataset/ott_reviews.csv"
df = spark.read.csv(input_path, header=True, inferSchema=True)
print(f"✅ Loaded {df.count()} records from {input_path}")


✅ Loaded 6000 records from /Volumes/workspace/default/dataset/ott_reviews.csv


In [0]:
# Step 3️⃣: Data Cleaning
df_clean = (
    df.dropna(subset=["userName", "content"])
      .withColumn("content", regexp_replace(lower(col("content")), "[^a-z ]", ""))
)

In [0]:
# Step 4️⃣: Create Vertices (Unique Users)
vertices = df_clean.select("userName").distinct().withColumnRenamed("userName", "id")
print(f"✅ Created {vertices.count()} unique user vertices.")


✅ Created 5882 unique user vertices.


In [0]:
# =====================================================
# Step 5️⃣: Simulate Viewer Influence Network (No Window Frame Errors)
# =====================================================

from pyspark.sql.functions import monotonically_increasing_id, rand

# Assign each user a unique numeric ID for pairing
vertices_with_id = vertices.withColumn("user_id", monotonically_increasing_id())

# Randomly join users to simulate influence relationships
edges = (
    vertices_with_id.alias("a")
    .join(vertices_with_id.alias("b"), col("a.user_id") != col("b.user_id"))
    .select(
        col("a.id").alias("src_user"),
        col("b.id").alias("dst_user")
    )
    .sample(False, 0.001, seed=42)  # keep small sample for performance
    .withColumn("interaction_strength", rand())
)

print(f"✅ Created {edges.count()} simulated viewer connections (edges).")


✅ Created 34550 simulated viewer connections (edges).


In [0]:
# =====================================================
# Step 6️⃣: Influence Score (Degree Centrality)
# =====================================================

# Influence = number of times a user appears as a target (incoming edges)
influence_df = (
    edges.groupBy("dst_user")
         .count()
         .withColumnRenamed("count", "influence_score")
         .withColumnRenamed("dst_user", "id")
)

# Normalize influence score between 0 and 1
max_score = influence_df.agg({"influence_score": "max"}).collect()[0][0]
influence_df = influence_df.withColumn("influence_score", col("influence_score") / lit(max_score))
print("✅ Influence scores calculated (normalized).")


✅ Influence scores calculated (normalized).


In [0]:
# =====================================================
# Step 7️⃣: Viewer Communities (Simulated Clustering)
# =====================================================

# Randomly assign community groups to simulate clusters
vertices_clustered = vertices.withColumn("component", (rand() * 10).cast("int"))
print("✅ Viewer clusters (components) assigned randomly (10 groups).")


✅ Viewer clusters (components) assigned randomly (10 groups).


In [0]:
# =====================================================
# Step 8️⃣: Combine Influence + Clusters
# =====================================================

final_graph_summary = (
    vertices_clustered.join(influence_df, "id", "left")
                      .fillna({"influence_score": 0.0})
                      .orderBy(col("influence_score").desc())
)

display(final_graph_summary)

id,component,influence_score
Anilkamalasanan 8,8,1.0
UDEME JIMMY,4,0.8888888888888888
Andrew Hennessy,2,0.8888888888888888
JJ,9,0.8888888888888888
Sanjana Kamble,5,0.8888888888888888
Johan Kashyap,0,0.8888888888888888
Appu Anandkumar,5,0.8888888888888888
stanzin padma,3,0.8888888888888888
A Gan,3,0.8333333333333334
Shilpa Khude,0,0.7777777777777778


In [0]:
# =====================================================
# Step 9️⃣: Export to Volumes for Viva
# =====================================================

export_path = "/Volumes/workspace/default/dataset/ott_influence_graph_export"
final_graph_summary.coalesce(1).write.mode("overwrite").option("header", True).csv(export_path)

print("✅ Exported successfully!")
print("📂 Download Instructions:")
print("1️⃣  Go to: Data ➜ Volumes ➜ workspace ➜ default ➜ dataset ➜ ott_influence_graph_export")
print("2️⃣  Locate: part-00000-xxxxx.csv → Right-click ➜ Download")
print("===============================================")
print("📈 HUMANIZED OUTPUT SUMMARY")
final_graph_summary.show(10, truncate=False)
print("===============================================")

✅ Exported successfully!
📂 Download Instructions:
1️⃣  Go to: Data ➜ Volumes ➜ workspace ➜ default ➜ dataset ➜ ott_influence_graph_export
2️⃣  Locate: part-00000-xxxxx.csv → Right-click ➜ Download
📈 HUMANIZED OUTPUT SUMMARY
+-----------------+---------+------------------+
|id               |component|influence_score   |
+-----------------+---------+------------------+
|Anilkamalasanan 8|8        |1.0               |
|Sanjana Kamble   |5        |0.8888888888888888|
|UDEME JIMMY      |4        |0.8888888888888888|
|Appu Anandkumar  |5        |0.8888888888888888|
|JJ               |9        |0.8888888888888888|
|stanzin padma    |3        |0.8888888888888888|
|Andrew Hennessy  |2        |0.8888888888888888|
|Johan Kashyap    |0        |0.8888888888888888|
|A Gan            |3        |0.8333333333333334|
|Sriram TR        |9        |0.7777777777777778|
+-----------------+---------+------------------+
only showing top 10 rows
