In [0]:
# =====================================================
# USE CASE 1: Predictive Churn Model Development (MLlib)
# Project: Viewer Churn Prediction for OTT Platforms
# Platform: Databricks (Free Edition)
# Author: Erugurala Teja (24MBMB19)
# =====================================================

# Step 1️⃣: Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, length, regexp_replace, lower
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
import pandas as pd

In [0]:
# Step 2️⃣: Import Required Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, length, regexp_replace, lower
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
import pandas as pd

In [0]:
# Step 3️⃣: Initialize Spark Session
spark = SparkSession.builder.appName("OTT_Churn_Model").getOrCreate()


In [0]:
# Step 4️⃣: Load the Dataset
# Ensure your file exists in the mentioned path
data_path = "/Volumes/workspace/default/dataset/ott_reviews.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

print("✅ Data successfully loaded!")
print(f"Total Records: {df.count()}")
df.printSchema()

✅ Data successfully loaded!
Total Records: 6000
root
 |-- app_name: string (nullable = true)
 |-- reviewId: string (nullable = true)
 |-- userName: string (nullable = true)
 |-- content: string (nullable = true)
 |-- score: string (nullable = true)
 |-- at: string (nullable = true)



In [0]:
# Step 5️⃣: Basic Data Cleaning
# Remove null/empty reviews, clean special characters, and lowercase
df_clean = df.dropna(subset=["content", "score"])
df_clean = df_clean.withColumn("content", regexp_replace("content", "[^a-zA-Z ]", " "))
df_clean = df_clean.withColumn("content", lower(col("content")))
df_clean = df_clean.filter(length(col("content")) > 20)

print("✅ Data cleaned successfully — sample records:")
df_clean.select("app_name", "score", "content").show(5, truncate=False)


✅ Data cleaned successfully — sample records:
+--------+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|app_name|score               |content                                                                                                                                                                                                                                                                                                                                                                                               

In [0]:
from pyspark.sql.functions import col, when, expr

# Step 6️⃣: Ensure 'score' column is numeric (safe casting)
# Some values might be text; we use try_cast to handle that gracefully
df_fixed = df_clean.withColumn("score_num", expr("try_cast(score as double)"))

# Drop rows where score cannot be converted to a number
df_fixed = df_fixed.dropna(subset=["score_num"])

print("✅ Score column successfully converted to numeric!")
df_fixed.select("app_name", "score", "score_num").show(5)


✅ Score column successfully converted to numeric!
+--------+-----+---------+
|app_name|score|score_num|
+--------+-----+---------+
| Netflix|    5|      5.0|
| Netflix|    1|      1.0|
| Netflix|    1|      1.0|
| Netflix|    1|      1.0|
| Netflix|    5|      5.0|
+--------+-----+---------+
only showing top 5 rows


In [0]:
# Step 7️⃣: Create Target Variable (Churn Label)
# Logic: score <= 2 → likely churn, score >= 4 → retained
df_labeled = df_fixed.withColumn(
    "churn_label",
    when(col("score_num") <= 2, 1).when(col("score_num") >= 4, 0).otherwise(None)
).dropna(subset=["churn_label"])

print("✅ Target variable 'churn_label' created successfully!")
df_labeled.select("app_name", "score_num", "churn_label").show(5)


✅ Target variable 'churn_label' created successfully!
+--------+---------+-----------+
|app_name|score_num|churn_label|
+--------+---------+-----------+
| Netflix|      5.0|          0|
| Netflix|      1.0|          1|
| Netflix|      1.0|          1|
| Netflix|      1.0|          1|
| Netflix|      5.0|          0|
+--------+---------+-----------+
only showing top 5 rows


In [0]:
# =====================================================
# CONTINUE PIPELINE — TEXT PREPROCESSING & MODELING
# =====================================================

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

In [0]:
# Step 8️⃣: Text Preprocessing Pipeline
tokenizer = Tokenizer(inputCol="content", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
tf = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=2000)
idf = IDF(inputCol="rawFeatures", outputCol="features")


In [0]:
# Step 9️⃣: Model Definition (Logistic Regression)
lr = LogisticRegression(labelCol="churn_label", featuresCol="features", maxIter=10)


In [0]:
# Step 🔟: Create Pipeline
pipeline = Pipeline(stages=[tokenizer, remover, tf, idf, lr])


In [0]:
# Step 1️⃣1️⃣: Split Data (Train/Test)
train_df, test_df = df_labeled.randomSplit([0.8, 0.2], seed=42)
print(f"✅ Data split completed — Train: {train_df.count()}, Test: {test_df.count()}")


✅ Data split completed — Train: 2321, Test: 568


In [0]:
# Step 1️⃣2️⃣: Train Model
model = pipeline.fit(train_df)
print("✅ Model training completed successfully!")


✅ Model training completed successfully!


In [0]:
# Step 1️⃣3️⃣: Generate Predictions
predictions = model.transform(test_df)
print("✅ Predictions generated successfully — preview:")
predictions.select("app_name", "score_num", "churn_label", "probability", "prediction").show(5, truncate=False)


✅ Predictions generated successfully — preview:
+------------------+---------+-----------+-------------------------------------------+----------+
|app_name          |score_num|churn_label|probability                                |prediction|
+------------------+---------+-----------+-------------------------------------------+----------+
|Amazon Prime Video|2.0      |1          |[0.4195849417861692,0.5804150582138308]    |1.0       |
|Amazon Prime Video|1.0      |1          |[1.8150127631962167E-6,0.9999981849872368] |1.0       |
|Amazon Prime Video|1.0      |1          |[8.106512443979171E-8,0.9999999189348756]  |1.0       |
|Amazon Prime Video|1.0      |1          |[7.555650576944076E-7,0.9999992444349423]  |1.0       |
|Amazon Prime Video|1.0      |1          |[3.3558786676488055E-10,0.9999999996644121]|1.0       |
+------------------+---------+-----------+-------------------------------------------+----------+
only showing top 5 rows


In [0]:
# Step 1️⃣4️⃣: Evaluate Model Performance
evaluator = BinaryClassificationEvaluator(labelCol="churn_label", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"📊 Model Evaluation Completed — AUC Score: {auc:.3f}")


📊 Model Evaluation Completed — AUC Score: 0.797


In [0]:
# Step 1️⃣5️⃣: Humanized Output Summary
total = predictions.count()
churned = predictions.filter(col("prediction") == 1).count()
retained = predictions.filter(col("prediction") == 0).count()

print("===============================================")
print("📈 HUMANIZED OUTPUT SUMMARY")
print(f"Total Evaluated Reviews: {total}")
print(f"Predicted Churned Users : {churned}")
print(f"Predicted Retained Users: {retained}")
print(f"Churn Ratio: {(churned / total * 100):.2f}%")
print(f"Model Accuracy (AUC): {auc:.2f}")
print("===============================================")


📈 HUMANIZED OUTPUT SUMMARY
Total Evaluated Reviews: 568
Predicted Churned Users : 423
Predicted Retained Users: 145
Churn Ratio: 74.47%
Model Accuracy (AUC): 0.80


In [0]:
# Step 1️⃣6️⃣: Save Predictions to CSV (Downloadable)
output_path = "/Volumes/workspace/default/dataset/ott_churn_predictions"
predictions.select(
    "app_name", "userName", "score_num", "content", "churn_label", "prediction"
).coalesce(1).write.mode("overwrite").option("header", True).csv(output_path)

print(f"✅ Predictions saved successfully to: {output_path}")
print("📂 Download from: Databricks → Data → Volumes → workspace → default → dataset → ott_churn_predictions")


✅ Predictions saved successfully to: /Volumes/workspace/default/dataset/ott_churn_predictions
📂 Download from: Databricks → Data → Volumes → workspace → default → dataset → ott_churn_predictions
