In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace

# created as 'spark'
spark = SparkSession.builder.appName('Session1').getOrCreate()

# Define the path to your CSV file on Google Drive
drive_csv_path = '/content/drive/MyDrive/Colab Notebooks/Digital_Marketing/optimized_output.csv'

# Read the CSV file into a PySpark DataFrame
df = spark.read.csv(drive_csv_path, header=True, inferSchema=True, sep=',')

# Rename columns to replace spaces with underscores for easier access
for column_name in df.columns:
    new_column_name = column_name.replace(' ', '_')
    df = df.withColumnRenamed(column_name, new_column_name)

# Convert string columns with commas to numeric types
df = df.withColumn(
    "product_detail_view_per_app_session",
    regexp_replace(col("product_detail_view_per_app_session"), ",", ".").cast("double")
)
df = df.withColumn(
    "add_to_cart_per_session",
    regexp_replace(col("add_to_cart_per_session"), ",", ".").cast("double")
)
df = df.withColumn(
    "avg_order_value",
    regexp_replace(col("avg_order_value"), ",", ".").cast("double")
)
df = df.withColumn(
    "discount_rate_per_visited_products",
    regexp_replace(col("discount_rate_per_visited_products"), ",", ".").cast("double")
)

# Display the schema and first few rows to verify
print("Schema of DataFrame read from Drive:")
df.printSchema()
print("First 5 rows of DataFrame read from Drive:")
df.show(5)

Schema of DataFrame read from Drive:
root
 |-- account_length: integer (nullable = true)
 |-- location_code: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- add_to_wishlist: integer (nullable = true)
 |-- desktop_sessions: integer (nullable = true)
 |-- app_sessions: integer (nullable = true)
 |-- desktop_transactions: integer (nullable = true)
 |-- total_product_detail_views: integer (nullable = true)
 |-- session_duration: integer (nullable = true)
 |-- promotion_clicks: integer (nullable = true)
 |-- avg_order_value: double (nullable = true)
 |-- sale_product_views: integer (nullable = true)
 |-- discount_rate_per_visited_products: double (nullable = true)
 |-- product_detail_view_per_app_session: double (nullable = true)
 |-- app_transactions: integer (nullable = true)
 |-- add_to_cart_per_session: double (nullable = true)
 |-- customer_service_calls: integer (nullable = true)
 |-- churn: integer (nullable = true)
 |-- avg_app_sessions_by_location: double (nu

In [None]:
# Model Pipeline
from pyspark.ml.feature import (
    StringIndexer, VectorAssembler, StandardScaler
)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline


In [None]:
# Feature Vectorization
assembler = VectorAssembler(
    inputCols=[
        "account_length",
        "desktop_sessions",
        "app_sessions",
        "avg_order_value",
        "discount_rate_per_visited_products",
        "product_detail_view_per_app_session",
        "add_to_cart_per_session",
        "customer_service_calls",
        "cc_saved_idx",
        "push_status_idx"
    ],
    outputCol="features"
)

In [None]:
# Model Definition
lr = LogisticRegression(
    featuresCol="features",
    labelCol="churn",
    maxIter=20
)


In [None]:
# Pipeline & Training
pipeline = Pipeline(stages= [assembler, lr])

train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_df)

In [None]:
# Model Evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(test_df)

evaluator = BinaryClassificationEvaluator(
    labelCol="churn",
    metricName="areaUnderROC"
)

auc = evaluator.evaluate(predictions)
print(f"AUC Score: {auc}")


AUC Score: 0.8396356275303687


In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [None]:
# Feature Vector
assembler = VectorAssembler(
    inputCols=[
        "account_length",
        "desktop_sessions",
        "app_sessions",
        "desktop_transactions",
        "app_transactions",
        "total_product_detail_views",
        "session_duration",
        "promotion_clicks",
        "avg_order_value", # Now directly 'double'
        "sale_product_views",
        "discount_rate_per_visited_products", # Now directly 'double'
        "product_detail_view_per_app_session", # Now directly 'double'
        "add_to_cart_per_session", # Now directly 'double'
        "customer_service_calls",
        "avg_app_sessions_by_location",
        "transaction_per_session",
        "cc_saved_idx",
        "push_status_idx"
    ],
    outputCol="features"
)

In [None]:
# GBM Model
gbt = GBTClassifier(
    labelCol="churn",
    featuresCol="features",
    seed=42
)

In [None]:
# Parameter Grid (Carefully Chosen)
paramGrid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [4, 6, 8])
    .addGrid(gbt.maxIter, [20, 50, 80])
    .addGrid(gbt.stepSize, [0.05, 0.1])
    .build()
)


Why these parameters?

*   maxDepth → controls interaction complexity
*   maxIter → number of boosting trees
*   stepSize → learning rate (smaller = safer)



In [None]:
# Cross Validator
evaluator = BinaryClassificationEvaluator(
    labelCol="churn",
    metricName="areaUnderROC"
)

pipeline = Pipeline(stages=[assembler, gbt])

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4
)


### Hyperparameter Tuning
A Grid Search with 3-fold cross-validation was performed on the GBM model to optimize tree depth, number of boosting iterations, and learning rate. The tuned model demonstrated improved AUC and better generalization compared to default parameters.


In [None]:
feature_columns = [
    "account_length",
    "desktop_sessions",
    "app_sessions",
    "desktop_transactions",
    "app_transactions",
    "total_product_detail_views",
    "session_duration",
    "promotion_clicks",
    "avg_order_value", # Now directly 'double'
    "sale_product_views",
    "discount_rate_per_visited_products", # Now directly 'double'
    "product_detail_view_per_app_session", # Now directly 'double'
    "add_to_cart_per_session", # Now directly 'double'
    "customer_service_calls",
    "avg_app_sessions_by_location",
    "transaction_per_session",
    "cc_saved_idx",
    "push_status_idx"
]

# Drop rows with nulls in the feature columns
df_cleaned = df.na.drop(subset=feature_columns)


Grid search with cross-validation was used to tune GBM hyperparameters, ensuring optimal bias–variance tradeoff and robust churn prediction performance.

In [None]:
# Train-test split
train_df, test_df = df_cleaned.randomSplit([0.8, 0.2], seed=42)

# Fit grid search model
cv_model = cv.fit(train_df)

# Evaluate best model
best_model = cv_model.bestModel
predictions = best_model.transform(test_df)

auc = evaluator.evaluate(predictions)
print(f"Final AUC: {auc}")


Final AUC: 0.9308123249299687


In [57]:
# Understanding Best Model
best_gbt = best_model.stages[-1]

print("Best GBM Parameters:")
print("Max Depth:", best_gbt.getMaxDepth())
print("Max Iterations:", best_gbt.getMaxIter())
print("Step Size:", best_gbt.getStepSize())


Best GBM Parameters:
Max Depth: 4
Max Iterations: 80
Step Size: 0.1


In [58]:
# Checking whether features makes sense for model
importances = best_gbt.featureImportances

for f, imp in zip(assembler.getInputCols(), importances):
    print(f"{f}: {round(imp, 4)}")


account_length: 0.0328
desktop_sessions: 0.1343
app_sessions: 0.0689
desktop_transactions: 0.0381
app_transactions: 0.0651
total_product_detail_views: 0.0238
session_duration: 0.1311
promotion_clicks: 0.0104
avg_order_value: 0.0693
sale_product_views: 0.0146
discount_rate_per_visited_products: 0.0106
product_detail_view_per_app_session: 0.1071
add_to_cart_per_session: 0.0
customer_service_calls: 0.0765
avg_app_sessions_by_location: 0.0016
transaction_per_session: 0.0213
cc_saved_idx: 0.1066
push_status_idx: 0.088


The GBM model highlights engagement intensity, session quality, and payment convenience as the strongest drivers of customer churn. Behavioral features such as desktop sessions (0.134), session duration (0.131), and product detail views per app session (0.107) carry the highest importance, indicating that how deeply customers engage matters more than raw activity counts.

Payment and notification signals like credit card saved status (0.107) and push notification enablement (0.088) also play a significant role, suggesting that reduced friction and proactive communication help retain users.

Interestingly, customer service calls (0.077) and average order value (0.069) show moderate influence, reflecting post-purchase experience and customer value as key churn indicators. Conversely, features such as add-to-cart per session (0.0) and location-level aggregates (0.002) contribute little, implying limited predictive power in isolation.

Overall, the model learns that depth of engagement and ease of transactions outweigh surface-level interactions in predicting churn.