In [None]:
pip install delta-spark

In [None]:
pip install mlflow

In [None]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Step 1: Describe a Spark session that has the Delta SQL extension and the
# Delta catalog registered under `spark_catalog`.
builder = (
    SparkSession.builder
    .appName("ChurnDeltaMLPipeline")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# The builder is passed through delta's pip helper before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Step 2: Read the raw churn CSV. The first row is treated as the header and
# column types are inferred by Spark.
csv_path = "Downloads/telecom_churn_8000.csv"
csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
df = csv_reader.csv(csv_path)

# Step 3: Persist the raw frame as a Delta table, overwriting whatever is
# already stored at the target path.
delta_path = "Downloads/telco_delta"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(delta_path)
)

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Read the Delta table back and cast Churn to string so that it can be fed
# to a StringIndexer below.
df_raw = (
    spark.read.format("delta")
    .load("Downloads/telco_delta")
    .withColumn("Churn", col("Churn").cast("string"))
)

# Column groups that make up the model inputs and the target.
categorical_cols = ['International_plan', 'Voicemail_plan']
numeric_cols = [
    'Account_length',
    'Number_vmail_messages',
    'Total_day_minutes',
    'Total_eve_minutes',
    'Total_night_minutes',
    'Total_intl_minutes',
    'Customer_service_calls',
]
label_col = 'Churn'

# Step 1: One StringIndexer per categorical column, each writing "<name>_index".
indexed_cols = [f"{name}_index" for name in categorical_cols]
indexers = [
    StringIndexer(inputCol=name, outputCol=indexed_name)
    for name, indexed_name in zip(categorical_cols, indexed_cols)
]

# Step 2: Index the Churn column into the numeric "label" column.
indexers.append(StringIndexer(inputCol=label_col, outputCol="label"))

# Step 3: Numeric columns followed by the indexed categoricals form the feature vector.
feature_cols = numeric_cols + indexed_cols
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Step 4: Fit the preprocessing pipeline and keep only what the classifier needs.
pipeline = Pipeline(stages=indexers + [assembler])
model = pipeline.fit(df_raw)
df_processed = model.transform(df_raw).select("features", "label")


In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Step 1: 80/20 train/test split with a fixed seed.
train_df, test_df = df_processed.randomSplit([0.8, 0.2], seed=42)

# Step 2: Random forest with 50 trees, each limited to depth 5.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=50,
    maxDepth=5,
)

# Step 3: Fit on the training split.
rf_model = rf.fit(train_df)

# Step 4: Score the held-out split and compute the area under the ROC curve.
predictions = rf_model.transform(test_df)
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)

print(f"Test AUC: {auc:.4f}")

In [None]:
import mlflow
import mlflow.spark
from delta.tables import DeltaTable

# Select the MLflow experiment that collects all churn-model runs.
mlflow.set_experiment("/VisionaryRetail/ChurnPrediction")

# Location of the Delta table the model was trained from. This is the same
# cwd-relative path the table was written to and read from earlier in the
# notebook, so the version recorded below refers to that very table.
delta_path = "Downloads/telco_delta"

# Record hyperparameters, the test metric, the model artifact and the data
# version in a single run.
with mlflow.start_run(run_name="RandomForest_v1"):

    # Read the hyperparameters from the estimator that was actually fitted,
    # so the logged values cannot drift from the real configuration.
    mlflow.log_param("num_trees", rf.getOrDefault(rf.numTrees))
    mlflow.log_param("max_depth", rf.getOrDefault(rf.maxDepth))
    mlflow.log_metric("test_auc", auc)

    # Log the trained Spark MLlib model and register it in the model registry.
    mlflow.spark.log_model(
        rf_model,
        artifact_path="model",
        registered_model_name="ChurnPredictorV1"
    )

    # Only the most recent commit is needed, so limit the history to one row.
    dt = DeltaTable.forPath(spark, delta_path)
    history_df = dt.history(1)
    latest_version = history_df.select("version").first()["version"]

    # Tag the run with the table version that was current when it was logged.
    mlflow.set_tag("delta_version", latest_version)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Read the second batch of churn data; header row present, types inferred.
csv_path = "Downloads/newchurndata.csv"
df = spark.read.option("header", "true").option("inferSchema", "true").csv(csv_path)

# Rename the space-separated CSV headers to the underscore names used by the
# Delta table and fix the column types. Each column is cast first and aliased
# last so the output name does not depend on the alias surviving the cast.
df_renamed = df.select(
    col("Account length").cast("int").alias("Account_length"),
    col("International plan").alias("International_plan"),
    col("Voice mail plan").alias("Voicemail_plan"),
    col("Number vmail messages").cast("int").alias("Number_vmail_messages"),
    col("Total day minutes").cast("double").alias("Total_day_minutes"),
    col("Total eve minutes").cast("double").alias("Total_eve_minutes"),
    col("Total night minutes").cast("double").alias("Total_night_minutes"),
    col("Total intl minutes").cast("double").alias("Total_intl_minutes"),
    col("Customer service calls").cast("int").alias("Customer_service_calls"),
    # "Yes"/"No" become booleans; there is no otherwise() branch, so any other
    # value becomes null.
    when(col("Churn") == "Yes", True)
      .when(col("Churn") == "No", False)
      .alias("Churn")
)

# Append to the same cwd-relative Delta table that the raw data was written to,
# so the new rows become the next version of that table.
delta_path = "Downloads/telco_delta"
df_renamed.write.format("delta").mode("append").save(delta_path)

In [None]:
import mlflow
import mlflow.spark
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

# Select the MLflow experiment shared by all churn-model runs.
mlflow.set_experiment("/VisionaryRetail/ChurnPrediction")

# Load the latest version of the Delta table (includes the appended data).
df = spark.read.format("delta").load(delta_path)

# StringIndexer below is fed a string-typed Churn column.
df = df.withColumn("Churn", col("Churn").cast("string"))

# Label indexing and feature assembly; this model uses the numeric columns only.
indexer = StringIndexer(inputCol="Churn", outputCol="label")
assembler = VectorAssembler(
    inputCols=[
        "Account_length", "Number_vmail_messages", "Total_day_minutes",
        "Total_eve_minutes", "Total_night_minutes", "Total_intl_minutes",
        "Customer_service_calls"
    ],
    outputCol="features"
)

# Random forest with library-default hyperparameters.
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# Preprocessing and classifier are bundled so the logged model is self-contained.
pipeline_new = Pipeline(stages=[indexer, assembler, rf])

# Train with MLflow tracking.
with mlflow.start_run():
    model_new = pipeline_new.fit(df)

    # Log the fitted pipeline and register it in the model registry.
    mlflow.spark.log_model(model_new, artifact_path="model", registered_model_name="ChurnPredictorV2")

    # Tag the run with the latest Delta version; only the newest commit is
    # needed, so the history is limited to one row.
    dt = DeltaTable.forPath(spark, delta_path)
    version = dt.history(1).select("version").first()["version"]
    mlflow.set_tag("delta_version", version)

    print(f"✅ Model trained on Delta version {version} and registered as 'ChurnPredictorV2'")

In [None]:
# Time-travel to version 0 of the Delta table and cast Churn to string, the
# same preparation that was applied to the training frame.
df_v1 = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load(delta_path)
    .withColumn("Churn", col("Churn").cast("string"))
)

# Run the fitted pipeline over the version-0 snapshot.
df_v1_transformed = model_new.transform(df_v1)

# Peek at the first five rows of label, predicted class and class probabilities.
preview_v1 = df_v1_transformed.select("label", "prediction", "probability")
preview_v1.show(5)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Step 3: Load Delta version 1 (latest) — the one used for training
df_v2 = spark.read.format("delta").option("versionAsOf", 1).load(delta_path)

# Ensure Churn is a string, matching the preparation applied before training.
df_v2 = df_v2.withColumn("Churn", col("Churn").cast("string"))

# Predict on Delta version 1
df_v2_transformed = model_new.transform(df_v2)

# Evaluate using AUC. This is the data the model was fitted on, so the figure
# is a training-set score rather than a held-out estimate.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc_v2 = evaluator.evaluate(df_v2_transformed)
print(f"✅ AUC on Delta version 1 (latest training data): {auc_v2:.4f}")

In [None]:
import mlflow

# Step 4: Load experiment runs and compare AUCs
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name("/VisionaryRetail/ChurnPrediction")
if experiment is None:
    # Fail with a clear message instead of an AttributeError on `None` below.
    raise ValueError("MLflow experiment '/VisionaryRetail/ChurnPrediction' was not found")

# Fetch only the five most recent runs, newest first.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["attributes.start_time DESC"],
    max_results=5,
)

for run in runs:
    run_id = run.info.run_id
    tags = run.data.tags
    metrics = run.data.metrics
    print(f"\n🔍 Run ID: {run_id}")
    # NOTE(review): this tag holds MLflow's raw model-logging history, not a
    # plain model name — confirm it is what should be shown here.
    print(f"Model Name: {tags.get('mlflow.log-model.history', 'N/A')}")
    # Runs in this notebook log their AUC under the key "test_auc".
    print(f"AUC: {metrics.get('test_auc', 'N/A')}")
    print(f"Delta Table Version: {tags.get('delta_version', 'N/A')}")

In [None]:
from pyspark.sql.functions import col

# Read the new campaign customers; header row present, types inferred.
new_data_path = "Downloads/new_campaign_customers.csv"

df_market_camp = spark.read.option("header", "true").option("inferSchema", "true").csv(new_data_path)

# Cast Churn to string only when the column is present. Without the guard,
# the cast raises on files that do not include a Churn column.
if "Churn" in df_market_camp.columns:
    df_market_camp = df_market_camp.withColumn("Churn", col("Churn").cast("string"))

# Write as a new Delta table next to the input data (cwd-relative, like the
# CSV path above) instead of a hard-coded per-user absolute path.
scoring_path = "Downloads/new_campaign_delta"
df_market_camp.write.format("delta").mode("overwrite").save(scoring_path)


In [None]:
# Read the campaign customers back from their Delta table.
df_score = (
    spark.read
    .format("delta")
    .load(scoring_path)
)

# Fetch the registered ChurnPredictorV2 model through its "prod_model" alias.
model_uri = "models:/ChurnPredictorV2@prod_model"
prod_model = mlflow.spark.load_model(model_uri)

# Batch inference over every campaign row.
predictions = prod_model.transform(df_score)

# Display the first ten rows of state, predicted class and class probabilities.
scored_preview = predictions.select("State", "prediction", "probability")
scored_preview.show(10)