# Module 6: PySpark MLlib - Working Demo

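This is a simplified, runnable demonstration of the Module 6 machine learning concepts. All datasets are synthetic and randomly generated, so the reported metrics illustrate the workflow rather than real predictive performance.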

In [None]:
# Setup PySpark MLlib Environment
import time
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.regression import *
from pyspark.ml.clustering import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import *

# Build (or reuse) the Spark session for this demo, with adaptive query
# execution enabled.
spark = (
    SparkSession.builder
    .appName("PySpark-MLlib-Demo")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

# Keep notebook output readable: only WARN and above from Spark's logger.
spark.sparkContext.setLogLevel("WARN")
print(f"✅ Spark MLlib ready! Version: {spark.version}")

In [None]:
# Generate Demo Datasets
#
# Three synthetic DataFrames are built from seeded random columns. Every
# column (including the `churn` and `sales_amount` targets) is drawn from its
# own rand(seed) call, i.e. independently of the other columns.
print("Creating demo datasets...")


def _scaled_rand(seed, span, low, dtype):
    """Return the column ``rand(seed) * span + low`` cast to *dtype*."""
    return (rand(seed) * span + low).cast(dtype)


# Customer dataset for classification (ids 1..1000)
customer_df = (
    spark.range(1, 1001)
    .withColumnRenamed("id", "customer_id")
    .withColumn("age", _scaled_rand(42, 50, 18, "int"))
    .withColumn("monthly_charges", _scaled_rand(44, 80, 20, "decimal(8,2)"))
    .withColumn(
        "contract_type",
        when(rand(46) < 0.5, "Month-to-month").otherwise("One year"),
    )
    .withColumn("churn", when(rand(55) < 0.3, 1).otherwise(0))
)

# Sales dataset for regression (ids 1..1000)
sales_df = (
    spark.range(1, 1001)
    .withColumnRenamed("id", "sale_id")
    .withColumn("store_id", _scaled_rand(56, 10, 1, "int"))
    .withColumn("temperature", _scaled_rand(59, 40, 30, "decimal(5,2)"))
    .withColumn("sales_amount", _scaled_rand(66, 1000, 500, "decimal(10,2)"))
)

# Product dataset for clustering (ids 1..500)
product_df = (
    spark.range(1, 501)
    .withColumnRenamed("id", "product_id")
    .withColumn("price", _scaled_rand(67, 500, 10, "decimal(8,2)"))
    .withColumn("rating", _scaled_rand(68, 4, 1, "decimal(3,2)"))
)

# Each count() runs a Spark job, which also confirms the plans execute.
print(f"✅ Customer dataset: {customer_df.count()} records")
print(f"✅ Sales dataset: {sales_df.count()} records")
print(f"✅ Product dataset: {product_df.count()} records")

In [None]:
# Feature Engineering
#
# Each dataset gets a "features" vector column built by a VectorAssembler and
# is marked for caching, since the later cells reuse it several times.
print("Creating ML-ready feature datasets...")

# Classification input: [age, monthly_charges]; target exposed as "churn_label".
customer_assembler = VectorAssembler(
    inputCols=["age", "monthly_charges"],
    outputCol="features",
)
customers_features = (
    customer_assembler.transform(customer_df)
    .withColumnRenamed("churn", "churn_label")
    .cache()
)

# Regression input: [store_id, temperature]; target exposed as "total_amount".
sales_assembler = VectorAssembler(
    inputCols=["store_id", "temperature"],
    outputCol="features",
)
sales_features = (
    sales_assembler.transform(sales_df)
    .withColumnRenamed("sales_amount", "total_amount")
    .cache()
)

# Clustering input: [price, rating]; no target column.
products_assembler = VectorAssembler(
    inputCols=["price", "rating"],
    outputCol="features",
)
products_features = products_assembler.transform(product_df).cache()

# The count() actions below are the first actions on the cached frames.
print(f"✅ Customer features: {customers_features.count()} records")
print(f"✅ Sales features: {sales_features.count()} records")
print(f"✅ Product features: {products_features.count()} records")
print("\n✅ All ML datasets ready!")

In [None]:
# Classification Algorithms Demo
#
# Fits three classifiers on an 80/20 split of the customer features, scores
# each one on the held-out part with BinaryClassificationEvaluator, and prints
# a comparison table plus the best performer.
#
# NOTE: the notebook's `from pyspark.sql.functions import *` rebinds `round`
# and `max` to Spark column functions. Those raise TypeError when given a
# plain Python float or the `key=` argument, so the Python built-ins are
# accessed explicitly through the `builtins` module here.
import builtins

print("🔥 CLASSIFICATION ALGORITHMS DEMO")
print("=" * 50)

# Prepare data
train_data, test_data = customers_features.randomSplit([0.8, 0.2], seed=42)
train_data.cache()
test_data.cache()

# count() also materializes the caches before any model is timed.
print(f"Training set: {train_data.count()} records")
print(f"Test set: {test_data.count()} records")

# Test multiple algorithms
classifiers = {
    "Logistic Regression": LogisticRegression(featuresCol="features", labelCol="churn_label", maxIter=10),
    "Random Forest": RandomForestClassifier(featuresCol="features", labelCol="churn_label", numTrees=10),
    "Decision Tree": DecisionTreeClassifier(featuresCol="features", labelCol="churn_label", maxDepth=5)
}

evaluator = BinaryClassificationEvaluator(labelCol="churn_label", rawPredictionCol="rawPrediction")
results = []

for name, classifier in classifiers.items():
    print(f"\n🎯 Testing {name}...")

    # Time only the fit so "Training Time" excludes scoring and evaluation.
    start_time = time.perf_counter()
    model = classifier.fit(train_data)
    training_time = time.perf_counter() - start_time

    predictions = model.transform(test_data)
    auc = evaluator.evaluate(predictions)

    results.append({
        "Algorithm": name,
        "AUC": builtins.round(auc, 4),
        "Training Time": builtins.round(training_time, 2)
    })

    print(f"   AUC: {auc:.4f}")
    print(f"   Training Time: {training_time:.2f}s")

# Summary
print("\n📊 CLASSIFICATION RESULTS:")
results_df = spark.createDataFrame(results)
results_df.show(truncate=False)

# Higher AUC is better.
best = builtins.max(results, key=lambda x: x["AUC"])
print(f"🏆 Best performer: {best['Algorithm']} (AUC: {best['AUC']})")

In [None]:
# Regression Algorithms Demo
#
# Fits three regressors on an 80/20 split of the sales features, scores each
# one on the held-out part by RMSE, and prints a comparison table plus the
# best performer.
#
# NOTE: the notebook's `from pyspark.sql.functions import *` rebinds `round`
# and `min` to Spark column functions. Those raise TypeError when given a
# plain Python float or the `key=` argument, so the Python built-ins are
# accessed explicitly through the `builtins` module here.
import builtins

print("\n🔥 REGRESSION ALGORITHMS DEMO")
print("=" * 50)

# Prepare data
sales_train, sales_test = sales_features.randomSplit([0.8, 0.2], seed=42)
sales_train.cache()
sales_test.cache()

# count() also materializes the caches before any model is timed.
print(f"Training set: {sales_train.count()} records")
print(f"Test set: {sales_test.count()} records")

# Test multiple algorithms
regressors = {
    "Linear Regression": LinearRegression(featuresCol="features", labelCol="total_amount", maxIter=10),
    "Random Forest": RandomForestRegressor(featuresCol="features", labelCol="total_amount", numTrees=10),
    "Decision Tree": DecisionTreeRegressor(featuresCol="features", labelCol="total_amount", maxDepth=5)
}

reg_evaluator = RegressionEvaluator(labelCol="total_amount", predictionCol="prediction", metricName="rmse")
reg_results = []

for name, regressor in regressors.items():
    print(f"\n🎯 Testing {name}...")

    # Time only the fit so "Training Time" excludes scoring and evaluation.
    start_time = time.perf_counter()
    model = regressor.fit(sales_train)
    training_time = time.perf_counter() - start_time

    predictions = model.transform(sales_test)
    rmse = reg_evaluator.evaluate(predictions)

    reg_results.append({
        "Algorithm": name,
        "RMSE": builtins.round(rmse, 2),
        "Training Time": builtins.round(training_time, 2)
    })

    print(f"   RMSE: {rmse:.2f}")
    print(f"   Training Time: {training_time:.2f}s")

# Summary
print("\n📊 REGRESSION RESULTS:")
reg_results_df = spark.createDataFrame(reg_results)
reg_results_df.show(truncate=False)

# Lower RMSE is better.
best_reg = builtins.min(reg_results, key=lambda x: x["RMSE"])
print(f"🏆 Best performer: {best_reg['Algorithm']} (RMSE: {best_reg['RMSE']})")

In [None]:
# Clustering Algorithms Demo
#
# Fits K-Means and a Gaussian mixture (k=3 each) on the product features,
# scores each clustering with ClusteringEvaluator, shows the cluster sizes,
# and prints a comparison table plus the best performer.
#
# NOTE: the notebook's `from pyspark.sql.functions import *` rebinds `round`
# and `max` to Spark column functions. Those raise TypeError when given a
# plain Python float or the `key=` argument, so the Python built-ins are
# accessed explicitly through the `builtins` module here.
#
# NOTE(review): the assembled features are the raw `price` and `rating`
# columns, which are generated on very different ranges and are not
# standardized here — consider a StandardScaler stage if both should
# contribute comparably to the distance computation.
import builtins

print("\n🔥 CLUSTERING ALGORITHMS DEMO")
print("=" * 50)

# Test multiple algorithms
clusterers = {
    "K-Means": KMeans(featuresCol="features", k=3, seed=42),
    "Gaussian Mixture": GaussianMixture(featuresCol="features", k=3, seed=42)
}

cluster_evaluator = ClusteringEvaluator(predictionCol="prediction", featuresCol="features")
cluster_results = []

for name, clusterer in clusterers.items():
    print(f"\n🎯 Testing {name}...")

    # Time only the fit so "Training Time" excludes scoring and evaluation.
    start_time = time.perf_counter()
    model = clusterer.fit(products_features)
    training_time = time.perf_counter() - start_time

    # Clustering is unsupervised: assignments are scored on the same data.
    predictions = model.transform(products_features)
    silhouette = cluster_evaluator.evaluate(predictions)

    cluster_results.append({
        "Algorithm": name,
        "Silhouette Score": builtins.round(silhouette, 4),
        "Training Time": builtins.round(training_time, 2)
    })

    print(f"   Silhouette Score: {silhouette:.4f}")
    print(f"   Training Time: {training_time:.2f}s")

    # Show cluster distribution
    cluster_counts = predictions.groupBy("prediction").count().orderBy("prediction")
    print("   Cluster Distribution:")
    cluster_counts.show()

# Summary
print("\n📊 CLUSTERING RESULTS:")
cluster_results_df = spark.createDataFrame(cluster_results)
cluster_results_df.show(truncate=False)

# Higher silhouette score is better.
best_cluster = builtins.max(cluster_results, key=lambda x: x["Silhouette Score"])
print(f"🏆 Best performer: {best_cluster['Algorithm']} (Silhouette: {best_cluster['Silhouette Score']})")

In [None]:
# ML Pipeline Demo
#
# Chains categorical encoding, vector assembly and a logistic regression into
# a single Pipeline, fits it on an 80/20 split of the raw customer data, and
# reports the AUC on the held-out part.
print("\n🔥 ML PIPELINE DEMO")
print("=" * 50)

# Stage 1-2: turn the contract string into an index, then a one-hot vector.
indexer = StringIndexer(
    inputCol="contract_type",
    outputCol="contract_indexed",
)
encoder = OneHotEncoder(
    inputCol="contract_indexed",
    outputCol="contract_encoded",
)

# Stage 3: gather the numeric columns and the encoded contract into one vector.
assembler = VectorAssembler(
    inputCols=["age", "monthly_charges", "contract_encoded"],
    outputCol="pipeline_features",
)

# Stage 4: the estimator, reading the assembled vector and the raw churn label.
classifier = LogisticRegression(
    featuresCol="pipeline_features",
    labelCol="churn",
    maxIter=10,
)

pipeline = Pipeline(stages=[indexer, encoder, assembler, classifier])

print("Pipeline stages:")
for position, stage in enumerate(pipeline.getStages(), start=1):
    print(f"  {position}. {type(stage).__name__}")

# Fit every stage on the training split, then apply the fitted stages to the
# test split.
pipeline_train, pipeline_test = customer_df.randomSplit([0.8, 0.2], seed=42)
pipeline_model = pipeline.fit(pipeline_train)
pipeline_predictions = pipeline_model.transform(pipeline_test)

# Score the held-out predictions.
pipeline_evaluator = BinaryClassificationEvaluator(
    labelCol="churn",
    rawPredictionCol="rawPrediction",
)
pipeline_auc = pipeline_evaluator.evaluate(pipeline_predictions)

print(f"\n✅ Pipeline AUC: {pipeline_auc:.4f}")
print("\n📋 Sample predictions:")
sample_view = pipeline_predictions.select(
    "customer_id",
    "age",
    "contract_type",
    "churn",
    "prediction",
    "probability",
)
sample_view.show(5)

## 🎉 Module 6 Complete!

### ✅ What We Demonstrated:

1. **Feature Engineering**: Vector assembly and data preparation
2. **Classification**: Logistic Regression, Random Forest, Decision Tree
3. **Regression**: Linear Regression, Random Forest, Decision Tree
4. **Clustering**: K-Means, Gaussian Mixture Model
5. **ML Pipelines**: End-to-end workflow automation

### 🚀 Key Takeaways:

- **PySpark MLlib** provides comprehensive machine learning capabilities
- **Feature engineering** is crucial for model performance
- **Multiple algorithms** can be compared easily
- **Pipelines** enable production-ready ML workflows
- **Evaluation metrics** help select the best models

### 📈 Next Steps:

- **Model Tuning**: Hyperparameter optimization
- **Feature Selection**: Advanced feature engineering
- **Model Persistence**: Saving and loading models
- **Production Deployment**: Real-world deployment strategies