In [5]:
import os
import mlflow
import mlflow.sklearn
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, length

# --- S3/MinIO connection settings for MLflow artifact storage ---
# MLflow uploads model artifacts through Boto3, which reads its credentials and
# endpoint from the process environment.
#
# The credentials are only checked here, never re-assigned: writing
# os.getenv(NAME) back into os.environ[NAME] is a no-op when the variable
# exists and raises "TypeError: str expected, not NoneType" when it does not.
for _credential_var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if not os.getenv(_credential_var):
        # Warn instead of raising: Boto3 may still obtain credentials from
        # another provider, so a missing variable is not necessarily fatal.
        print(
            f"⚠️ {_credential_var} is not set in the environment; "
            "MLflow artifact uploads to MinIO may fail."
        )

# Direct MLflow's S3 client to the MinIO endpoint.
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://minio:9000"
# --- end of S3/MinIO connection settings ---


# Maven coordinates Spark must download at session start-up, keyed by the
# capability each one provides (Kafka source, Delta Lake, S3/Hadoop access).
_package_by_purpose = {
    "kafka": "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0",
    "delta": "io.delta:delta-spark_2.12:3.2.0",
    "hadoop_s3": "org.apache.hadoop:hadoop-aws:3.3.4",
    "aws_sdk": "com.amazonaws:aws-java-sdk-bundle:1.12.262",
}

# Dicts preserve insertion order, so the list order matches the table above.
required_packages = list(_package_by_purpose.values())

# `spark.jars.packages` expects one comma-separated string.
spark_packages = ",".join(required_packages)

# Spark settings applied to the session builder, in order:
#   - the extra JVM packages to fetch,
#   - the Delta Lake SQL extension and catalog implementation.
_spark_settings = {
    "spark.jars.packages": spark_packages,
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Build the session step by step instead of one backslash-continued chain.
_session_builder = SparkSession.builder.appName("HelpfulnessModelTraining")
for _setting_name, _setting_value in _spark_settings.items():
    _session_builder = _session_builder.config(_setting_name, _setting_value)

# getOrCreate() returns the already-running session if one exists.
spark = _session_builder.getOrCreate()

# Send MLflow runs to the tracking server used by this project.
mlflow.set_tracking_uri("http://mlflow:5000")

# Reduce Spark log output to warnings and above to keep cell output readable.
spark.sparkContext.setLogLevel("WARN")

print("✅ Spark Session and MLflow configured for model training.")

✅ Spark Session and MLflow configured for model training.


In [2]:
from pyspark.sql.utils import AnalysisException

# Location of the Silver Delta table that holds the review records.
silver_path = "s3a://silver/amazon_reviews"

# Labeling thresholds, named so the filter, the label rule and the log message
# cannot drift apart.
MIN_HELPFULNESS_VOTES = 5         # minimum votes for a review to be judged at all
HELPFUL_RATIO_THRESHOLD = 0.75    # ratio above which a judged review is "helpful"

# Defaults consulted by the training cell when loading fails.
df_model_data = None
record_count = 0

try:
    print(f"📥 Loading data from Silver Delta table: {silver_path}")
    df_silver = spark.read.format("delta").load(silver_path)

    # --- Feature and Target Engineering using Spark ---

    # 1. Target variable 'is_helpful': 1 when the helpfulness ratio exceeds the
    #    threshold and the review has enough votes, otherwise 0.
    has_enough_votes = col("HelpfulnessDenominator") >= MIN_HELPFULNESS_VOTES
    df_with_target = df_silver.withColumn(
        "is_helpful",
        when((col("helpfulness_ratio") > HELPFUL_RATIO_THRESHOLD) & has_enough_votes, 1)
        .otherwise(0),
    )

    # 2. Simple numeric feature: character length of the review text.
    df_with_features = df_with_target.withColumn("review_length", length(col("Text")))

    # 3. Keep only reviews with enough votes to be judged reliably.
    df_filtered = df_with_features.filter(has_enough_votes)

    # 4. Keep only the columns the model needs.
    df_model_data = df_filtered.select("Text", "review_length", "Score", "is_helpful")

    # cache() is lazy: it only marks the DataFrame for caching. It must be called
    # BEFORE the first action so that count() both reports the size and fills the
    # cache. The previous order (count, then cache) left the cache empty and made
    # the next action re-read the Delta table.
    df_model_data.cache()
    record_count = df_model_data.count()
    print(
        f"📊 Found {record_count} reviews with {MIN_HELPFULNESS_VOTES} or more votes "
        "available for training."
    )

except AnalysisException as e:
    # Raised by Spark when the table path or a referenced column cannot be resolved.
    print(f"❌ ERROR: Could not read from Delta table. {e}")
    print("➡️ Please ensure the streaming ETL notebook has been running for at least 10-15 minutes.")
except Exception as e:
    # Best-effort: report the failure and leave the defaults in place so the
    # training cell can skip itself instead of failing on missing data.
    print(f"An unexpected error occurred: {e}")

📥 Loading data from Silver Delta table: s3a://silver/amazon_reviews
📊 Found 21178 reviews with 5 or more votes available for training.


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Tunable settings for this training run. They are gathered here so the same
# values drive the code and are recorded in MLflow with the run.
MIN_TRAINING_ROWS = 50        # below this, training is skipped
MAX_TRAINING_ROWS = 20000     # approximate cap on rows pulled into pandas
TEST_SIZE = 0.25
RANDOM_SEED = 42
TFIDF_MAX_FEATURES = 5000
TFIDF_NGRAM_RANGE = (1, 2)
LOGREG_C = 0.5
FEATURE_COLUMNS = ['Text', 'review_length', 'Score']
TARGET_COLUMN = 'is_helpful'

# Explicit `is not None` check: relying on the truthiness of a DataFrame object
# is fragile (a pandas DataFrame, for example, raises on bool()).
if df_model_data is not None and record_count > MIN_TRAINING_ROWS:
    print("Sampling data for training to ensure efficiency...")
    # Spark's sample() takes a fraction, so the resulting row count is
    # approximate rather than exactly MAX_TRAINING_ROWS.
    sample_fraction = min(1.0, MAX_TRAINING_ROWS / record_count)
    pandas_df = df_model_data.sample(fraction=sample_fraction, seed=RANDOM_SEED).toPandas()
    print(f"Pandas DataFrame created with {len(pandas_df)} samples.")

    # Rows with missing text or numeric features would make the vectorizer or
    # the classifier fail, so they are removed before splitting.
    training_df = pandas_df.dropna(subset=FEATURE_COLUMNS + [TARGET_COLUMN])
    dropped_rows = len(pandas_df) - len(training_df)
    if dropped_rows:
        print(f"Dropped {dropped_rows} rows with missing values before training.")

    # --- Start ML Workflow ---
    mlflow.set_experiment("Helpfulness_Prediction")

    with mlflow.start_run(run_name="LogisticRegression_TFIDF"):

        # 1. Define Features (X) and Target (y)
        X = training_df[FEATURE_COLUMNS]
        y = training_df[TARGET_COLUMN]

        # 2. Split Data (stratified so both splits keep the class balance)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
        )

        # 3. Preprocessing: TF-IDF on the text column, standard scaling on the
        #    numeric columns; ColumnTransformer concatenates both outputs.
        preprocessor = ColumnTransformer(
            transformers=[
                (
                    'tfidf',
                    TfidfVectorizer(
                        stop_words='english',
                        max_features=TFIDF_MAX_FEATURES,
                        ngram_range=TFIDF_NGRAM_RANGE,
                    ),
                    'Text',
                ),
                ('scaler', StandardScaler(), ['review_length', 'Score']),
            ])

        # 4. Full pipeline: preprocessing followed by the classifier.
        model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(random_state=RANDOM_SEED, class_weight='balanced', C=LOGREG_C))
        ])

        # 5. Train the Model
        print("🚀 Training model pipeline...")
        model_pipeline.fit(X_train, y_train)

        # 6. Evaluate on the held-out split
        print("📈 Evaluating model...")
        y_pred = model_pipeline.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print(f"  Accuracy:  {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall:    {recall:.4f}")
        print(f"  F1-Score:  {f1:.4f}")

        # 7. Log to MLflow
        mlflow.log_param("model_type", "LogisticRegression")
        mlflow.log_param("feature_preprocessor", "TFIDF_and_Scaler")
        # Record the hyperparameters and data sizes too, so the run can be
        # reproduced and compared from the MLflow UI alone.
        mlflow.log_params({
            "C": LOGREG_C,
            "class_weight": "balanced",
            "tfidf_max_features": TFIDF_MAX_FEATURES,
            "tfidf_ngram_range": str(TFIDF_NGRAM_RANGE),
            "test_size": TEST_SIZE,
            "random_seed": RANDOM_SEED,
            "train_rows": len(X_train),
            "test_rows": len(X_test),
        })
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        # Store the fitted pipeline as a run artifact and register it under a
        # fixed model name.
        mlflow.sklearn.log_model(
            sk_model=model_pipeline,
            artifact_path="helpfulness_model",
            registered_model_name="review-helpfulness-classifier"
        )

        print("✅ Model training complete and logged to MLflow.")

else:
    print("⚠️ Not enough data to proceed with model training.")

Sampling data for training to ensure efficiency...
Pandas DataFrame created with 20034 samples.
🚀 Training model pipeline...
📈 Evaluating model...
  Accuracy:  0.7944
  Precision: 0.8613
  Recall:    0.8153
  F1-Score:  0.8377


Successfully registered model 'review-helpfulness-classifier'.
2025/06/19 16:52:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: review-helpfulness-classifier, version 1


✅ Model training complete and logged to MLflow.


Created version '1' of model 'review-helpfulness-classifier'.
