In [1]:
!pip install mlflow

Collecting mlflow
  Using cached mlflow-3.4.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.4.0 (from mlflow)
  Using cached mlflow_skinny-3.4.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.4.0 (from mlflow)
  Using cached mlflow_tracing-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting cryptography<46,>=43.0.0 (from mlflow)
  Using cached cryptography-45.0.7-cp311-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting fastmcp<3,>=2.0.0 (from mlflow)
  Using cached fastmcp-2.12.4-py3-none-any.whl.metadata (19 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting cloudpickle<4 (from mlflow-skinny==3.4.0->mlflow)
  Using cached cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.4.0->mlflow)
  Using cached databricks_sdk-0.68.0-py3-

  You can safely remove it manually.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn
import warnings

# Suppress warnings for cleaner notebook output
warnings.filterwarnings("ignore")

# --- 1. Load Data and Preparation ---
file_name = "TMKOC_Cleaned_FeatureEngineered_Final_V3.csv"
df = pd.read_csv(file_name)

print("--- Experiment 4: ML Modeling and Experiment Tracking ---")
print(f"Data shape: {df.shape}")

# Define features (X) and target (y)
# Drop date columns and the cleaned place_matches column as they aren't used directly
X = df.drop(columns=['View_Class', 'Released_on', 'place_matches_cleaned'])
y = df['View_Class']

# Define feature types for the ColumnTransformer
# Numerical features (to be scaled)
numerical_features = ['Text_Length', 'Sentiment_Score', 'Engagement_Ratio', 'Like_Ratio']

# Categorical features (to be one-hot encoded)
# Day_of_Week is treated as categorical (0-6)
categorical_features = ['Day_of_Week']

# Binary/High-Cardinality Binary features (cast_ features - no transformation needed)
binary_features = [col for col in X.columns if col.startswith('cast_')]

# Combine feature lists, ensuring no duplicates
all_features = numerical_features + categorical_features + binary_features
X = X[all_features]

# --- 2. Data Splitting ---
# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train/Test split: {X_train.shape[0]} training samples, {X_test.shape[0]} testing samples.")

# --- 3. Preprocessing Pipeline ---
# Create transformers for scaling and encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep binary (cast) features as is
)

# Apply preprocessing to training and testing data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


# --- 4. MLflow Experiment Setup and Baseline Training ---
mlflow.set_experiment("TMKOC_Popularity_Prediction")

baseline_models = {
    "Gaussian Naive Bayes (GNB)": GaussianNB(),
    "Decision Tree Classifier (DTC)": DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier (RFC)": RandomForestClassifier(random_state=42, n_estimators=100)
}

baseline_results = {}

print("\n--- Training Baseline Models (MLflow Tracking) ---")
for name, model in baseline_models.items():
    with mlflow.start_run(run_name=f"Baseline_{name}") as run:
        print(f"Starting MLflow run for: {name}")

        # Train model
        model.fit(X_train_processed, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test_processed)
        accuracy = accuracy_score(y_test, y_pred)

        # Log parameters and metrics
        mlflow.log_param("model_name", name)
        mlflow.log_metric("accuracy", accuracy)

        # Log the classification report (as an artifact)
        report = classification_report(y_test, y_pred, output_dict=True)
        mlflow.log_dict(report, "classification_report.json")
        
        # Log the model (artifact)
        mlflow.sklearn.log_model(model, "model")

        baseline_results[name] = accuracy
        print(f"Model: {name} | Accuracy: {accuracy:.4f} | MLflow Run ID: {run.info.run_id}")


# --- 5. Hyperparameter Tuning on Best Baseline Model (Random Forest) ---
best_baseline_model_name = max(baseline_results, key=baseline_results.get)
print(f"\n--- Best Baseline Model: {best_baseline_model_name} (Accuracy: {baseline_results[best_baseline_model_name]:.4f}) ---")
print("Starting Hyperparameter Tuning using GridSearchCV on Random Forest...")

# Hyperparameter grid for Random Forest Classifier
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Use the best baseline model (Random Forest)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=0,
    n_jobs=-1
)

# Fit Grid Search
grid_search.fit(X_train_processed, y_train)

# Get the best estimator and predict
best_rf_model = grid_search.best_estimator_
y_pred_tuned = best_rf_model.predict(X_test_processed)
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)

mlflow.set_tracking_uri("http://localhost:5000")

# --- 6. MLflow Tracking for Tuned Model and Final Selection ---
with mlflow.start_run(run_name="Tuned_Random_Forest_Final") as run:
    print(f"\nStarting MLflow run for: Tuned Random Forest")
    
    # Log tuned parameters
    mlflow.log_param("model_name", "Random Forest Tuned")
    mlflow.log_params(grid_search.best_params_)
    
    # Log tuned metric
    mlflow.log_metric("accuracy", tuned_accuracy)
    
    # Log the final classification report
    report_tuned = classification_report(y_test, y_pred_tuned, output_dict=True)
    mlflow.log_dict(report_tuned, "final_classification_report.json")

    # Log the final model artifact
    mlflow.sklearn.log_model(best_rf_model, "final_model_artifact")
    
    print(f"Tuned Random Forest Accuracy: {tuned_accuracy:.4f}")
    print(f"MLflow Run ID for Final Model: {run.info.run_id}")
    print(f"Best Hyperparameters: {grid_search.best_params_}")

# --- 7. Model Selection and Saving (Serialize Preprocessor and Model) ---
# The best model is the Tuned Random Forest
final_model = best_rf_model

# Save the model and preprocessor for deployment
import joblib
joblib.dump(final_model, 'final_tmkoc_classifier.pkl')
joblib.dump(preprocessor, 'final_tmkoc_preprocessor.pkl')

print("\n--- Final Deliverables Saved ---")
print("1. final_tmkoc_classifier.pkl (Best Model)")
print("2. final_tmkoc_preprocessor.pkl (Required for new data inference)")
print("3. All runs logged to MLflow.")


2025/10/15 00:37:13 INFO mlflow.tracking.fluent: Experiment with name 'TMKOC_Popularity_Prediction' does not exist. Creating a new experiment.


--- Experiment 4: ML Modeling and Experiment Tracking ---
Data shape: (1981, 27)
Train/Test split: 1584 training samples, 397 testing samples.

--- Training Baseline Models (MLflow Tracking) ---




Starting MLflow run for: Gaussian Naive Bayes (GNB)




Model: Gaussian Naive Bayes (GNB) | Accuracy: 0.4736 | MLflow Run ID: 9f859edbd0a142adab7b46c93fd7dd6a
Starting MLflow run for: Decision Tree Classifier (DTC)




Model: Decision Tree Classifier (DTC) | Accuracy: 0.4937 | MLflow Run ID: b25fe55f1eaa4f6f88e272c61e523c43
Starting MLflow run for: Random Forest Classifier (RFC)




Model: Random Forest Classifier (RFC) | Accuracy: 0.5567 | MLflow Run ID: 886bffef667c4373a6a8fd9079b1733f

--- Best Baseline Model: Random Forest Classifier (RFC) (Accuracy: 0.5567) ---
Starting Hyperparameter Tuning using GridSearchCV on Random Forest...


RestException: RESOURCE_DOES_NOT_EXIST: Could not find experiment with ID 975151349388830101