## 1. Setup and Dependencies

### 1.1 Install Libraries

In [1]:
# Install necessary libraries for MLflow and S3 artifact storage
%pip install -q mlflow boto3 awscli

Note: you may need to restart the kernel to use updated packages.




### 1.2 Import Libraries

In [2]:
import ast
import os
import tempfile

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

## 2. MLflow Configuration

In [3]:
# Remote MLflow tracking server and the experiment that collects this notebook's runs
TRACKING_URI = "http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/"
EXPERIMENT_NAME = "Experiment 2 - BoW vs TfIdf"

mlflow.set_tracking_uri(TRACKING_URI)

# Select the experiment (created on first use); the returned Experiment is the cell output
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='s3://mlfow-bucket-2025/111723537539582833', creation_time=1762929091542, experiment_id='111723537539582833', last_update_time=1762929091542, lifecycle_stage='active', name='Experiment 2 - BoW vs TfIdf', tags={'mlflow.experimentKind': 'custom_model_development'}>

## 3. Data Loading and Preparation

In [8]:
# Read the preprocessed Reddit comments, then discard rows without any cleaned text
DATA_PATH = '../data/reddit_preprocessing.csv'
raw_df = pd.read_csv(DATA_PATH)
df = raw_df.dropna(subset=['clean_comment'])

# (rows, columns) remaining after the null filter
df.shape

(36662, 2)

In [9]:
# Preview the first five rows of the cleaned dataset
df.head(n=5)

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [10]:
# Count the remaining missing values in every column
null_counts = df.isna().sum()
print(null_counts)


clean_comment    0
category         0
dtype: int64


In [16]:
# Number of comments per sentiment category
class_counts = df['category'].value_counts()
print("Class Distribution:")
print(class_counts)

# Take the label and the count from the same minimum, instead of relying on
# value_counts() happening to sort the rarest class into the last position.
# (On an exact tie idxmin() returns the first of the tied labels.)
minority_class = class_counts.idxmin()
minority_count = class_counts.min()

print(f"\nMinority Class: {minority_class} (Count: {minority_count})")

Class Distribution:
category
 1    15770
 0    12644
-1     8248
Name: count, dtype: int64

Minority Class: -1 (Count: 8248)


## 4. Experiment Function Definition

In [11]:
def _log_confusion_matrix(y_true, y_pred, class_labels, title):
    """Plot a confusion matrix with real class labels and log it to the active MLflow run as confusion_matrix.png."""
    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(
            conf_matrix,
            annot=True,
            fmt="d",
            cmap="Blues",
            # Without explicit tick labels the axes show positional 0/1/2
            # rather than the actual category values.
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax,
        )
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title(title)

        # Write to a throwaway directory so no stray PNG is left in the
        # working directory; the artifact keeps the name confusion_matrix.png.
        with tempfile.TemporaryDirectory() as tmp_dir:
            plot_path = os.path.join(tmp_dir, "confusion_matrix.png")
            fig.savefig(plot_path)
            mlflow.log_artifact(plot_path)
    finally:
        # Always release the figure, even if saving or logging failed
        plt.close(fig)


def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name):
    """Train and evaluate one RandomForest configuration and log it to MLflow.

    The module-level ``df`` is split 80/20 (stratified on ``category``,
    ``random_state=42``), the text is vectorized, a RandomForestClassifier
    (200 trees, max depth 15) is trained, and the parameters, metrics,
    confusion-matrix plot and fitted model are logged to a new MLflow run.

    Args:
        vectorizer_type: ``"BoW"`` selects ``CountVectorizer``; any other value
            selects ``TfidfVectorizer``.
        ngram_range: ``(min_n, max_n)`` tuple passed to the vectorizer.
        vectorizer_max_features: ``max_features`` passed to the vectorizer.
        vectorizer_name: Label used in the run name, description, plot title
            and logged-model name.

    Returns:
        None. Results are logged to MLflow and a summary line is printed.
    """
    # 1. Vectorizer initialization
    if vectorizer_type == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)

    # 2. Data split (fixed seed + stratification => identical split for every run)
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_comment'], df['category'],
        test_size=0.2, random_state=42, stratify=df['category'],
    )

    # 3. Vectorization: fit on the training text only, then apply to the test text
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Random Forest hyperparameters (held constant across all runs)
    n_estimators = 200
    max_depth = 15

    # 4. MLflow run
    with mlflow.start_run():
        mlflow.set_tags({
            "mlflow.runName": f"{vectorizer_name}_{ngram_range}_RandomForest",
            "experiment_type": "feature_engineering",
            "model_type": "RandomForestClassifier",
            "description": f"RandomForest with {vectorizer_name}, ngram_range={ngram_range}, max_features={vectorizer_max_features}",
        })

        # One batched call instead of one request per parameter
        mlflow.log_params({
            "vectorizer_type": vectorizer_type,
            "ngram_range": ngram_range,
            "vectorizer_max_features": vectorizer_max_features,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
        })

        # Train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train_vec, y_train)

        # 5. Predict and collect metrics
        y_pred = model.predict(X_test_vec)
        accuracy = accuracy_score(y_test, y_pred)

        run_metrics = {"accuracy": float(accuracy)}
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, scores in classification_rep.items():
            # Per-class and averaged rows are dicts; the top-level "accuracy"
            # entry is a plain float and is already covered above.
            if isinstance(scores, dict):
                for metric_name, value in scores.items():
                    run_metrics[f"{label}_{metric_name}"] = float(value)
        mlflow.log_metrics(run_metrics)

        # 6. Confusion-matrix plot, labelled with the classes the model was trained on
        _log_confusion_matrix(
            y_test,
            y_pred,
            class_labels=list(model.classes_),
            title=f"Confusion Matrix: {vectorizer_name}, {ngram_range}",
        )

        # 7. Log the model
        # NOTE(review): only the classifier is logged, not the fitted vectorizer,
        # so the stored model expects already-vectorized input.
        mlflow.sklearn.log_model(model, f"random_forest_model_{vectorizer_name}_{ngram_range}")

        print(f"Completed run: {vectorizer_name} with {ngram_range} n-gram range. Accuracy: {accuracy:.4f}")

## 5. Execute Experiments

In [12]:
# Experiment grid: every n-gram range is evaluated with both vectorizers
ngram_ranges = [(1, 1), (1, 2), (1, 3)]  # unigrams; uni+bigrams; uni+bi+trigrams
max_features = 5000  # shared vocabulary cap for all experiments

for ngram_range in ngram_ranges:
    print(f"\n--- Running n-gram range: {ngram_range} ---")

    # Bag-of-Words first, then TF-IDF, for the current n-gram range
    for vectorizer_label in ("BoW", "TF-IDF"):
        run_experiment(vectorizer_label, ngram_range, max_features, vectorizer_name=vectorizer_label)


--- Running n-gram range: (1, 1) ---




Completed run: BoW with (1, 1) n-gram range. Accuracy: 0.6460
üèÉ View run BoW_(1, 1)_RandomForest at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833/runs/b0897df6b2da45e6b73a2bd46ae3ed50
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833




Completed run: TF-IDF with (1, 1) n-gram range. Accuracy: 0.6443
üèÉ View run TF-IDF_(1, 1)_RandomForest at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833/runs/dab936c124754c94b241a329291cfd07
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833

--- Running n-gram range: (1, 2) ---




Completed run: BoW with (1, 2) n-gram range. Accuracy: 0.6486
üèÉ View run BoW_(1, 2)_RandomForest at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833/runs/4782585b428741438797719ec431376b
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833




Completed run: TF-IDF with (1, 2) n-gram range. Accuracy: 0.6529
üèÉ View run TF-IDF_(1, 2)_RandomForest at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833/runs/6418294871a045eda06b2effbbc788ae
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833

--- Running n-gram range: (1, 3) ---




Completed run: BoW with (1, 3) n-gram range. Accuracy: 0.6469
üèÉ View run BoW_(1, 3)_RandomForest at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833/runs/1fdc6c29eed2495abdcce7e308714084
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833




Completed run: TF-IDF with (1, 3) n-gram range. Accuracy: 0.6471
üèÉ View run TF-IDF_(1, 3)_RandomForest at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833/runs/61cffb6d480240ebbf5933598b87e3ec
üß™ View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/111723537539582833


## 6. Conclusion
Review the results in the MLflow UI (using the tracking URI provided) to compare the performance metrics (accuracy, precision, recall, f1-score) logged for each combination of vectorizer type and n-gram range.

In [17]:
# Compare all feature-engineering runs on the metrics of the minority class (-1).

# Column order of the comparison table
COMPARISON_COLUMNS = [
    'Vectorizer', 'N-gram Range', 'Max Features', 'Accuracy (Overall)',
    '-1_Precision', '-1_Recall', '-1_F1-Score', 'Run ID',
]

runs = mlflow.search_runs(
    filter_string="tags.experiment_type = 'feature_engineering'"
)

comparison_data = []

# Build one table row per retrieved run
for _, run in runs.iterrows():
    # MLflow stores parameters as strings; parse the tuple back safely so the
    # label is normalised, falling back to the raw string if it is not a literal.
    ngram_range_str = run['params.ngram_range']
    try:
        ngram_range = ast.literal_eval(ngram_range_str)
    except (ValueError, SyntaxError):
        ngram_range = ngram_range_str

    comparison_data.append({
        'Vectorizer': run['params.vectorizer_type'],
        'N-gram Range': str(ngram_range),
        'Max Features': run['params.vectorizer_max_features'],
        'Accuracy (Overall)': run['metrics.accuracy'],
        # .get() keeps the cell running if a minority-class metric column is absent
        '-1_Precision': run.get('metrics.-1_precision', 0.0),
        '-1_Recall': run.get('metrics.-1_recall', 0.0),
        '-1_F1-Score': run.get('metrics.-1_f1-score', 0.0),
        'Run ID': run['run_id'],
    })

# Passing `columns=` fixes the column order and also yields a well-formed empty
# frame when no runs matched. Selecting the columns from a column-less frame
# raised KeyError before the empty check could ever run.
df_comparison = pd.DataFrame(comparison_data, columns=COMPARISON_COLUMNS)
df_comparison = df_comparison.sort_values(by='-1_F1-Score', ascending=False)

if df_comparison.empty:
    print("\nNo MLflow runs found for 'feature_engineering' experiment type.")
else:
    print("--- Detailed Comparison of Feature Engineering Methods (Sorted by -1_F1-Score) ---")
    print(df_comparison.to_markdown(index=False, floatfmt=".4f"))

    # Rows are sorted by minority-class F1, so the first row is the best run
    best_method_row = df_comparison.iloc[0]
    print("\n" + "="*80)
    print("🥇 Best Feature Engineering Combination (Based on Minority Class F1-Score):")
    print(f"Vectorizer: {best_method_row['Vectorizer']}")
    print(f"N-gram Range: {best_method_row['N-gram Range']}")
    print(f"Max Features: {best_method_row['Max Features']}")
    print(f"F1-Score (Minority Class): {best_method_row['-1_F1-Score']:.4f}")
    print("Rationale: This combination offers the best balance between Precision and Recall for the critical minority class, which is vital for effective comment moderation.")
    print("="*80)

--- Detailed Comparison of Feature Engineering Methods (Sorted by -1_F1-Score) ---
| Vectorizer   | N-gram Range   |   Max Features |   Accuracy (Overall) |   -1_Precision |   -1_Recall |   -1_F1-Score | Run ID                           |
|:-------------|:---------------|---------------:|---------------------:|---------------:|------------:|--------------:|:---------------------------------|
| TF-IDF       | (1, 2)         |           5000 |               0.6529 |         0.9216 |      0.0285 |        0.0553 | 6418294871a045eda06b2effbbc788ae |
| BoW          | (1, 2)         |           5000 |               0.6486 |         0.9737 |      0.0224 |        0.0438 | 4782585b428741438797719ec431376b |
| TF-IDF       | (1, 3)         |           5000 |               0.6471 |         0.9714 |      0.0206 |        0.0404 | 61cffb6d480240ebbf5933598b87e3ec |
| BoW          | (1, 3)         |           5000 |               0.6469 |         0.9643 |      0.0164 |        0.0322 | 1fdc6c29eed2495a

### 📝 Experiment Summary & Recommendations
---

#### 🥇 Best Balanced Method
- **Technique:** Undersampling  
- **F1-Score (Minority Class):** 0.5257  
- **Recall (Minority Class):** 0.4733  
- **Rationale:** Undersampling provides the highest F1-Score (best balance between Precision and Recall), making it the most reliable technique for generalized comment analysis, minimizing both missed toxic comments and false alarms.

---

#### 🥇 Best Feature Engineering Combination (Based on Minority Class F1-Score)
- **Vectorizer:** TF-IDF  
- **N-gram Range:** (1, 2)  
- **Max Features:** 5000  
- **F1-Score (Minority Class):** 0.0553  
- **Rationale:** This combination achieves the highest minority-class F1-Score of the tested vectorizer / n-gram combinations, which is vital for effective comment moderation. The absolute score is still very low, however: recall for class -1 is only 0.0285 (precision 0.9216), so most minority-class comments are missed.

#### 🥇 Optimal Feature Tuning (Based on Minority Class F1-Score)
- **Vectorization Method:** TF-IDF Bigrams  
- **Optimal Max Features:** 1000  
- **F1-Score (Minority Class):** 0.2496  
- **Rationale:** This feature count maximizes the model's ability to distinguish the critical minority class.

---



