In [1]:
import os

In [None]:
# AWS settings used when MLflow talks to the S3 artifact store.
# Credentials are never written into the notebook: provide AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY through the environment Jupyter is started from (or through
# another credential source supported by the AWS SDK). Values that are already set
# are left untouched instead of being overwritten with blank placeholders.
os.environ['AWS_DEFAULT_REGION'] = 'ap-southeast-2'

_missing_aws_vars = [name
                     for name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
                     if not os.environ.get(name)]
if _missing_aws_vars:
    # Informational only: do not raise, because credentials may legitimately come
    # from a source other than environment variables.
    print(f"Note: {', '.join(_missing_aws_vars)} not set in the environment; "
          "artifact uploads to S3 will need another AWS credential source.")

In [3]:
import mlflow
# Tracking server and experiment that group every run logged by this notebook.
TRACKING_URI = "http://ec2-13-211-98-126.ap-southeast-2.compute.amazonaws.com:5000/"
EXPERIMENT_NAME = "Exp-2_BOW vs TFIDG"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='s3://sentiment-mlops-bucket-s3/269730053283748429', creation_time=1760669506403, experiment_id='269730053283748429', last_update_time=1760669506403, lifecycle_stage='active', name='Exp-2_BOW vs TFIDG', tags={}>

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow.sklearn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the comments dataset from the CSV in the notebook's working directory.
DATASET_PATH = 'cleaned_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [6]:
# Display every row that has a missing value in at least one column.
rows_with_missing = df.isna().any(axis="columns")
df.loc[rows_with_missing]

Unnamed: 0,clean_comment,category


In [7]:
# Spot-check a single record, selected by position.
df.iloc[287]

clean_comment    sonia gandhi acolyte amartya sen independent e...
category                                                         0
Name: 287, dtype: object

In [12]:
def _build_vectorizer(vectorizer_name, ngram_range, vectorizer_max_features):
    """Return an unfitted text vectorizer for the requested scheme.

    Raises:
        ValueError: if ``vectorizer_name`` is neither ``'BoW'`` nor ``'TF-IDF'``.
    """
    if vectorizer_name == 'BoW':
        vectorizer_cls = CountVectorizer
    elif vectorizer_name == 'TF-IDF':
        vectorizer_cls = TfidfVectorizer
    else:
        # Fail fast: silently falling back to TF-IDF would log and register a
        # TF-IDF model under whatever (mistyped) name the caller passed in.
        raise ValueError(
            f"Unknown vectorizer_name {vectorizer_name!r}; expected 'BoW' or 'TF-IDF'"
        )
    return vectorizer_cls(ngram_range=ngram_range,
                          max_features=vectorizer_max_features)


def _log_confusion_matrix(y_true, y_pred, vectorizer_name, ngram_range):
    """Plot the confusion matrix, save it as a PNG and log it to the active run.

    The PNG is written to the current working directory (overwriting a file
    with the same name) and then uploaded with ``mlflow.log_artifact``.
    """
    # Same label set and order scikit-learn would infer on its own; computing it
    # here lets the heatmap ticks show the real class labels, not 0..n-1.
    class_labels = sorted(set(y_true) | set(y_pred))
    conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
    image_path = f"confusion_matrix_{vectorizer_name}_{ngram_range}.png"

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
                    xticklabels=class_labels, yticklabels=class_labels, ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        ax.set_title(f"Confusion matrix : {vectorizer_name}, {ngram_range}")
        fig.savefig(image_path)
        mlflow.log_artifact(image_path)
    finally:
        # Always release the figure, even if saving or uploading fails.
        plt.close(fig)


def run_experiment(ngram_range, vectorizer_max_features, vectorizer_name):
    """Train a random forest on vectorized comments and log the run to MLflow.

    Reads the notebook-level ``df`` (columns ``clean_comment`` and ``category``),
    makes an 80/20 stratified split (``random_state=42``), fits the vectorizer on
    the training split only, trains a ``RandomForestClassifier`` (200 trees,
    ``max_depth=15``) and logs tags, parameters, train/test accuracy, the
    per-label classification report, a confusion-matrix PNG and the model
    (which is also registered in the model registry) to a new MLflow run.

    Args:
        ngram_range: ``(min_n, max_n)`` tuple passed to the vectorizer.
        vectorizer_max_features: vocabulary size cap passed to the vectorizer.
        vectorizer_name: ``'BoW'`` for ``CountVectorizer`` or ``'TF-IDF'`` for
            ``TfidfVectorizer``.

    Raises:
        ValueError: if ``vectorizer_name`` is not one of the supported names.
    """
    vectorizer = _build_vectorizer(vectorizer_name, ngram_range,
                                   vectorizer_max_features)

    X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'],
                                                        df['category'],
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=df['category'])

    # Fit the vocabulary on the training split only so the test split stays unseen.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    with mlflow.start_run(run_name=f"{vectorizer_name}_{ngram_range}_RandomForest"):
        mlflow.set_tag("experiment_type", "feature_engineering")
        mlflow.set_tag("model_type", "RandomForestClassifier")

        mlflow.set_tag("description", f"Random Forest with {vectorizer_name}, ngram_range = {ngram_range}, max_features = {vectorizer_max_features}")

        # Log vectorizer parameters
        mlflow.log_param("vectorizer_type", vectorizer_name)
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", vectorizer_max_features)

        # Set model parameters
        n_estimators = 200
        max_depth = 15

        # Log model parameters
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        model = RandomForestClassifier(n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       random_state=42)
        model.fit(X_train_vec, y_train)

        y_pred_test = model.predict(X_test_vec)
        y_pred_train = model.predict(X_train_vec)

        # scikit-learn metric signature is (y_true, y_pred).
        accuracy_test = accuracy_score(y_test, y_pred_test)
        accuracy_train = accuracy_score(y_train, y_pred_train)

        # Log accuracy
        mlflow.log_metric("accuracy_test", accuracy_test)
        mlflow.log_metric("accuracy_train", accuracy_train)

        # Log classification report; scalar entries (the overall accuracy) are
        # skipped because only the per-label / averaged dicts are expanded.
        classification_rep = classification_report(y_test, y_pred_test, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log confusion matrix
        _log_confusion_matrix(y_test, y_pred_test, vectorizer_name, ngram_range)

        mlflow.sklearn.log_model(model, name = f"random_forest_model_{vectorizer_name}_{ngram_range}", registered_model_name = f"reg_random_forest_model_{vectorizer_name}_{ngram_range}")


# Compare BoW against TF-IDF for every n-gram setting, using one shared vocabulary cap.
# Only (1, 3) is enabled here; the wider sweep would be [(1, 1), (1, 2), (1, 3)].
ngram_ranges = [(1, 3)]
max_features = 5000

for ngram_range in ngram_ranges:
    # BoW first, then TF-IDF, for each n-gram range.
    for vectorizer_name in ("BoW", "TF-IDF"):
        run_experiment(ngram_range, max_features, vectorizer_name)



Successfully registered model 'reg_random_forest_model_BoW_(1, 3)'.
2025/10/23 14:26:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: reg_random_forest_model_BoW_(1, 3), version 1
Created version '1' of model 'reg_random_forest_model_BoW_(1, 3)'.


🏃 View run BoW_(1, 3)_RandomForest at: http://ec2-13-211-98-126.ap-southeast-2.compute.amazonaws.com:5000/#/experiments/269730053283748429/runs/c9d5d66dc6f94c1b85633685d7970037
🧪 View experiment at: http://ec2-13-211-98-126.ap-southeast-2.compute.amazonaws.com:5000/#/experiments/269730053283748429


Successfully registered model 'reg_random_forest_model_TF-IDF_(1, 3)'.
2025/10/23 14:28:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: reg_random_forest_model_TF-IDF_(1, 3), version 1
Created version '1' of model 'reg_random_forest_model_TF-IDF_(1, 3)'.


🏃 View run TF-IDF_(1, 3)_RandomForest at: http://ec2-13-211-98-126.ap-southeast-2.compute.amazonaws.com:5000/#/experiments/269730053283748429/runs/fe3f9c71fd9340619adcf181cecd9104
🧪 View experiment at: http://ec2-13-211-98-126.ap-southeast-2.compute.amazonaws.com:5000/#/experiments/269730053283748429


In [10]:
# Display the installed MLflow version so the environment behind these runs is recorded.
mlflow.__version__

'3.4.0'

In [None]:
# Conclusion: TF-IDF with ngram_range=(1, 2) is the vectorizer of choice.