In [1]:
import mlflow
import mlflow.sklearn

In [4]:
import dagshub
dagshub.init(repo_owner='Shubhamraut97', repo_name='experemntracking', mlflow=True)

In [2]:
mlflow.set_tracking_uri("https://dagshub.com/Shubhamraut97/experemntracking.mlflow")

In [None]:
mlflow.set_experiment("bog vs tfidf")

2025/07/26 19:38:37 INFO mlflow.tracking.fluent: Experiment with name 'bog vs tfidf' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/ee4afdf38362468f98c2d0c194ce97e4', creation_time=1753538017319, experiment_id='2', last_update_time=1753538017319, lifecycle_stage='active', name='bog vs tfidf', tags={}>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
df=pd.read_csv('processed_data.csv').dropna(subset=['clean_comment'])

In [8]:
df.shape

(36661, 2)

In [None]:
import os
import joblib
def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name):
    if vectorizer_type == 'tfidf':
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    elif vectorizer_type == 'bow':
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    else:
        raise ValueError("Invalid vectorizer type. Choose 'tfidf' or 'bow'.")

    X = vectorizer.fit_transform(df['clean_comment'])
    y = df['category']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    with mlflow.start_run():
        mlflow.log_param("vectorizer_type", vectorizer_type)
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", vectorizer_max_features)
        mlflow.set_tag("experiment_type", "feature_engineering")
        mlflow.set_tag("model_type", "RandomForestClassifier")
        mlflow.set_tag("vectorizer_name", vectorizer_name)

        n_estimators = 200
        max_depth = 15
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)

        conf_matrix = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(10, 7))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {vectorizer_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plot_path = f'confusion_matrix_{vectorizer_name}.png'
        plt.savefig(plot_path)
        plt.close()
        mlflow.log_artifact(plot_path)
        os.remove(plot_path)

        model_filename = f"{vectorizer_name}_model.pkl"
        joblib.dump(model, model_filename)
        mlflow.log_artifact(model_filename)
        os.remove(model_filename)

        # Optional: save and log processed data
        # data_path = f"processed_data_{vectorizer_name}.csv"
        # df.to_csv(data_path, index=False)
        # mlflow.log_artifact(data_path)
        # os.remove(data_path)

        print(f"✅ Completed: {vectorizer_name} | Accuracy: {accuracy:.4f}")

# Example usage:
n_gram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features = 5000

for ngram_range in n_gram_ranges:
    for vectorizer_type in ['tfidf', 'bow']:
        vec_name = f"{vectorizer_type}_{ngram_range[0]}_{ngram_range[1]}"
        run_experiment(vectorizer_type, ngram_range, max_features, vec_name)

✅ Completed: tfidf_1_1 | Accuracy: 0.6521
🏃 View run righteous-stoat-247 at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/2/runs/4765aafe88da477290e2f6ee3fdaac46
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/2
✅ Completed: bow_1_1 | Accuracy: 0.6513
🏃 View run bustling-toad-17 at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/2/runs/5878eef467804c19aa58f7199c96483e
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/2
✅ Completed: tfidf_1_2 | Accuracy: 0.6557
🏃 View run indecisive-midge-635 at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/2/runs/3cf750109d6c46d2ac1428569884901e
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/2
✅ Completed: bow_1_2 | Accuracy: 0.6542
🏃 View run illustrious-tern-675 at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/2