In [7]:
import os
import tempfile

import joblib
import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [2]:
# Load the preprocessed comments and discard rows whose cleaned text is missing.
df = (
    pd.read_csv('processed_data.csv')
    .dropna(subset=['clean_comment'])
)

In [3]:
import dagshub

# Initialise the DagsHub client for this repository; mlflow=True requests
# that the MLflow integration be configured as part of the initialisation.
dagshub.init(
    repo_owner='Shubhamraut97',
    repo_name='experemntracking',
    mlflow=True,
)

In [5]:
# Send all subsequent MLflow tracking calls to the DagsHub-hosted server.
mlflow.set_tracking_uri(
    uri="https://dagshub.com/Shubhamraut97/experemntracking.mlflow"
)

In [6]:
# Make "tfidf trigram" the active experiment; every run started below is
# recorded under it. Left as the cell's last expression so the returned
# Experiment object is displayed.
mlflow.set_experiment(experiment_name="tfidf trigram")

2025/07/26 21:46:49 INFO mlflow.tracking.fluent: Experiment with name 'tfidf trigram' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/a6fbfec9be1b46ad940338fcbf9d36c1', creation_time=1753545709938, experiment_id='3', last_update_time=1753545709938, lifecycle_stage='active', name='tfidf trigram', tags={}>

In [None]:

def runexperiment(max_features, ngram_range=(1, 3), n_estimators=200, max_depth=15):
    """Train a random forest on TF-IDF features and log the run to MLflow.

    The module-level ``df`` is split 80/20 with a fixed seed, a
    ``TfidfVectorizer`` is fitted on the training comments only, and a
    ``RandomForestClassifier`` is trained on the resulting matrix. Parameters,
    tags, accuracy, the per-label classification-report metrics, a
    confusion-matrix image and the pickled model are logged to a new MLflow
    run under the currently active experiment.

    Args:
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        ngram_range: ``(min_n, max_n)`` tuple passed to ``TfidfVectorizer``.
            Defaults to ``(1, 3)``.
        n_estimators: Number of trees in the forest. Defaults to 200.
        max_depth: Maximum depth of each tree. Defaults to 15.

    Returns:
        None. Results are reported through MLflow and a summary line on stdout.
    """
    train_text, test_text, y_train, y_test = train_test_split(
        df['clean_comment'], df['category'], test_size=0.2, random_state=42
    )

    # Fit the vocabulary on the training split only so no test-set
    # information leaks into the features.
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    with mlflow.start_run():
        # Log parameters and tags. The hyperparameters are taken from the
        # same variables that configure the model, so the logged values
        # cannot drift from the ones actually used.
        mlflow.log_params({
            "vectorizer": "tfidf",
            "ngram_range": str(ngram_range),
            "max_features": max_features,
            "n_estimators": n_estimators,
            "max_depth": max_depth
        })
        mlflow.set_tags({
            "experiment_type": "vectorizer_tuning",
            "vectorizer": "tfidf"
        })

        # Model training
        model = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Collect every metric first and send them in a single call instead
        # of one request to the tracking server per metric.
        accuracy = accuracy_score(y_test, y_pred)
        metrics_to_log = {"accuracy": accuracy}
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            # Scalar entries (the overall "accuracy") are skipped; only the
            # per-label / averaged dictionaries are flattened.
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    metrics_to_log[f"{label}_{metric_name}"] = value
        mlflow.log_metrics(metrics_to_log)

        # Sorted union of the labels seen in y_test and y_pred, passed
        # explicitly so the heatmap ticks can show the real class labels
        # rather than positional indices.
        class_labels = sorted(set(y_test).union(y_pred))
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)

        # Artifacts are staged in a temporary directory, which is removed
        # even when logging fails, so nothing is left in the working
        # directory. The file base names are unchanged.
        with tempfile.TemporaryDirectory() as artifact_dir:
            # Confusion matrix plot
            plot_path = os.path.join(artifact_dir, "confusion_matrix.png")
            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                sns.heatmap(
                    cm,
                    annot=True,
                    fmt='d',
                    cmap='Blues',
                    xticklabels=class_labels,
                    yticklabels=class_labels,
                    ax=ax,
                )
                ax.set_title('Confusion Matrix')
                ax.set_xlabel('Predicted')
                ax.set_ylabel('Actual')
                fig.savefig(plot_path)
            finally:
                # Always release the figure; this function is called in a loop.
                plt.close(fig)
            mlflow.log_artifact(plot_path)

            # Save and log model
            model_path = os.path.join(artifact_dir, f"rf_model_{max_features}.pkl")
            joblib.dump(model, model_path)
            mlflow.log_artifact(model_path)

        print(f"✅ Completed: max_features={max_features} | Accuracy: {accuracy:.4f}")

# Sweep the TF-IDF vocabulary cap from 1,000 to 10,000 in steps of 1,000,
# launching one tracked run per setting.
max_features_list = list(range(1000, 10001, 1000))
for max_features in max_features_list:
    runexperiment(max_features)


✅ Completed: max_features=1000 | Accuracy: 0.6679
🏃 View run dapper-dog-218 at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/3/runs/c367d71bd10f4cd5a54b93c8c3389e91
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/3
✅ Completed: max_features=2000 | Accuracy: 0.6553
🏃 View run capricious-horse-79 at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/3/runs/953e011f90394b5db5408f0118f92cde
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/3
✅ Completed: max_features=3000 | Accuracy: 0.6525
🏃 View run monumental-frog-727 at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/3/runs/4ec9255dcdc64a10928a3420f74c09a0
🧪 View experiment at: https://dagshub.com/Shubhamraut97/experemntracking.mlflow/#/experiments/3
✅ Completed: max_features=4000 | Accuracy: 0.6475
🏃 View run respected-turtle-316 at: https://dagshub.com/Shubhamraut97/expere