In [None]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.0 (from mlflow)
  Downloading mlflow_skinny-2.17.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.0->mlflow)
  Downloading databricks_sdk-0.35.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Collect

In [None]:
# Read the DagsHub personal access token through Colab's `userdata` store
# (entry name 'DAGSHUB_PAT') so the secret is not written into the notebook.
from google.colab import userdata
token=userdata.get('DAGSHUB_PAT')

In [None]:
import os
import mlflow


# MLflow reads these two variables for HTTP basic authentication against the
# tracking server; the access token is used for both fields.
os.environ["MLFLOW_TRACKING_USERNAME"] = token
os.environ["MLFLOW_TRACKING_PASSWORD"] = token

dagshub_url = "https://dagshub.com"
repo_owner = "Sharad-18"
# Repository whose MLflow server receives the runs (previously a stale,
# unused value that did not match the tracking URI actually in use).
repo_name = "Youtube-comment-analysis-Plugin"

# Set up MLflow tracking URI, built from the parts above so they cannot drift apart
mlflow.set_tracking_uri(f"{dagshub_url}/{repo_owner}/{repo_name}.mlflow")



In [None]:
# Select the experiment that subsequent runs are logged under; MLflow creates it
# when no experiment with this name exists yet.
mlflow.set_experiment("Exp 2 - BoW vs TfIdf")

2024/10/22 06:51:30 INFO mlflow.tracking.fluent: Experiment with name 'Exp 2 - BoW vs TfIdf' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1238d60516be48af84e858d4665a560e', creation_time=1729579890384, experiment_id='2', last_update_time=1729579890384, lifecycle_stage='active', name='Exp 2 - BoW vs TfIdf', tags={}>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import numpy as np


In [None]:
# Load the preprocessed comment dataset from the Colab filesystem.
df = pd.read_csv('/content/reddit_preprocessing (1).csv')
# Bare last expression: shown as the cell output, as (rows, columns).
df.shape

(36793, 2)

In [None]:
# Drop rows whose 'clean_comment' is missing so every remaining row has text to vectorize.
df=df.dropna(subset=['clean_comment'])

In [None]:
# Shape after removing the rows with a missing 'clean_comment'.
df.shape

(36662, 2)

In [None]:
from re import X
def run_experiment(vectorizer_type,ngram_range,vectorizer_max_feayures,vectorizer_name):
  """Train a RandomForest on BoW or TF-IDF features and log the run to MLflow.

  Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
  makes a stratified 80/20 split, fits the vectorizer on the training part only
  and logs tags, parameters, accuracy, per-label classification-report metrics,
  a confusion-matrix image and the fitted model in a new MLflow run.

  Args:
    vectorizer_type: "Bow" (matched case-insensitively) selects CountVectorizer;
      any other value selects TfidfVectorizer.
    ngram_range: (min_n, max_n) tuple forwarded to the vectorizer.
    vectorizer_max_feayures: vocabulary cap forwarded as ``max_features``. The
      misspelled name is kept so existing keyword callers keep working.
    vectorizer_name: label used in the run name, plot title and model artifact path.
  """
  # Case-insensitive so that "BoW" no longer falls through to TF-IDF silently.
  use_bow = str(vectorizer_type).lower() == "bow"
  if use_bow:
    vectorizer = CountVectorizer(max_features=vectorizer_max_feayures, ngram_range=ngram_range)
  else:
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_feayures)

  X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'],
    test_size=0.2, random_state=42, stratify=df['category'])
  # Fit on the training split only; the test split is transformed with the learned vocabulary.
  X_train = vectorizer.fit_transform(X_train)
  X_test = vectorizer.transform(X_test)

  with mlflow.start_run():
    # Run name is left unchanged so new runs line up with the ones already logged.
    mlflow.set_tag("mlflow.runName", f"{vectorizer_name}_{ngram_range}_RandomFOrest")
    mlflow.set_tag("experiment_type", "feature_engineering")
    mlflow.set_tag("model_type", "RandomForest")

    # Record the feature-extraction settings so runs can be compared in the MLflow UI.
    mlflow.log_param("vectorizer_type", "BoW" if use_bow else "TF-IDF")
    mlflow.log_param("ngram_range", ngram_range)
    mlflow.log_param("vectorizer_max_features", vectorizer_max_feayures)

    n_estimators = 200
    max_depth = 15

    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)

    # Fixed seed: without it the forest differs between runs, so differences in
    # metrics could not be attributed to the vectorizer alone.
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred=y_pred)
    mlflow.log_metric("accuracy", accuracy)

    # Only the dict-valued report entries (per-label and averaged rows) are logged;
    # the scalar "accuracy" entry is skipped because it is logged above.
    classification_rep = classification_report(y_test, y_pred=y_pred, output_dict=True)
    for label, metrics in classification_rep.items():
      if isinstance(metrics, dict):
        for metric, value in metrics.items():
          mlflow.log_metric(f"{label}_{metric}", value)

    conf_metrix = confusion_matrix(y_test, y_pred)
    fig = plt.figure(figsize=(8, 6))
    sns.heatmap(conf_metrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix: {vectorizer_name}, {ngram_range}")
    plt.savefig("confusion_matrix.png")
    # Close right after saving so the figure is released even if the upload fails.
    plt.close(fig)
    mlflow.log_artifact("confusion_matrix.png")

    mlflow.sklearn.log_model(model, f"random_forest_model_{vectorizer_name}_{ngram_range}")
# Sweep unigram, bigram and trigram ranges; for each range train a BoW run, then a TF-IDF run.
ngram_ranges=[(1,1),(1,2),(1,3)]
max_features=5000
# (vectorizer_type, vectorizer_name) pairs, in the order they are run.
vectorizer_variants=(("Bow","BoW"),("TfIdf","TfIdf"))
for ngram_range in ngram_ranges:
  for vectorizer_type,vectorizer_name in vectorizer_variants:
    run_experiment(vectorizer_type,ngram_range,max_features,vectorizer_name)

2024/10/22 07:49:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run BoW_(1, 1)_RandomFOrest at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/2/runs/94b6812f05e94a2b8d9913c7c1bba4a9.
2024/10/22 07:49:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/2.
2024/10/22 07:50:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run TfIdf_(1, 1)_RandomFOrest at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/2/runs/724eecf8aabe43f193e3282feceeaf96.
2024/10/22 07:50:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/2.
2024/10/22 07:50:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run BoW_(1, 2)_RandomFOrest at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments

In [None]:
# Reload the preprocessed comments and drop rows with no cleaned text.
df = pd.read_csv('/content/reddit_preprocessing (1).csv').dropna(subset=['clean_comment'])
# Kept as the final expression so the shape is actually displayed; a bare
# expression in the middle of a cell is evaluated and discarded.
df.shape

(36793, 2)

In [None]:
# Same dropna as in the preceding cell; it removes nothing further once that cell has run.
df=df.dropna(subset=['clean_comment'])

In [None]:
# Experiment runner: TF-IDF (1-3)-gram features at a configurable vocabulary size
def run_experiment_tfidf_max_features(max_features):
    """Train and log a RandomForest on TF-IDF (1, 3)-gram features capped at ``max_features``.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
    makes a stratified 80/20 split, fits the vectorizer on the training part only
    and records tags, parameters, accuracy, per-label classification-report
    metrics, a confusion-matrix image and the fitted model in a new MLflow run.
    """
    ngram_range = (1, 3)
    n_estimators = 200
    max_depth = 15

    # Hold out 20% of the comments, stratified by category.
    text_train, text_test, y_train, y_test = train_test_split(
        df['clean_comment'],
        df['category'],
        test_size=0.2,
        random_state=42,
        stratify=df['category'],
    )

    # The vocabulary is learned from the training text only and reused on the test text.
    tfidf = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    features_train = tfidf.fit_transform(text_train)
    features_test = tfidf.transform(text_test)

    with mlflow.start_run():
        # Tags identifying and describing the run
        run_tags = {
            "mlflow.runName": f"TFIDF_Trigrams_max_features_{max_features}",
            "experiment_type": "feature_engineering",
            "model_type": "RandomForestClassifier",
            "description": f"RandomForest with TF-IDF Trigrams, max_features={max_features}",
        }
        for tag_key, tag_value in run_tags.items():
            mlflow.set_tag(tag_key, tag_value)

        # Vectorizer settings first, then the Random Forest settings
        run_params = {
            "vectorizer_type": "TF-IDF",
            "ngram_range": ngram_range,
            "vectorizer_max_features": max_features,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
        }
        for param_key, param_value in run_params.items():
            mlflow.log_param(param_key, param_value)

        # Fit the forest and predict on the held-out split
        forest = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
        )
        forest.fit(features_train, y_train)
        predictions = forest.predict(features_test)

        mlflow.log_metric("accuracy", accuracy_score(y_test, predictions))

        # Log every dict-valued row of the report; scalar entries are skipped.
        report = classification_report(y_test, predictions, output_dict=True)
        report_rows = (
            (row_label, row_scores)
            for row_label, row_scores in report.items()
            if isinstance(row_scores, dict)
        )
        for row_label, row_scores in report_rows:
            for score_name, score_value in row_scores.items():
                mlflow.log_metric(f"{row_label}_{score_name}", score_value)

        # Confusion matrix rendered as a heatmap and uploaded as an artifact
        matrix = confusion_matrix(y_test, predictions)
        fig = plt.figure(figsize=(8, 6))
        sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix: TF-IDF Trigrams, max_features={max_features}")
        fig.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close(fig)

        # Persist the fitted model with the run
        mlflow.sklearn.log_model(forest, f"random_forest_model_tfidf_trigrams_{max_features}")

# Step 6: Test various max_features values
# Give this sweep its own experiment. Without this call the runs are filed under
# whichever experiment was selected last, i.e. "Exp 2 - BoW vs TfIdf".
mlflow.set_experiment("Exp 3 - TfIdf Trigram max_features")

# 1000, 2000, ..., 10000
max_features_values = list(range(1000, 10001, 1000))

for max_features in max_features_values:
    run_experiment_tfidf_max_features(max_features)

2024/10/22 08:03:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run TFIDF_Trigrams_max_features_1000 at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/2/runs/dadeb6f92aea4a54841dd690a3b07c62.
2024/10/22 08:03:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/2.
2024/10/22 08:04:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run TFIDF_Trigrams_max_features_2000 at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/2/runs/00a74e3094ed4f68a8264d59e0d53349.
2024/10/22 08:04:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/2.
2024/10/22 08:04:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run TFIDF_Trigrams_max_features_3000 at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Pl

In [None]:
# Reload the preprocessed comments and drop rows with no cleaned text.
df = pd.read_csv('/content/reddit_preprocessing (1).csv').dropna(subset=['clean_comment'])
# Kept as the final expression so the shape is actually displayed; a bare
# expression in the middle of a cell is evaluated and discarded.
df.shape

In [None]:
# Switch logging to the imbalance-handling experiment; MLflow creates it when no
# experiment with this name exists yet.
mlflow.set_experiment("Exp 4 - Handling Imbalanced Data")

2024/10/22 08:34:23 INFO mlflow.tracking.fluent: Experiment with name 'Exp 4 - Handling Imbalanced Data' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/eb17a0cc395a4800a41ee125a39c49d2', creation_time=1729586063668, experiment_id='3', last_update_time=1729586063668, lifecycle_stage='active', name='Exp 4 - Handling Imbalanced Data', tags={}>

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [None]:
# Step 1: Function to run the experiment
def run_imbalanced_experiment(imbalance_method):
    """Train and log a RandomForest on TF-IDF trigrams with one imbalance-handling strategy.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``).
    Resampling is applied to the vectorized training split only; the test split
    is left untouched.

    Args:
        imbalance_method: one of
            'class_weights' (``class_weight='balanced'`` on the forest, no resampling),
            'oversampling' (SMOTE), 'adasyn' (ADASYN),
            'undersampling' (RandomUnderSampler) or 'smote_enn' (SMOTEENN).

    Raises:
        ValueError: if ``imbalance_method`` is not one of the names above.
    """
    supported_methods = ('class_weights', 'oversampling', 'adasyn', 'undersampling', 'smote_enn')
    # Fail fast: an unrecognised name used to skip every branch and log an
    # unweighted, un-resampled model under that (misleading) method name.
    if imbalance_method not in supported_methods:
        raise ValueError(
            f"Unknown imbalance_method {imbalance_method!r}; expected one of {supported_methods}"
        )

    ngram_range = (1, 3)  # Trigram setting
    max_features = 10000  # Vocabulary cap for TF-IDF

    # Step 2: Train-test split before vectorization and resampling
    X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category'])

    # Step 3: Vectorization using TF-IDF, fit on training data only
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)  # Fit on training data
    X_test_vec = vectorizer.transform(X_test)  # Transform test data

    # Step 4: Handle class imbalance based on the selected method (training set only)
    if imbalance_method == 'class_weights':
        # Re-weight classes inside the Random Forest instead of resampling
        class_weight = 'balanced'
    else:
        class_weight = None  # Do not apply class_weight if using resampling

        # Method name -> resampler class; each is built with the same fixed seed.
        resamplers = {
            'oversampling': SMOTE,
            'adasyn': ADASYN,
            'undersampling': RandomUnderSampler,
            'smote_enn': SMOTEENN,
        }
        resampler = resamplers[imbalance_method](random_state=42)
        X_train_vec, y_train = resampler.fit_resample(X_train_vec, y_train)

    # Step 5: Define and train a Random Forest model
    with mlflow.start_run():
        # Set tags for the experiment and run
        mlflow.set_tag("mlflow.runName", f"Imbalance_{imbalance_method}_RandomForest_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "imbalance_handling")
        mlflow.set_tag("model_type", "RandomForestClassifier")

        # Add a description
        mlflow.set_tag("description", f"RandomForest with TF-IDF Trigrams, imbalance handling method={imbalance_method}")

        # Log vectorizer parameters
        mlflow.log_param("vectorizer_type", "TF-IDF")
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", max_features)

        # Log Random Forest parameters
        n_estimators = 200
        max_depth = 15

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("imbalance_method", imbalance_method)

        # Initialize and train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42, class_weight=class_weight)
        model.fit(X_train_vec, y_train)

        # Step 6: Make predictions and log metrics
        y_pred = model.predict(X_test_vec)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report (dict-valued rows only; scalar entries are skipped)
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred)
        fig = plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix: TF-IDF Trigrams, Imbalance={imbalance_method}")
        confusion_matrix_filename = f"confusion_matrix_{imbalance_method}.png"
        plt.savefig(confusion_matrix_filename)
        # Close right after saving so the figure is released even if the upload fails.
        plt.close(fig)
        mlflow.log_artifact(confusion_matrix_filename)

        # Log the model
        mlflow.sklearn.log_model(model, f"random_forest_model_tfidf_trigrams_imbalance_{imbalance_method}")

# Step 7: One MLflow run per imbalance-handling strategy, executed in list order
imbalance_methods = [
    'class_weights',
    'oversampling',
    'adasyn',
    'undersampling',
    'smote_enn',
]

for method in imbalance_methods:
    run_imbalanced_experiment(method)


2024/10/22 08:36:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run Imbalance_class_weights_RandomForest_TFIDF_Trigrams at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/3/runs/914a951350ef4849891d5210fd716b6d.
2024/10/22 08:36:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/3.
2024/10/22 08:36:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run Imbalance_oversampling_RandomForest_TFIDF_Trigrams at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/3/runs/a29ff5250b2b4f89a09c944fa538b7df.
2024/10/22 08:36:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Sharad-18/Youtube-comment-analysis-Plugin.mlflow/#/experiments/3.
2024/10/22 08:37:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run Imbalance_adasyn_RandomForest_TFIDF_Trigrams at: https://