In [None]:
!pip install mlflow dagshub  lightgbm

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting dagshub
  Downloading dagshub-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting open

In [None]:
import mlflow
import dagshub

# DagsHub repository coordinates, shared by the client setup and the
# tracking URI below.
REPO_OWNER = 'Shrijeet14'
REPO_NAME = 'Comment-Analyzer'

# Initialise the DagsHub client for this repository with its MLflow
# integration switched on.
dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)

# Point MLflow at the repository's tracking endpoint.
mlflow.set_tracking_uri(f"https://dagshub.com/{REPO_OWNER}/{REPO_NAME}.mlflow/")



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=2d4ff6da-5ef9-4039-a76a-cc73603d944c&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=1cfff7f9ee60de05b7dea635884303b967605eab4b91bf88df1b8ffe23c373f5




Output()

In [None]:
# Experiment under which the runs in this notebook are recorded.
EXPERIMENT_NAME = "Final_Deployment_Model_Training"
mlflow.set_experiment(EXPERIMENT_NAME)

2025/06/22 23:17:13 INFO mlflow.tracking.fluent: Experiment with name 'Final_Deployment_Model_Training' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/bfb2fd0f9a4442ec85b24b00009ba74e', creation_time=1750634233478, experiment_id='7', last_update_time=1750634233478, lifecycle_stage='active', name='Final_Deployment_Model_Training', tags={}>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
import mlflow
import mlflow.sklearn
import dagshub
import joblib
from tqdm import tqdm
import warnings
import os

warnings.filterwarnings("ignore")  # Clean output

In [None]:
# Read the preprocessed comments, then discard every row that has a missing
# value in any column.
dataset = pd.read_csv('/content/reddit_preprocessing.csv')
cleaned_dataset = dataset.dropna(axis=0, how='any')

# Model input (comment text) and target (category label).
X, y = cleaned_dataset['clean_comment'], cleaned_dataset['category']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF over 1- to 3-grams, limited to 10,000 features.
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Define base models for the stacking ensemble.

# LightGBM multiclass classifier. Class imbalance is handled through
# class_weight="balanced" only: LightGBM documents `is_unbalance` as applying
# to binary / multiclassova objectives, so it has no effect with
# objective='multiclass' and is not passed.
lightgbm_model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    class_weight="balanced",
    reg_alpha=0.1,   # L1 regularization
    reg_lambda=0.1,  # L2 regularization
    learning_rate=0.08081298097796712,
    n_estimators=367,
    max_depth=20,
    verbose=-1  # No console output; controlled via tqdm
)

# Class-weighted logistic regression. `multi_class` is intentionally omitted:
# the argument is deprecated in recent scikit-learn releases, and the lbfgs
# solver already fits a multinomial model for multiclass targets by default.
logreg_model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')

In [None]:
# Meta learner: k-nearest-neighbours classifier used as the final estimator
# of the stack.
META_N_NEIGHBORS = 5
knn_meta_learner = KNeighborsClassifier(n_neighbors=META_N_NEIGHBORS)

In [None]:
# Assemble the stack: the two base learners feed the KNN final estimator,
# with cv=5 passed to StackingClassifier.
base_estimators = [
    ('lightgbm', lightgbm_model),
    ('logistic_regression', logreg_model),
]
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=knn_meta_learner,
    cv=5,
)

In [None]:
# MLflow tracking: fit the stacking model, evaluate it on the held-out split,
# and record parameters, metrics and artifacts in a single run.
with mlflow.start_run(run_name="Stacking_Model_Run"):
    # Read the hyperparameters from the objects that are actually trained, so
    # the logged values always match the model (e.g. the full learning rate
    # rather than a hand-rounded copy).
    stack_members = dict(stacking_model.estimators)
    lgbm_member = stack_members['lightgbm']
    logreg_member = stack_members['logistic_regression']
    mlflow.log_params({
        "tfidf_ngram_range": tfidf.ngram_range,
        "tfidf_max_features": tfidf.max_features,
        "lightgbm_max_depth": lgbm_member.max_depth,
        "lightgbm_learning_rate": lgbm_member.learning_rate,
        "lightgbm_n_estimators": lgbm_member.n_estimators,
        "logreg_max_iter": logreg_member.max_iter,
        "knn_n_neighbors": stacking_model.final_estimator.n_neighbors,
    })

    # Single-step tqdm bar: it cannot show progress inside fit(), but it
    # reports the total fitting time once training completes.
    print("Training Stacking Model...")
    for _ in tqdm(range(1), desc="Fitting model"):
        stacking_model.fit(X_train_tfidf, y_train)

    # Prediction on the held-out split
    y_pred = stacking_model.predict(X_test_tfidf)

    # Metrics logging
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score_weighted", f1)

    # Save the per-class report as a text artifact (explicit encoding so the
    # file does not depend on the platform default).
    report = classification_report(y_test, y_pred)
    with open("classification_report.txt", "w", encoding="utf-8") as f:
        f.write(report)
    mlflow.log_artifact("classification_report.txt")

    # Persist the fitted vectorizer and model; both are needed together to
    # score new text.
    joblib.dump(tfidf, "tfidf_vectorizer.pkl")
    mlflow.log_artifact("tfidf_vectorizer.pkl")

    joblib.dump(stacking_model, "Final_Model.pkl")
    mlflow.log_artifact("Final_Model.pkl")

    # Log the cleaned dataset that the train/test split was drawn from
    cleaned_dataset.to_csv("used_dataset.csv", index=False)
    mlflow.log_artifact("used_dataset.csv")

print(report)

Training Stacking Model...


Fitting model: 100%|██████████| 1/1 [07:15<00:00, 435.48s/it]


🏃 View run Stacking_Model_Run at: https://dagshub.com/Shrijeet14/Comment-Analyzer.mlflow/#/experiments/7/runs/0a71a91f7fa6422ea93f4a0993d925bf
🧪 View experiment at: https://dagshub.com/Shrijeet14/Comment-Analyzer.mlflow/#/experiments/7
              precision    recall  f1-score   support

          -1       0.79      0.77      0.78      1647
           0       0.86      0.95      0.90      2510
           1       0.90      0.84      0.87      3176

    accuracy                           0.86      7333
   macro avg       0.85      0.86      0.85      7333
weighted avg       0.86      0.86      0.86      7333



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

# Standalone (no experiment tracking) training and evaluation of the stacking
# model: load data, vectorize, fit, and print the classification report.

# Load the dataset
dataset = pd.read_csv('/content/reddit_preprocessing.csv')

# Drop every row that has a NaN in any column (dropna() is not restricted to
# 'clean_comment')
cleaned_dataset = dataset.dropna()

# Separate features and target
X_cleaned = cleaned_dataset['clean_comment']
y_cleaned = cleaned_dataset['category']

# Split the cleaned data into train and test sets (80-20 split)
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

# Apply TfidfVectorizer with trigram setting and max_features=10000
tfidf_cleaned = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

# Fit the vectorizer on the training data and transform both train and test sets
X_train_tfidf_cleaned = tfidf_cleaned.fit_transform(X_train_cleaned)
X_test_tfidf_cleaned = tfidf_cleaned.transform(X_test_cleaned)

# Base learners
# Class imbalance is handled through class_weight="balanced" only: LightGBM
# documents `is_unbalance` as applying to binary / multiclassova objectives,
# so it has no effect with objective='multiclass' and is not passed.
lightgbm_model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    class_weight="balanced",
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=0.1,  # L2 regularization
    learning_rate=0.08081298097796712,
    n_estimators=367,
    max_depth=20,
    verbose=-1  # keep LightGBM's per-fold training log out of the cell output
)

# `multi_class` is intentionally omitted: the argument is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial model
# for multiclass targets by default.
logreg_model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')

# Meta-learner
knn_meta_learner = KNeighborsClassifier(n_neighbors=5)

# Create the StackingClassifier with LightGBM and LogisticRegression as base models, and KNN as meta-learner
stacking_model = StackingClassifier(
    estimators=[
        ('lightgbm', lightgbm_model),
        ('logistic_regression', logreg_model)
    ],
    final_estimator=knn_meta_learner,
    cv=5
)

# Train the stacking model
stacking_model.fit(X_train_tfidf_cleaned, y_train_cleaned)

# Make predictions on the test data
y_pred = stacking_model.predict(X_test_tfidf_cleaned)

# Generate classification report
print(classification_report(y_test_cleaned, y_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.485257 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 131883
[LightGBM] [Info] Number of data points in the train set: 29329, number of used features: 4437
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.843192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 103239
[LightGBM] [Info] Number of data points in the train set: 23463, number of used features: 3613
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.790585 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 103297
[LightGBM] [Info] Number of data points in the train set: 23463, number of used features: 3618
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-cho



              precision    recall  f1-score   support

          -1       0.79      0.78      0.79      1647
           0       0.86      0.95      0.91      2510
           1       0.91      0.84      0.88      3176

    accuracy                           0.87      7333
   macro avg       0.86      0.86      0.86      7333
weighted avg       0.87      0.87      0.87      7333

