In [None]:
!pip install mlflow dagshub  lightgbm

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting dagshub
  Downloading dagshub-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting open

In [None]:
import mlflow
import dagshub

# Connect this session to the DagsHub repository that hosts the MLflow server.
# On a fresh runtime this call prints an OAuth link that must be opened in a
# browser to authorize the client.
dagshub.init(
    repo_name='Comment-Analyzer',
    repo_owner='Shrijeet14',
    mlflow=True,
)

# Send every subsequent MLflow call to the repository's tracking endpoint.
mlflow.set_tracking_uri(
    "https://dagshub.com/Shrijeet14/Comment-Analyzer.mlflow/"
)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=2bfefeea-0efe-489d-98ae-cf8d7067fddf&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=eb643f74feb2fbcc53887fae0c158900ec66457170587913c1b3d888413923c9




In [None]:
# Select the experiment that groups the final deployment-model runs
# (MLflow creates it on first use if it does not exist yet).
mlflow.set_experiment(experiment_name="V2_Final_Deployment_Model_Training")

2025/06/22 23:34:16 INFO mlflow.tracking.fluent: Experiment with name 'V2_Final_Deployment_Model_Training' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/3e19fdbdb4b14a2b9eebe3b5c9504940', creation_time=1750635256707, experiment_id='8', last_update_time=1750635256707, lifecycle_stage='active', name='V2_Final_Deployment_Model_Training', tags={}>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from tqdm import tqdm
import mlflow
import mlflow.sklearn
import dagshub
import openai
import joblib
import os
import time
import warnings

# Silence all Python warnings for the rest of the session (keeps the long
# training output readable, at the cost of hiding deprecation notices).
warnings.filterwarnings("ignore")

# Set your OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so that it never
# ends up saved inside the notebook or its outputs.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    # Fail here rather than minutes later inside the embedding loop.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it before starting the kernel, or run "
        "os.environ['OPENAI_API_KEY'] = '<your key>' in a cell that is not saved."
    )

In [None]:
# Load and clean data
# Read the preprocessed Reddit comments and discard every row that has a
# missing value in any column.
dataset = pd.read_csv('/content/reddit_preprocessing.csv')
cleaned_dataset = dataset.dropna(axis=0, how='any')

# Features: the comment texts as a plain Python list (sliced into batches later).
X = cleaned_dataset['clean_comment'].to_list()
# Target: the category label of each remaining row.
y = cleaned_dataset['category']

# Function to get OpenAI embeddings in batch
def get_openai_embeddings_batch(text_batch, model="text-embedding-3-small"):
    """Embed a batch of texts with a single OpenAI embeddings request.

    Args:
        text_batch: The texts to embed, sent together in one API call.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A list with one ``np.ndarray`` per item in the API response, or
        ``None`` if the request or the conversion raised any exception
        (the error is printed, not re-raised).
    """
    try:
        api_result = openai.embeddings.create(input=text_batch, model=model)
        vectors = []
        for row in api_result.data:
            vectors.append(np.array(row.embedding))
        return vectors
    except Exception as err:
        # Best-effort: report the failure and let the caller decide what to do.
        print(" Error during batch embedding:", err)
    return None

# Generate embeddings with batching
batch_size = 100
max_attempts = 3  # tries per batch before giving up
X_embeddings = []

print(" Generating OpenAI embeddings in batches...")
for i in tqdm(range(0, len(X), batch_size), desc="Embedding Batches"):
    batch_texts = X[i:i + batch_size]

    # A batch must never be dropped: X_embeddings has to stay row-aligned with
    # y, otherwise the later train/test split pairs vectors with wrong labels
    # (or fails on mismatched lengths). Retry transient failures instead.
    embeddings = None
    for attempt in range(1, max_attempts + 1):
        embeddings = get_openai_embeddings_batch(batch_texts)
        if embeddings is not None:
            break
        print(f" Batch {i}-{i + batch_size} failed (attempt {attempt}/{max_attempts})")
        if attempt < max_attempts:
            time.sleep(2 ** attempt)  # simple exponential backoff

    if embeddings is None or len(embeddings) != len(batch_texts):
        raise RuntimeError(
            f"Could not embed batch {i}-{i + batch_size}; aborting so that "
            "embeddings and labels do not get out of sync."
        )

    X_embeddings.extend(embeddings)
    time.sleep(0.5)  # Optional: avoid rate limit

X_embeddings = np.array(X_embeddings)
# Sanity check: exactly one embedding per input text.
assert len(X_embeddings) == len(X), "embeddings are not aligned with the input texts"
np.save("openai_embeddings.npy", X_embeddings)

# Train-test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings,
    y,
    random_state=42,
    test_size=0.2,
)

# Define base learners
# LightGBM gradient-boosted trees (GPU build required for device='gpu').
# Class imbalance is handled through class_weight="balanced" only;
# `is_unbalance` is intentionally not set because LightGBM honours it solely
# for binary / multiclassova objectives, not for objective='multiclass'.
lightgbm_model = LGBMClassifier(
    device='gpu',
    boosting_type='gbdt',
    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    class_weight="balanced",
    reg_alpha=0.1,
    reg_lambda=0.1,
    learning_rate=0.08081298097796712,
    n_estimators=367,
    max_depth=20,
    n_jobs=-1,
    verbose=-1
)

# Linear base learner on the embedding vectors.
# No explicit `multi_class`: with the lbfgs solver scikit-learn already fits a
# multinomial model for 3+ classes, and the argument itself is deprecated in
# recent scikit-learn releases.
logreg_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='lbfgs',
    n_jobs=-1
)

# Meta learner (XGBoost with GPU)
# tree_method='hist' together with device='cuda' is the XGBoost >= 2.0 way of
# requesting GPU training; it replaces the deprecated tree_method='gpu_hist'.
# The deprecated `use_label_encoder` flag is no longer passed.
xgb_meta = XGBClassifier(
    tree_method='hist',
    device='cuda',
    eval_metric='mlogloss'
)

# Stacking model
# Two base learners (LightGBM and logistic regression) feed their
# 5-fold cross-validated predictions into the XGBoost meta learner.
stacking_model = StackingClassifier(
    estimators=[
        ('lightgbm', lightgbm_model),
        ('logistic_regression', logreg_model),
    ],
    final_estimator=xgb_meta,
    cv=5,
    n_jobs=-1,
)

# MLflow tracking
# Train the stacking model inside one MLflow run and record its
# hyperparameters, test metrics and all artifacts needed to reproduce it.
with mlflow.start_run(run_name="Stacking_with_OpenAI_Embeddings"):
    # Read hyperparameters from the estimator objects themselves so the logged
    # values cannot drift from (or be rounded versions of) what is trained.
    lightgbm_params = lightgbm_model.get_params()
    logreg_params = logreg_model.get_params()

    mlflow.log_param("embedding_model", "text-embedding-3-small")
    mlflow.log_param("lightgbm_learning_rate", lightgbm_params["learning_rate"])
    mlflow.log_param("lightgbm_n_estimators", lightgbm_params["n_estimators"])
    mlflow.log_param("logreg_max_iter", logreg_params["max_iter"])
    mlflow.log_param("meta_learner", "XGBoost-GPU")

    print("Training Stacking Model...")
    # Single-iteration progress bar: used only to display the elapsed fit time.
    for _ in tqdm(range(1), desc="Fitting model"):
        stacking_model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = stacking_model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score_weighted", f1)

    # Per-class precision / recall / F1 as a text artifact.
    report = classification_report(y_test, y_pred)
    with open("classification_report.txt", "w") as f:
        f.write(report)
    mlflow.log_artifact("classification_report.txt")

    # Persist the fitted model and attach it to the run.
    joblib.dump(stacking_model, "Final_Model.pkl")
    mlflow.log_artifact("Final_Model.pkl")

    # Embeddings file written earlier by the embedding step.
    mlflow.log_artifact("openai_embeddings.npy")

    # Exact dataset (after dropping incomplete rows) used for this run.
    cleaned_dataset.to_csv("used_dataset.csv", index=False)
    mlflow.log_artifact("used_dataset.csv")

print(report)

🚀 Generating OpenAI embeddings in batches...




Embedding Batches:   0%|          | 0/367 [00:00<?, ?it/s][A[A

Embedding Batches:   0%|          | 1/367 [00:01<08:57,  1.47s/it][A[A

Embedding Batches:   1%|          | 2/367 [00:02<08:03,  1.33s/it][A[A

Embedding Batches:   1%|          | 3/367 [00:03<07:44,  1.28s/it][A[A

Embedding Batches:   1%|          | 4/367 [00:05<07:44,  1.28s/it][A[A

Embedding Batches:   1%|▏         | 5/367 [00:06<07:58,  1.32s/it][A[A

Embedding Batches:   2%|▏         | 6/367 [00:07<07:38,  1.27s/it][A[A

Embedding Batches:   2%|▏         | 7/367 [00:09<07:43,  1.29s/it][A[A

Embedding Batches:   2%|▏         | 8/367 [00:10<08:00,  1.34s/it][A[A

Embedding Batches:   2%|▏         | 9/367 [00:11<07:38,  1.28s/it][A[A

Embedding Batches:   3%|▎         | 10/367 [00:12<07:32,  1.27s/it][A[A

Embedding Batches:   3%|▎         | 11/367 [00:14<07:53,  1.33s/it][A[A

Embedding Batches:   3%|▎         | 12/367 [00:16<08:23,  1.42s/it][A[A

Embedding Batches:   4%|▎         | 13/36

Training Stacking Model...


Fitting model: 100%|██████████| 1/1 [33:30<00:00, 2010.89s/it]


🏃 View run Stacking_with_OpenAI_Embeddings at: https://dagshub.com/Shrijeet14/Comment-Analyzer.mlflow/#/experiments/8/runs/80bc4db6cdc346a1a34bc1814ed69407
🧪 View experiment at: https://dagshub.com/Shrijeet14/Comment-Analyzer.mlflow/#/experiments/8
              precision    recall  f1-score   support

          -1       0.63      0.52      0.57      1647
           0       0.76      0.81      0.78      2510
           1       0.75      0.77      0.76      3176

    accuracy                           0.73      7333
   macro avg       0.71      0.70      0.70      7333
weighted avg       0.72      0.73      0.72      7333

