In [11]:

import dagshub
dagshub.init(repo_owner='Vaibha3246', repo_name='influence_mirror', mlflow=True)

import mlflow
# Step 2: Set up the MLflow tracking server
mlflow.set_tracking_uri("https://dagshub.com/Vaibha3246/influence_mirror.mlflow")

In [7]:
import pandas as pd
df=pd.read_csv('preprocessing.csv').dropna(subset=['text_clean'])

In [8]:
df.head(5)

Unnamed: 0,video_id,category,text,likes,published_at,text_clean,sentiment,sentiment_numeric,word_count,num_stop_words,...,fear,anger,anticipation,trust,surprise,positive,negative,sadness,disgust,joy
0,4wZwXhoxRIA,technology,All products can be found on www.justicebuys.c...,1978,2025-01-04 19:28:08+00:00,product found üôåüèº since review 50+ product per ...,positive,1,24,9,...,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.333333
1,4wZwXhoxRIA,technology,Bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,0,2025-09-10 23:44:24+00:00,bro ‚Äúhow talk woman 6 steps‚Äù relatable,neutral,0,12,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,4wZwXhoxRIA,technology,I was gonna say does it give you the drinks fo...,0,2025-09-09 16:25:03+00:00,wa gonna say doe give drink free?üò≠,positive,1,12,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4wZwXhoxRIA,technology,Anyone gonna talk abt what was o. His pc,0,2025-09-08 22:33:58+00:00,anyone gonna talk abt wa o. pc,neutral,0,9,3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4wZwXhoxRIA,technology,HOW IS EVERYONE NOT TALKING ABOUT HIS SEARCH?!...,0,2025-09-08 12:17:37+00:00,"everyone not talking search?!""how tell ur girl...",positive,1,15,6,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5


In [5]:
# -------------------------------------------------------------
# Experiment 4: Comparing imbalance handling methods for sentiment prediction
# -------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
import scipy.sparse as sp
import numpy as np

In [9]:
df[df['text_clean'].isna()]

Unnamed: 0,video_id,category,text,likes,published_at,text_clean,sentiment,sentiment_numeric,word_count,num_stop_words,...,fear,anger,anticipation,trust,surprise,positive,negative,sadness,disgust,joy


In [10]:
# -----------------------------
# 1Ô∏è‚É£ Prepare numeric + text data
# -----------------------------
numeric_cols = [col for col in df.columns if col not in [
    'video_id', 'category', 'text', 'text_clean', 'sentiment',
    'dominant_emotion', 'published_at', 'sentiment_numeric'
]]

X_numeric = df[numeric_cols]
y = df['sentiment_numeric']

# Scale numeric features
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# Train-test split (keep same split for fairness)
X_train_num, X_test_num, y_train, y_test, train_idx, test_idx = train_test_split(
    X_numeric_scaled, y, df.index, test_size=0.2, random_state=42, stratify=y
)

df_train_text = df.loc[train_idx, 'text_clean']
df_test_text = df.loc[test_idx, 'text_clean']

# -----------------------------
# 2Ô∏è‚É£ Function: Run an experiment for each imbalance technique
# -----------------------------
def run_imbalanced_experiment(imbalance_method):
    ngram_range = (1, 3)
    max_features = 2000  # best from Experiment 3

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    X_train_vec = vectorizer.fit_transform(df_train_text)
    X_test_vec = vectorizer.transform(df_test_text)

    # Combine text + numeric features
    X_train_sparse = sp.hstack([X_train_vec, sp.csr_matrix(X_train_num)])
    X_test_sparse = sp.hstack([X_test_vec, sp.csr_matrix(X_test_num)])

    # -----------------------------
    # Handle imbalance
    # -----------------------------
    class_weight = None
    if imbalance_method == 'class_weights':
        class_weight = 'balanced'
    else:
        if imbalance_method == 'oversampling':
            smote = SMOTE(random_state=42)
            X_train_sparse, y_train_bal = smote.fit_resample(X_train_sparse, y_train)
        elif imbalance_method == 'adasyn':
            adasyn = ADASYN(random_state=42)
            X_train_sparse, y_train_bal = adasyn.fit_resample(X_train_sparse, y_train)
        elif imbalance_method == 'undersampling':
            rus = RandomUnderSampler(random_state=42)
            X_train_sparse, y_train_bal = rus.fit_resample(X_train_sparse, y_train)
        elif imbalance_method == 'smote_enn':
            smote_enn = SMOTEENN(random_state=42)
            X_train_sparse, y_train_bal = smote_enn.fit_resample(X_train_sparse, y_train)
        else:
            y_train_bal = y_train
    if imbalance_method == 'class_weights':
        y_train_bal = y_train  # original labels

    # -----------------------------
    # Train Random Forest
    # -----------------------------
    n_estimators = 100
    max_depth = 15

    with mlflow.start_run() as run:
        run_name = f"Imbalance_{imbalance_method}_TFIDF_2000feat"
        mlflow.set_tag("mlflow.runName", run_name)
        mlflow.set_tag("experiment_type", "imbalance_handling")
        mlflow.set_tag("model_type", "RandomForestClassifier")
        mlflow.set_tag("description", f"TF-IDF (1,3) + numeric features with {imbalance_method}")

        # Log parameters
        mlflow.log_params({
            "vectorizer_type": "TF-IDF",
            "ngram_range": ngram_range,
            "vectorizer_max_features": max_features,
            "imbalance_method": imbalance_method,
            "n_estimators": n_estimators,
            "max_depth": max_depth
        })

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
            class_weight=class_weight,
            n_jobs=-1
        )
        model.fit(X_train_sparse, y_train_bal)

        # -----------------------------
        # Evaluation
        # -----------------------------
        y_pred = model.predict(X_test_sparse)
        accuracy = accuracy_score(y_test, y_pred)
        f1_macro = f1_score(y_test, y_pred, average='macro')
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_macro", f1_macro)

        # Classification report
        report = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in report.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Confusion matrix
        labels = ['Negative (-1)', 'Neutral (0)', 'Positive (1)']
        conf_matrix = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8,6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix ({imbalance_method})")
        fname = f"conf_matrix_{imbalance_method}.png"
        plt.savefig(fname)
        mlflow.log_artifact(fname)
        plt.close()

        # Log model
        mlflow.sklearn.log_model(model, f"random_forest_tfidf_imbalance_{imbalance_method}")

        print(f"‚úÖ {imbalance_method}: Accuracy={accuracy:.3f}, F1={f1_macro:.3f}")


# -----------------------------
# 3Ô∏è‚É£ Run all imbalance methods
# -----------------------------
imbalance_methods = ['class_weights', 'oversampling', 'adasyn', 'undersampling', 'smote_enn']
for method in imbalance_methods:
    run_imbalanced_experiment(method)



‚úÖ class_weights: Accuracy=0.661, F1=0.666
üèÉ View run Imbalance_class_weights_TFIDF_2000feat at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0/runs/3c6a1591c1514d46b81138e304adb155
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0




‚úÖ oversampling: Accuracy=0.663, F1=0.666
üèÉ View run Imbalance_oversampling_TFIDF_2000feat at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0/runs/f57242284886480ea0e9a8c8b9b8efd2
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0




‚úÖ adasyn: Accuracy=0.656, F1=0.655
üèÉ View run Imbalance_adasyn_TFIDF_2000feat at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0/runs/2a1c1c20b6644c889d7191d7cbdd9360
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0




‚úÖ undersampling: Accuracy=0.651, F1=0.650
üèÉ View run Imbalance_undersampling_TFIDF_2000feat at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0/runs/0514dfca577244c5905ac5de7640c744
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0




‚úÖ smote_enn: Accuracy=0.623, F1=0.619
üèÉ View run Imbalance_smote_enn_TFIDF_2000feat at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0/runs/c8f8d9a44309420daac486ad03b9b6b6
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/0
