In [1]:

import dagshub
dagshub.init(repo_owner='Vaibha3246', repo_name='influence_mirror', mlflow=True)

import mlflow
# Step 2: Set up the MLflow tracking server
mlflow.set_tracking_uri("https://dagshub.com/Vaibha3246/influence_mirror.mlflow")

In [2]:
# Set or create an experiment
mlflow.set_experiment("Exp 2 - BoW vs TfIdf")

<Experiment: artifact_location='mlflow-artifacts:/732fc90643ac4e13b5fe973b9adae3f9', creation_time=1758873744125, experiment_id='1', last_update_time=1758873744125, lifecycle_stage='active', name='Exp 2 - BoW vs TfIdf', tags={}>

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

In [4]:
df=pd.read_csv('reddit_preprocessing.csv')

In [5]:
df.head()

Unnamed: 0,video_id,category,text,likes,published_at,text_clean,sentiment,sentiment_numeric,word_count,num_stop_words,...,weekday,month,category_gaming,category_movies,category_music,category_technology,sent_neg,sent_neu,sent_pos,sent_compound
0,4wZwXhoxRIA,technology,All products can be found on www.justicebuys.c...,1978,2025-01-04 19:28:08+00:00,all product can be found on üôåüèº since i review ...,positive,1,24,9,...,5,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,4wZwXhoxRIA,technology,Bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,0,2025-09-10 23:44:24+00:00,bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,neutral,0,12,5,...,2,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,4wZwXhoxRIA,technology,I was gonna say does it give you the drinks fo...,0,2025-09-09 16:25:03+00:00,i wa gonna say doe it give you the drink for f...,positive,1,12,7,...,1,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,4wZwXhoxRIA,technology,Anyone gonna talk abt what was o. His pc,0,2025-09-08 22:33:58+00:00,anyone gonna talk abt what wa o. his pc,neutral,0,9,3,...,0,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,4wZwXhoxRIA,technology,HOW IS EVERYONE NOT TALKING ABOUT HIS SEARCH?!...,0,2025-09-08 12:17:37+00:00,how is everyone not talking about his search?!...,positive,1,15,6,...,0,9,0.0,0.0,0.0,1.0,0.0,0.743,0.257,0.6696


In [6]:
df['text_clean'].isna().sum()

np.int64(0)

In [7]:
#   Add Emotion Features (NRCLex)

!pip install nrclex





[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
from nrclex import NRCLex

def get_dominant_emotion(text):
    emotions = NRCLex(text).top_emotions
    return emotions[0][0] if emotions else "neutral"

df['dominant_emotion'] = df['text_clean'].apply(get_dominant_emotion)


In [9]:
df.head()

Unnamed: 0,video_id,category,text,likes,published_at,text_clean,sentiment,sentiment_numeric,word_count,num_stop_words,...,month,category_gaming,category_movies,category_music,category_technology,sent_neg,sent_neu,sent_pos,sent_compound,dominant_emotion
0,4wZwXhoxRIA,technology,All products can be found on www.justicebuys.c...,1978,2025-01-04 19:28:08+00:00,all product can be found on üôåüèº since i review ...,positive,1,24,9,...,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,trust
1,4wZwXhoxRIA,technology,Bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,0,2025-09-10 23:44:24+00:00,bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,neutral,0,12,5,...,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,positive
2,4wZwXhoxRIA,technology,I was gonna say does it give you the drinks fo...,0,2025-09-09 16:25:03+00:00,i wa gonna say doe it give you the drink for f...,positive,1,12,7,...,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,fear
3,4wZwXhoxRIA,technology,Anyone gonna talk abt what was o. His pc,0,2025-09-08 22:33:58+00:00,anyone gonna talk abt what wa o. his pc,neutral,0,9,3,...,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,positive
4,4wZwXhoxRIA,technology,HOW IS EVERYONE NOT TALKING ABOUT HIS SEARCH?!...,0,2025-09-08 12:17:37+00:00,how is everyone not talking about his search?!...,positive,1,15,6,...,9,0.0,0.0,0.0,1.0,0.0,0.743,0.257,0.6696,positive


In [10]:
from nrclex import NRCLex

def get_emotion_scores(text):
    """
    Returns a dictionary of emotion probabilities for the text
    Example: {"anger": 0.1, "joy": 0.3, ...}
    """
    emotions = NRCLex(text)
    raw_scores = emotions.raw_emotion_scores  # counts of emotion words
    total = sum(raw_scores.values())
    
    if total == 0:  # no emotions found
        return {emotion: 0 for emotion in [
            'fear', 'anger', 'anticipation', 'trust', 
            'surprise', 'positive', 'negative', 
            'sadness', 'disgust', 'joy'
        ]}
    
    # Normalize counts into probabilities
    probs = {emotion: count/total for emotion, count in raw_scores.items()}
    
    # Ensure all emotions appear (fill missing with 0)
    all_emotions = {
        'fear': 0, 'anger': 0, 'anticipation': 0, 'trust': 0, 
        'surprise': 0, 'positive': 0, 'negative': 0, 
        'sadness': 0, 'disgust': 0, 'joy': 0
    }
    all_emotions.update(probs)
    
    return all_emotions


In [11]:
# Apply to your text column
emotion_features = df['text_clean'].apply(get_emotion_scores)

# Convert list of dicts into DataFrame
emotion_df = pd.DataFrame(list(emotion_features))

# Merge with original dataframe
df = pd.concat([df, emotion_df], axis=1)

df.head()


Unnamed: 0,video_id,category,text,likes,published_at,text_clean,sentiment,sentiment_numeric,word_count,num_stop_words,...,fear,anger,anticipation,trust,surprise,positive,negative,sadness,disgust,joy
0,4wZwXhoxRIA,technology,All products can be found on www.justicebuys.c...,1978,2025-01-04 19:28:08+00:00,all product can be found on üôåüèº since i review ...,positive,1,24,9,...,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.333333
1,4wZwXhoxRIA,technology,Bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,0,2025-09-10 23:44:24+00:00,bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,neutral,0,12,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,4wZwXhoxRIA,technology,I was gonna say does it give you the drinks fo...,0,2025-09-09 16:25:03+00:00,i wa gonna say doe it give you the drink for f...,positive,1,12,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4wZwXhoxRIA,technology,Anyone gonna talk abt what was o. His pc,0,2025-09-08 22:33:58+00:00,anyone gonna talk abt what wa o. his pc,neutral,0,9,3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4wZwXhoxRIA,technology,HOW IS EVERYONE NOT TALKING ABOUT HIS SEARCH?!...,0,2025-09-08 12:17:37+00:00,how is everyone not talking about his search?!...,positive,1,15,6,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49533 entries, 0 to 49532
Data columns (total 34 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   video_id               49533 non-null  object 
 1   category               49533 non-null  object 
 2   text                   49533 non-null  object 
 3   likes                  49533 non-null  int64  
 4   published_at           49533 non-null  object 
 5   text_clean             49533 non-null  object 
 6   sentiment              49533 non-null  object 
 7   sentiment_numeric      49533 non-null  int64  
 8   word_count             49533 non-null  int64  
 9   num_stop_words         49533 non-null  int64  
 10  num_punctuation_chars  49533 non-null  int64  
 11  num_chars              49533 non-null  int64  
 12  hour                   49533 non-null  int64  
 13  weekday                49533 non-null  int64  
 14  month                  49533 non-null  int64  
 15  ca

In [13]:
from nltk.corpus import stopwords

# Defining stop words but keeping essential ones for sentiment analysis
stop_words = set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}

# Remove stop words from 'clean_comment' column, retaining essential ones
df['text_clean'] = df['text_clean'].apply(
    lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words])
)




In [14]:
# Features
X = df.drop(columns=[
    'video_id', 'category', 'text', 'text_clean',
    'published_at', 'sentiment', 'dominant_emotion','sentiment_numeric'
])

# Target
y = df['sentiment_numeric']

print(" Features shape:", X.shape)
print(" Target shape:", y.shape)
print(" Unique target values:", y.unique())


 Features shape: (49533, 26)
 Target shape: (49533,)
 Unique target values: [ 1  0 -1]


In [15]:
df.head()

Unnamed: 0,video_id,category,text,likes,published_at,text_clean,sentiment,sentiment_numeric,word_count,num_stop_words,...,fear,anger,anticipation,trust,surprise,positive,negative,sadness,disgust,joy
0,4wZwXhoxRIA,technology,All products can be found on www.justicebuys.c...,1978,2025-01-04 19:28:08+00:00,product found üôåüèº since review 50+ product per ...,positive,1,24,9,...,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.333333
1,4wZwXhoxRIA,technology,Bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,0,2025-09-10 23:44:24+00:00,bro ‚Äúhow talk woman 6 steps‚Äù relatable,neutral,0,12,5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,4wZwXhoxRIA,technology,I was gonna say does it give you the drinks fo...,0,2025-09-09 16:25:03+00:00,wa gonna say doe give drink free?üò≠,positive,1,12,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4wZwXhoxRIA,technology,Anyone gonna talk abt what was o. His pc,0,2025-09-08 22:33:58+00:00,anyone gonna talk abt wa o. pc,neutral,0,9,3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4wZwXhoxRIA,technology,HOW IS EVERYONE NOT TALKING ABOUT HIS SEARCH?!...,0,2025-09-08 12:17:37+00:00,"everyone not talking search?!""how tell ur girl...",positive,1,15,6,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5


In [16]:
X.dtypes


likes                      int64
word_count                 int64
num_stop_words             int64
num_punctuation_chars      int64
num_chars                  int64
hour                       int64
weekday                    int64
month                      int64
category_gaming          float64
category_movies          float64
category_music           float64
category_technology      float64
sent_neg                 float64
sent_neu                 float64
sent_pos                 float64
sent_compound            float64
fear                     float64
anger                    float64
anticipation             float64
trust                    float64
surprise                 float64
positive                 float64
negative                 float64
sadness                  float64
disgust                  float64
joy                      float64
dtype: object

In [17]:
X.isnull().sum().sum()


np.int64(0)

In [18]:
pd.set_option('display.max_columns', None)


In [19]:
df.head()

Unnamed: 0,video_id,category,text,likes,published_at,text_clean,sentiment,sentiment_numeric,word_count,num_stop_words,num_punctuation_chars,num_chars,hour,weekday,month,category_gaming,category_movies,category_music,category_technology,sent_neg,sent_neu,sent_pos,sent_compound,dominant_emotion,fear,anger,anticipation,trust,surprise,positive,negative,sadness,disgust,joy
0,4wZwXhoxRIA,technology,All products can be found on www.justicebuys.c...,1978,2025-01-04 19:28:08+00:00,product found üôåüèº since review 50+ product per ...,positive,1,24,9,1,116,19,5,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,trust,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.333333
1,4wZwXhoxRIA,technology,Bro ‚Äúhow to talk to woman in 6 steps‚Äù is so re...,0,2025-09-10 23:44:24+00:00,bro ‚Äúhow talk woman 6 steps‚Äù relatable,neutral,0,12,5,0,53,23,2,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,positive,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,4wZwXhoxRIA,technology,I was gonna say does it give you the drinks fo...,0,2025-09-09 16:25:03+00:00,wa gonna say doe give drink free?üò≠,positive,1,12,7,1,54,16,1,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,fear,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4wZwXhoxRIA,technology,Anyone gonna talk abt what was o. His pc,0,2025-09-08 22:33:58+00:00,anyone gonna talk abt wa o. pc,neutral,0,9,3,1,40,22,0,9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,positive,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4wZwXhoxRIA,technology,HOW IS EVERYONE NOT TALKING ABOUT HIS SEARCH?!...,0,2025-09-08 12:17:37+00:00,"everyone not talking search?!""how tell ur girl...",positive,1,15,6,4,85,12,0,9,0.0,0.0,0.0,1.0,0.0,0.743,0.257,0.6696,positive,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5


In [20]:
print("X_train shape:", X.shape)
print("y_train shape:", y.shape)


X_train shape: (49533, 26)
y_train shape: (49533,)


In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
import scipy.sparse as sp
import numpy as np

# -----------------------------
# 1Ô∏è‚É£ Define numeric features
# -----------------------------
numeric_cols = [col for col in df.columns if col not in [
    'video_id', 'category', 'text', 'text_clean', 'sentiment', 
    'dominant_emotion', 'published_at', 'sentiment_numeric'
]]

X_numeric = df[numeric_cols]
y = df['sentiment_numeric']

# Scale numeric features
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_numeric)

# -----------------------------
# 2Ô∏è‚É£ Train-test split (once!)
# -----------------------------
X_train_num, X_test_num, y_train, y_test, train_idx, test_idx = train_test_split(
    X_numeric_scaled, y, df.index, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 3Ô∏è‚É£ Include text features for the same train/test indices
# -----------------------------
df_train_text = df.loc[train_idx, 'text_clean']
df_test_text = df.loc[test_idx, 'text_clean']

# -----------------------------
# 4Ô∏è‚É£ Run experiment function
# -----------------------------
best_accuracy = 0
best_run_info = {}

def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, n_estimators=200, max_depth=15):
    global best_accuracy, best_run_info

    # Vectorization
    if vectorizer_type == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)

    # Fit and transform text
    X_train_vec = vectorizer.fit_transform(df_train_text)
    X_test_vec = vectorizer.transform(df_test_text)

    # Convert numeric features to sparse and combine
    X_train_sparse = sp.hstack([X_train_vec, sp.csr_matrix(X_train_num)])
    X_test_sparse = sp.hstack([X_test_vec, sp.csr_matrix(X_test_num)])

    # -----------------------------
    # MLflow experiment
    # -----------------------------
    with mlflow.start_run() as run:
        run_name = f"{vectorizer_type}_{ngram_range}_{n_estimators}trees"
        mlflow.set_tag("mlflow.runName", run_name)
        mlflow.set_tag("experiment_type", "feature_engineering")
        mlflow.set_tag("model_type", "RandomForestClassifier")

        # Log params
        mlflow.log_params({
            "vectorizer_type": vectorizer_type,
            "ngram_range": ngram_range,
            "vectorizer_max_features": vectorizer_max_features,
            "numeric_features_count": X_train_num.shape[1],
            "n_estimators": n_estimators,
            "max_depth": max_depth
        })

        # Train model
        model = RandomForestClassifier(
        n_estimators=100,
        max_depth=20,          
        min_samples_leaf=2,      # default
        max_features='sqrt',     # sqrt of total features works well for text
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
        )

        model.fit(X_train_sparse, y_train)

        # Predictions & metrics
        y_pred = model.predict(X_test_sparse)
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification metrics
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Confusion matrix
        conf_matrix = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8,6))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title(f"Confusion Matrix: {vectorizer_type}, {ngram_range}")
        plt.savefig("confusion_matrix.png")
        mlflow.log_artifact("confusion_matrix.png")
        plt.close()

        # Log model
        mlflow.sklearn.log_model(model, f"random_forest_model_{vectorizer_type}_{ngram_range}_{n_estimators}trees")

        # Update best run
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_run_info = {
                "vectorizer": vectorizer_type,
                "ngram_range": ngram_range,
                "n_estimators": n_estimators,
                "max_depth": max_depth,
                "accuracy": accuracy,
                "run_id": run.info.run_id
            }

# -----------------------------
# 5Ô∏è‚É£ Run all experiments
# -----------------------------
ngram_ranges = [(1,1), (1,2), (1,3)]
vectorizers = ["BoW", "TF-IDF"]
n_estimators_list = [100, 200, 300]

for vectorizer_type in vectorizers:
    for ngram in ngram_ranges:
        for n_est in n_estimators_list:
            run_experiment(vectorizer_type, ngram, vectorizer_max_features=5000, n_estimators=n_est)


print("‚úÖ Best Run Info:")
print(best_run_info)




üèÉ View run BoW_(1, 1)_100trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/c4f64fa7cfce41ad968117eb92476569
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run BoW_(1, 1)_200trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/38d44835358849b4b963a3cb32fe12c4
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run BoW_(1, 1)_300trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/f4b501d2594f45daac475fbf0d65146f
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run BoW_(1, 2)_100trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/e191539b2bbd4e8c8c6844a5a70536ae
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run BoW_(1, 2)_200trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/124b1054cae74225b06c42e22c3277e9
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run BoW_(1, 2)_300trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/b1e3faa3f01c4e2c8b391af689f298a2
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run BoW_(1, 3)_100trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/5f2161e42cbb414b8a0a7cc3885110a2
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run BoW_(1, 3)_200trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/4651d9ab69f145df9e2108eeb02ed530
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run BoW_(1, 3)_300trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/1770d649661f41009a3ea21b4a791393
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 1)_100trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/2ef9f91f46be4eda80eafed04d6c850f
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 1)_200trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/54991314cf17494fa3c227c36ce040f6
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 1)_300trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/9d5dd9f316d443ab8247363c421347d3
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 2)_100trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/2ec38487e4c8422abb58dad20907050f
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 2)_200trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/5b62b54481524b42805d3226bb749761
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 2)_300trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/0d290677ae7b435bbb231eabfced25a5
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 3)_100trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/d756bf31e6ed42f6996458888a6e0b23
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 3)_200trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/09928658ceac49e4984ccd6e284f5948
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1




üèÉ View run TF-IDF_(1, 3)_300trees at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1/runs/725e7ce28c6348b28b6e2c313b49c237
üß™ View experiment at: https://dagshub.com/Vaibha3246/influence_mirror.mlflow/#/experiments/1
‚úÖ Best Run Info:
{'vectorizer': 'TF-IDF', 'ngram_range': (1, 3), 'n_estimators': 100, 'max_depth': 15, 'accuracy': 0.6551932976683154, 'run_id': 'd756bf31e6ed42f6996458888a6e0b23'}


In [None]:
print(classification_report(y_test, y_pred))

NameError: name 'y_pred' is not defined

In [25]:
df.to_csv('preprocessing.csv', index=False)

In [27]:
df.shape

(49533, 34)