In [1]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv').drop(columns=['tweet_id'])
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [6]:
def remove_punctuation(text):
    return re.sub(f"[{re.escape(string.punctuation)}]", "", text)

In [7]:
def lowercase(text):
    text = [word.lower() for word in text.split()]
    return " ".join(text)

In [8]:
def remove_stopwords(text):
    stopwords_list = set(stopwords.words('english'))
    for word in text.split():
        if word.lower() in stopwords_list:
            text = text.replace(word, "")
    return text

In [9]:
def remove_nums(text):
    for char in text:
        if char.isdigit():
            text = text.replace(char, "")
    return text

In [10]:
def lematize(text):
    lemmatizer = WordNetLemmatizer()
    for word in text.split():
        text = text.replace(word, lemmatizer.lemmatize(word))
    return text

In [11]:
def remove_urls(text):
    return re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

In [12]:
def preprocess_text(text):
    text = remove_punctuation(text)
    text = lowercase(text)
    text = remove_stopwords(text)
    text = remove_nums(text)
    text = lematize(text)
    text = remove_urls(text)
    return text

In [13]:
def normalize_text(df):
    try:
        df['content'] = df['content'].astype(str)
        df['content'] = df['content'].apply(preprocess_text)
        return df
    except Exception as e:
        print(f"Error in normalize_text: {e}")
        raise
    

In [25]:
df.sentiment.value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [30]:
df.shape

(40000, 2)

In [40]:
df = df[df['sentiment'].isin(['happiness', 'sadness'])]
df = normalize_text(df)

In [41]:
df.sentiment = df.sentiment.map({
    # 'neutral': 0,
    # 'worry': 1,
    'sadness': 0,
    'happiness':1
})
df.head()

Unnamed: 0,sentiment,content
1,0,lyin n bed hedche ughhhhwitin cll
2,0,funeral ceremonygloomy friday
6,0,sleep m thnkng old frend wt s marred...
8,0,charvray charlene love ms
9,0,kelcouch im sorry least friday


In [42]:
df.shape

(10374, 2)

In [43]:
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['content'])
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
import dagshub
dagshub.init(repo_owner='Sudip-8345', repo_name='DVC-git-mini-Project', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/Sudip-8345/DVC-git-mini-Project.mlflow')

mlflow.set_experiment('tweet_emotion_classification using baseline Logistic Regression ')




2025/06/15 01:02:23 INFO mlflow.tracking.fluent: Experiment with name 'tweet_emotion_classification using baseline Logistic Regression ' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/38951ea8ed414071976af1563898ea9c', creation_time=1749929543959, experiment_id='0', last_update_time=1749929543959, lifecycle_stage='active', name='tweet_emotion_classification using baseline Logistic Regression ', tags={}>

In [None]:
import os
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import joblib

os.environ["MLFLOW_TRACKING_USERNAME"] = "Sudip-8345"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "3a7b1bd52c535c004bd9b275516eae784255615d"
mlflow.set_tracking_uri('https://dagshub.com/Sudip-8345/DVC-git-mini-Project.mlflow')

# MLflow tracking
with mlflow.start_run():
    # Log parameters
    mlflow.log_param('vectorizer','bag of words')
    mlflow.log_param('model', 'Logistic Regression')
    mlflow.log_param('max_features', 1000)
    mlflow.log_param('test_size', 0.2)

    # Train model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute metrics
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
    precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)

    print(f"Accuracy: {accuracy}")
    print(f"Recall: {recall}")
    print(f"Precision: {precision}")
    print(f"F1 Score: {f1}")
    
    # Log metrics
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('f1_score', f1)

    # Save and log model
    model_dir = "exp1_baseline_model"
    import joblib
    joblib.dump(model, "model/logreg_model.pkl")
    mlflow.log_artifact("model/logreg_model.pkl", artifact_path="model")

    # Save metrics to file
    os.makedirs("model", exist_ok=True)
    with open('model/metrics.txt', 'w') as f:
        f.write(f'Accuracy: {accuracy}\n')
        f.write(f'Recall: {recall}\n')
        f.write(f'Precision: {precision}\n')
        f.write(f'F1 Score: {f1}\n')

    # Save and log vectorizer (if using CountVectorizer or similar)
    # joblib.dump(vectorizer, 'model/bag_of_words_lr.pkl')
    if os.path.exists('model/bag_of_words_lr.pkl'):
        mlflow.log_artifact('model/bag_of_words_lr.pkl', artifact_path='model')

    # Log metrics.txt
    mlflow.log_artifact('model/metrics.txt', artifact_path='model')

    print(f"Model saved in run {mlflow.active_run().info.run_id}")


Model saved in run 14affec8e0414460aad36cdded38d237
🏃 View run amusing-squirrel-178 at: https://dagshub.com/Sudip-8345/DVC-git-mini-Project.mlflow/#/experiments/0/runs/14affec8e0414460aad36cdded38d237
🧪 View experiment at: https://dagshub.com/Sudip-8345/DVC-git-mini-Project.mlflow/#/experiments/0
