In [14]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score
import re
import string 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.model_selection import GridSearchCV
import os
import joblib


In [2]:
# Read the tweet-emotion dataset straight from the hosted CSV, then discard the
# `tweet_id` column, which is not used by any later cell shown here.
df = pd.read_csv(
    "https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/refs/heads/main/tweet_emotions.csv"
)
df = df.drop('tweet_id', axis=1)
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [3]:
def clean_data(text):
  """Normalise a raw string for downstream text processing.

  Steps, in order: lowercase, delete every character listed in
  `string.punctuation` (no replacement character is inserted, so words
  separated only by punctuation become joined), delete runs of digits, and
  trim leading/trailing whitespace.

  Args:
    text: The string to normalise.

  Returns:
    The normalised string.
  """
  drop_punctuation = str.maketrans('', '', string.punctuation)
  lowered = text.lower().translate(drop_punctuation)
  return re.sub(r'\d+', '', lowered).strip()

def noise_removal(text):
  """Strip link-like tokens and every character that is not an ASCII letter or whitespace.

  Args:
    text: The string to filter.

  Returns:
    The string with any run starting with `http`, `www` or `https` removed
    (up to the next whitespace), followed by removal of all characters
    outside `A-Z`, `a-z` and whitespace.
  """
  without_links = re.compile(r"http\S+|www\S+|https\S+").sub('', text)
  letters_only = re.compile(r'[^A-Za-z\s]').sub('', without_links)
  return letters_only

def tokenization(text):
  """Split a string into tokens by delegating to NLTK's `word_tokenize`.

  Args:
    text: The string to tokenize.

  Returns:
    Whatever `word_tokenize(text)` returns (the token sequence for `text`).
  """
  return word_tokenize(text)

def _english_stop_words():
  """Return the NLTK English stop-word set, building it only on first use."""
  cached = getattr(_english_stop_words, '_cache', None)
  if cached is None:
    cached = frozenset(stopwords.words('english'))
    # Stored on the function object so no extra module-level name is needed.
    _english_stop_words._cache = cached
  return cached


def stopword_removal(tokens):
  """Drop English stop words from a token sequence.

  The stop-word set does not depend on `tokens`, so it is fetched from a
  cache instead of being rebuilt on every call; this function is applied
  once per row, which made the rebuild loop-invariant work.

  Args:
    tokens: Iterable of token strings.

  Returns:
    A new list with the tokens that are not in the English stop-word set,
    in their original order. Matching is exact (case-sensitive).
  """
  stop_words = _english_stop_words()
  return [word for word in tokens if word not in stop_words]

def lemmatization(tokens):
  """Lemmatize each token with NLTK's `WordNetLemmatizer`.

  Args:
    tokens: Iterable of token strings.

  Returns:
    A new list holding `WordNetLemmatizer().lemmatize(token)` for every
    token, in the original order.
  """
  lemmatize = WordNetLemmatizer().lemmatize
  return list(map(lemmatize, tokens))


def preprocess_text(df):
  """Run the full text-cleaning pipeline over the `content` column.

  Each value goes through `clean_data`, `noise_removal`, `tokenization`,
  `stopword_removal` and `lemmatization` in that order, and the resulting
  tokens are joined back into one space-separated string.

  The input frame is not modified: a new DataFrame is returned with
  `content` replaced, so re-running a cell that displays the raw frame still
  shows raw text. All steps are applied in a single pass over the column
  rather than one pass per step.

  Args:
    df: DataFrame with a `content` column of strings.

  Returns:
    A new DataFrame with the same columns, where `content` holds the
    processed text.
  """
  def _process(content):
    # Same step order as the individual helpers are designed to be chained in.
    tokens = tokenization(noise_removal(clean_data(content)))
    tokens = lemmatization(stopword_removal(tokens))
    return ' '.join(tokens)

  return df.assign(content=df['content'].apply(_process))
  

In [4]:
# Replace the raw text with its preprocessed form and preview the result.
# Rebinding `df` means re-running this cell feeds already-processed text back
# through the pipeline.
df = df.pipe(preprocess_text)
df.head(5)

Unnamed: 0,sentiment,content
0,empty,tiffanylue know listenin bad habit earlier sta...
1,sadness,layin n bed headache ughhhhwaitin call
2,sadness,funeral ceremonygloomy friday
3,enthusiasm,want hang friend soon
4,neutral,dannycastillo want trade someone houston ticke...


In [5]:
# Class distribution of the sentiment labels, most frequent first.
df.loc[:, 'sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [6]:
# Keep only the two classes used for the binary task.
# `.copy()` gives `df` its own data, so later column assignments do not act on
# a slice of the original frame (the situation pandas reports with
# SettingWithCopyWarning). The mask gets a descriptive name instead of `x`,
# which a later cell uses for the feature matrix.
is_target_class = df['sentiment'].isin(['happiness', 'sadness'])
df = df[is_target_class].copy()

In [7]:
# Encode the labels as integers: sadness -> 0, happiness -> 1.
# `assign` builds a new frame instead of writing a column into `df`, which may
# be a slice of an earlier frame. The explicit cast makes the label dtype an
# integer regardless of how `replace` infers the resulting dtype; it raises if
# any other label string is still present.
df = df.assign(
    sentiment=df['sentiment'].replace({'sadness': 0, 'happiness': 1}).astype(int)
)
df.head()

  df['sentiment'] = df['sentiment'].replace({'sadness':0,'happiness':1})


Unnamed: 0,sentiment,content
1,0,layin n bed headache ughhhhwaitin call
2,0,funeral ceremonygloomy friday
6,0,sleep im thinking old friend want he married d...
8,0,charviray charlene love miss
9,0,kelcouch im sorry least friday


In [8]:
# Bag-of-words features with the vocabulary capped at 1000 terms; the encoded
# sentiment column is the target.
# NOTE(review): the vocabulary is fit on every row of `df`, i.e. before the
# train/test split performed in the next cell — confirm this is intended.
vectorizer = CountVectorizer(max_features=1000)
y = df['sentiment']
x = vectorizer.fit_transform(df['content'])


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
import dagshub

# Hyperparameter combinations explored by the grid search: three
# regularisation strengths crossed with two penalties, all using the
# liblinear solver.
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# Connect this session to the DagsHub repository and point MLflow at its
# tracking endpoint.
dagshub.init(repo_owner='Nite2005', repo_name='mlops-mini-project', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/Nite2005/mlops-mini-project.mlflow")

# All runs started after this call are grouped under this experiment.
mlflow.set_experiment("Logistic Regression Hyperparameter Tuning")

In [18]:
with mlflow.start_run():
    # One shared estimator configuration for both the cross-validated search
    # and the per-combination refits below, so the logged CV scores and the
    # hold-out metrics describe identically configured models.
    # `multi_class='ovr'` is intentionally not passed: the argument is
    # deprecated in recent scikit-learn releases, and for the liblinear solver
    # used by `param_grid` the default already resolves to one-vs-rest.
    base_lr_params = {'max_iter': 1000}

    grid_search = GridSearchCV(
        LogisticRegression(**base_lr_params),
        param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1
    )
    grid_search.fit(x_train, y_train)

    # Re-execute the notebook once (not once per parameter combination) and
    # check the exit status instead of discarding it.
    # NOTE(review): if `notebook_path` is this very notebook, executing it
    # from inside itself would re-enter this cell — confirm it refers to a
    # different notebook.
    notebook_path = 'exp1_baseline_model.ipynb'
    exit_code = os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    if exit_code != 0:
        print(f"Warning: nbconvert exited with status {exit_code} for {notebook_path}")
    notebook_available = os.path.exists(notebook_path)
    if not notebook_available:
        print(f"Warning: {notebook_path} not found; it will not be logged as an artifact")

    cv_results = grid_search.cv_results_
    for params, mean_score, std_score in zip(
        cv_results['params'],
        cv_results['mean_test_score'],
        cv_results['std_test_score'],
    ):
        with mlflow.start_run(run_name=f"LR with params: {params}", nested=True):
            # Refit on the full training split with the same base settings the
            # grid search used, then score on the held-out test split.
            model = LogisticRegression(**{**base_lr_params, **params})
            model.fit(x_train, y_train)

            # Predictions
            y_pred = model.predict(x_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            # Log parameters & metrics
            mlflow.log_params(params)
            mlflow.log_params(base_lr_params)
            mlflow.log_param("vectorizer", "Bag of words")
            mlflow.log_param("num_features", 1000)
            mlflow.log_param("test_size", 0.2)
            mlflow.log_param("model", "Logistic Regression")

            mlflow.log_metric("mean_cv_score", mean_score)
            mlflow.log_metric("std_cv_score", std_score)
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1_score", f1)

            # Save and log model (the local file is overwritten on every
            # iteration; each run keeps its own uploaded copy).
            joblib.dump(model, "model.pkl")
            mlflow.log_artifact("model.pkl")

            # Attach the notebook only when it exists, so a missing file does
            # not abort the remaining parameter combinations.
            if notebook_available:
                mlflow.log_artifact(notebook_path)

            print(f"\nParams: {params}")
            print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Log best params and best score from GridSearchCV on the parent run.
    # `best_score_` is the mean cross-validated F1, not a test-set score.
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    mlflow.log_params(best_params)
    mlflow.log_metric("best_f1_score", best_score)




Params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.7320, Precision: 0.7710, Recall: 0.6433, F1: 0.7014
🏃 View run LR with params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} at: https://dagshub.com/Nite2005/mlops-mini-project.mlflow/#/experiments/2/runs/4e5a8e09ca1e4de0b44c8834464d00b7
🧪 View experiment at: https://dagshub.com/Nite2005/mlops-mini-project.mlflow/#/experiments/2

Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.7831, Precision: 0.7653, Recall: 0.8030, F1: 0.7837
🏃 View run LR with params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} at: https://dagshub.com/Nite2005/mlops-mini-project.mlflow/#/experiments/2/runs/bdbde910ef62425394595a2f09237b92
🧪 View experiment at: https://dagshub.com/Nite2005/mlops-mini-project.mlflow/#/experiments/2

Params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.7827, Precision: 0.7578, Recall: 0.8167, F1: 0.7862
🏃 View run LR with params: {'C': 1, 'penalty': 'l1', 'solver':