Logistic Regression + Tfidf hyperparameter tuning

In [30]:
import os
import re
import string
import pandas as pd
import mlflow
import logging
import dagshub
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from dotenv import load_dotenv
from pathlib import Path

In [31]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

In [32]:
load_dotenv()

REPO_OWNER = os.getenv("REPO_OWNER")
REPO_NAME = os.getenv("REPO_NAME")
dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)

data_path = Path.cwd().parent / "data" / "data.csv"

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

2026-01-11 20:54:29,106 - INFO - HTTP Request: GET https://dagshub.com/api/v1/repos/Shreyaan16/Sentiment-Analysis-IMDB "HTTP/1.1 200 OK"


2026-01-11 20:54:29,116 - INFO - Initialized MLflow to track repo "Shreyaan16/Sentiment-Analysis-IMDB"


2026-01-11 20:54:29,119 - INFO - Repository Shreyaan16/Sentiment-Analysis-IMDB initialized!


In [33]:
mlflow.set_experiment("LoR with Tfidf")

2026/01/11 20:54:29 INFO mlflow.tracking.fluent: Experiment with name 'LoR with Tfidf' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/e8b0aa2e604240ec892beb90e0bb83b8', creation_time=1768145071690, experiment_id='2', last_update_time=1768145071690, lifecycle_stage='active', name='LoR with Tfidf', tags={}>

In [34]:
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,review,sentiment
0,Bad. Personal opinion? The folks who made it? ...,negative
1,This movie is obviously low-budget & filmed in...,positive
2,"Yes, this movie has kids going to space camp a...",negative
3,"Before I begin, let me tell you how GREAT this...",positive
4,The Vampire Bat is set in the small German vil...,negative


In [35]:
def remove_stop_words(text):
    """Remove stop words from the text."""
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

In [36]:
def removing_numbers(text):
    """Remove numbers from the text."""
    text = ''.join([char for char in text if not char.isdigit()])
    return text

In [37]:
def lower_case(text):
    """Convert text to lower case."""
    return text.lower()

In [38]:
def lemmatization(text):
    """Lemmatize the text."""
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

In [39]:
def removing_punctuations(text):
    """Remove punctuations from the text."""
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('ÿõ', "")
    text = text.replace(';', "")
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [40]:
def removing_urls(text):
    """Remove URLs from the text."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [41]:
def normalize_text(df):
    """Normalize the text data."""
    df['review'] = df['review'].apply(lower_case)
    df['review'] = df['review'].apply(remove_stop_words)
    df['review'] = df['review'].apply(removing_numbers)
    df['review'] = df['review'].apply(removing_punctuations)
    df['review'] = df['review'].apply(removing_urls)
    df['review'] = df['review'].apply(lemmatization)
    return df

In [42]:
df = normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
0,bad personal opinion folk made it knew made it...,negative
1,movie obviously low budget filmed british colu...,positive
2,yes movie kid going space camp start okay enou...,negative
3,begin let tell great stand up special sound pl...,positive
4,vampire bat set small german village klineschl...,negative


In [43]:
sentiment_mapper = {"positive" : 1 , "negative" : 0}
df['sentiment'] = df['sentiment'].map(sentiment_mapper)
df.head()

Unnamed: 0,review,sentiment
0,bad personal opinion folk made it knew made it...,0
1,movie obviously low budget filmed british colu...,1
2,yes movie kid going space camp start okay enou...,0
3,begin let tell great stand up special sound pl...,1
4,vampire bat set small german village klineschl...,0


In [44]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["review"])
y = df["sentiment"]

In [45]:
X_train , X_test , y_train , y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
def train_and_log_model(X_train, X_test, y_train, y_test, vectorizer):
    """Trains a Logistic Regression model with GridSearch and logs results to MLflow."""
    
    param_grid = {
        "C": [0.1, 1, 10],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear"]
    }
    
    with mlflow.start_run():
        grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="f1", n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Log all hyperparameter tuning runs
        for params, mean_score, std_score in zip(grid_search.cv_results_["params"], 
                                                 grid_search.cv_results_["mean_test_score"], 
                                                 grid_search.cv_results_["std_test_score"]):
            with mlflow.start_run(run_name=f"LR with params: {params}", nested=True):
                model = LogisticRegression(**params)
                model.fit(X_train, y_train)
                
                y_pred = model.predict(X_test)
                
                metrics = {
                    "accuracy": accuracy_score(y_test, y_pred),
                    "precision": precision_score(y_test, y_pred),
                    "recall": recall_score(y_test, y_pred),
                    "f1_score": f1_score(y_test, y_pred),
                    "mean_cv_score": mean_score,
                    "std_cv_score": std_score
                }
                
                # Log parameters & metrics
                mlflow.log_params(params)
                mlflow.log_metrics(metrics)
                
                print(f"Params: {params} | Accuracy: {metrics['accuracy']:.4f} | F1: {metrics['f1_score']:.4f}")

        # Log the best model
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_
        best_f1 = grid_search.best_score_

        mlflow.log_params(best_params)
        mlflow.log_metric("best_f1_score", best_f1)
        mlflow.sklearn.log_model(best_model, "model")
        
        print(f"\nBest Params: {best_params} | Best F1 Score: {best_f1:.4f}")

In [47]:
train_and_log_model(X_train, X_test, y_train, y_test, vectorizer)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} | Accuracy: 0.4800 | F1: 0.0000
üèÉ View run LR with params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'} at: https://dagshub.com/Shreyaan16/Sentiment-Analysis-IMDB.mlflow/#/experiments/2/runs/e8e8b1bb78394b87b8eb866b46e531c2
üß™ View experiment at: https://dagshub.com/Shreyaan16/Sentiment-Analysis-IMDB.mlflow/#/experiments/2
Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} | Accuracy: 0.7350 | F1: 0.7072
üèÉ View run LR with params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'} at: https://dagshub.com/Shreyaan16/Sentiment-Analysis-IMDB.mlflow/#/experiments/2/runs/d44639b420294d588d4938f02411ef8c
üß™ View experiment at: https://dagshub.com/Shreyaan16/Sentiment-Analysis-IMDB.mlflow/#/experiments/2
Params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} | Accuracy: 0.7000 | F1: 0.7345
üèÉ View run LR with params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'} at: https://dagshub.com/Shreyaan16/Sentime




Best Params: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'} | Best F1 Score: 0.8106
üèÉ View run bouncy-kit-903 at: https://dagshub.com/Shreyaan16/Sentiment-Analysis-IMDB.mlflow/#/experiments/2/runs/fe0542a2a1fc418ea35eae723bf05666
üß™ View experiment at: https://dagshub.com/Shreyaan16/Sentiment-Analysis-IMDB.mlflow/#/experiments/2
