BaseLine Logisitc Regression

In [59]:
import pandas as pd
from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import os
from dotenv import load_dotenv
import dagshub
import mlflow
import logging
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

In [58]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

In [53]:
load_dotenv()

REPO_OWNER = os.getenv("REPO_OWNER")
REPO_NAME = os.getenv("REPO_NAME")
dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)

data_path = Path.cwd().parent / "data" / "data.csv"

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


In [40]:
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,review,sentiment
0,Bad. Personal opinion? The folks who made it? ...,negative
1,This movie is obviously low-budget & filmed in...,positive
2,"Yes, this movie has kids going to space camp a...",negative
3,"Before I begin, let me tell you how GREAT this...",positive
4,The Vampire Bat is set in the small German vil...,negative


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1000 non-null   object
 1   sentiment  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [42]:
def remove_stop_words(text):
    """Remove stop words from the text."""
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

In [43]:
def removing_numbers(text):
    """Remove numbers from the text."""
    text = ''.join([char for char in text if not char.isdigit()])
    return text

In [None]:
def lower_case(text):
    """Convert text to lower case."""
    return text.lower()

In [45]:
def lemmatization(text):
    """Lemmatize the text."""
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

In [46]:
def removing_punctuations(text):
    """Remove punctuations from the text."""
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('ÿõ', "")
    text = text.replace(';', "")
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [47]:
def removing_urls(text):
    """Remove URLs from the text."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [48]:
def normalize_text(df):
    """Normalize the text data."""
    df['review'] = df['review'].apply(lower_case)
    df['review'] = df['review'].apply(remove_stop_words)
    df['review'] = df['review'].apply(removing_numbers)
    df['review'] = df['review'].apply(removing_punctuations)
    df['review'] = df['review'].apply(removing_urls)
    df['review'] = df['review'].apply(lemmatization)
    return df

In [49]:
df = normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
0,bad personal opinion folk made it knew made it...,negative
1,movie obviously low budget filmed british colu...,positive
2,yes movie kid going space camp start okay enou...,negative
3,begin let tell great stand up special sound pl...,positive
4,vampire bat set small german village klineschl...,negative


In [50]:
sentiment_mapper = {"positive" : 1 , "negative" : 0}
df['sentiment'] = df['sentiment'].map(sentiment_mapper)
df.head()

Unnamed: 0,review,sentiment
0,bad personal opinion folk made it knew made it...,0
1,movie obviously low budget filmed british colu...,1
2,yes movie kid going space camp start okay enou...,0
3,begin let tell great stand up special sound pl...,1
4,vampire bat set small german village klineschl...,0


In [51]:
vectorizer = CountVectorizer(max_features=100)
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
mlflow.set_experiment("Logistic Regression Baseline")

<Experiment: artifact_location='mlflow-artifacts:/b40e91b6bc2e42b4a98ecd7a155d8265', creation_time=1768137081646, experiment_id='0', last_update_time=1768137081646, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>

In [60]:
logging.info("Starting MLflow run...")

with mlflow.start_run():
    start_time = time.time()

    try:
        logging.info("Logging preprocessing parameters...")
        mlflow.log_param("vectorizer", "Bag of Words")
        mlflow.log_param("num_features", 100)
        mlflow.log_param("test_size", 0.2)

        logging.info("Initializing Logistic Regression model...")
        model = LogisticRegression(max_iter=1000) 

        logging.info("Fitting the model...")
        model.fit(X_train, y_train)
        logging.info("Model training complete.")

        logging.info("Logging model parameters...")
        mlflow.log_param("model", "Logistic Regression")

        logging.info("Making predictions...")
        y_pred = model.predict(X_test)

        logging.info("Calculating evaluation metrics...")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        logging.info("Logging evaluation metrics...")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        logging.info("Saving and logging the model...")
        mlflow.sklearn.log_model(model, "model")

        # Log execution time
        end_time = time.time()
        logging.info(f"Model training and logging completed in {end_time - start_time:.2f} seconds.")

        # Save and log the notebook
        notebook_path = "exp1.ipynb"
        logging.info("Executing Jupyter Notebook.")
        os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
        mlflow.log_artifact(notebook_path)

        logging.info("Notebook execution and logging complete.")

        # Print the results for verification
        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)

2026-01-11 18:48:02,543 - INFO - Starting MLflow run...
2026-01-11 18:48:03,841 - INFO - Logging preprocessing parameters...
2026-01-11 18:48:05,166 - INFO - Initializing Logistic Regression model...
2026-01-11 18:48:05,166 - INFO - Fitting the model...
2026-01-11 18:48:05,178 - INFO - Model training complete.
2026-01-11 18:48:05,178 - INFO - Logging model parameters...
2026-01-11 18:48:05,648 - INFO - Making predictions...
2026-01-11 18:48:05,651 - INFO - Calculating evaluation metrics...
2026-01-11 18:48:05,661 - INFO - Logging evaluation metrics...
2026-01-11 18:48:10,774 - INFO - Saving and logging the model...
2026-01-11 18:48:25,825 - INFO - Model training and logging completed in 21.98 seconds.
2026-01-11 18:48:25,825 - INFO - Executing Jupyter Notebook.
2026-01-11 18:48:45,833 - INFO - Notebook execution and logging complete.
2026-01-11 18:48:45,833 - INFO - Accuracy: 0.655
2026-01-11 18:48:45,833 - INFO - Precision: 0.6699029126213593
2026-01-11 18:48:45,833 - INFO - Recall: 0

üèÉ View run abrasive-gnu-586 at: https://dagshub.com/Shreyaan16/Sentiment-Analysis-IMDB.mlflow/#/experiments/0/runs/bc54c98340114943b78efed8955f2ed1
üß™ View experiment at: https://dagshub.com/Shreyaan16/Sentiment-Analysis-IMDB.mlflow/#/experiments/0
