# Baseline Experiment: Logistic Regression

In [12]:
import logging
import time
import re
import string
from pathlib import Path

import nltk
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Configure Logging

In [13]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# Define Configuration and Constants

In [None]:
class Config:
    """
    Configuration for data paths and model parameters.
    """
    DATA_PATH = Path("data.csv")
    SAMPLE_SIZE = 500
    TEST_SIZE = 0.25
    RANDOM_STATE = 42
    MAX_FEATURES = 100
    MODEL_MAX_ITER = 1000
    MLFLOW_URI = 'https://dagshub.com/SurajBhar/moviesentiment.mlflow'
    EXPERIMENT_NAME = 'Logistic Regression Baseline'

# Load the dataset

In [15]:
def load_and_sample_data(path: Path, sample_size: int, random_state: int) -> pd.DataFrame:
    """
    Load the dataset from CSV and return a random sample.

    Args:
        path (Path): Path to the CSV file.
        sample_size (int): Number of samples to draw.
        random_state (int): Seed for reproducibility.

    Returns:
        pd.DataFrame: Sampled data.
    """
    df = pd.read_csv(path)
    sampled = df.sample(sample_size, random_state=random_state)
    sampled.to_csv(path, index=False)
    logging.info(f"Loaded and sampled {sample_size} rows.")
    return sampled

In [16]:
df = load_and_sample_data(Config.DATA_PATH, Config.SAMPLE_SIZE, Config.RANDOM_STATE)

2025-07-30 16:19:08,711 - INFO - Loaded and sampled 500 rows.


# The dataset contains 2 main Features
- Review
- Sentiment
- Size of the dataset: 500x2
- This dataset is a subset generated from original IMDB dataset.
- Due to Computation Limits we are going to use the small dataset. 

In [17]:
df

Unnamed: 0,review,sentiment
361,There wasn't a 0 in the voting option so i was...,negative
73,"When I was 11, Grease 2 was like crack. It was...",negative
374,"Some amusing humor, some that falls flat, some...",negative
155,I can say without a shadow of a doubt that Goi...,negative
104,"Attack Force has a horrendous title, and can a...",negative
...,...,...
106,"Now, I like sci-fi cartoons. However, when ""Ro...",negative
270,If you want to be cynical and pedantic you cou...,positive
348,Trapped: buried alive brings us to a resort th...,positive
435,"I recently stumbled across a TV showing of ""Pa...",positive


# Normalization Steps

**1. Lowercasing**

- What it does:
    - Convert all characters in your text to lowercase (e.g., “The Quick Brown Fox” → “the quick brown fox”).

- Why it helps:

    - Vocabulary reduction: “Dog,” “dog,” and “DOG” all become “dog,” so your model learns fewer, more robust word forms.

    - Consistency: Downstream tokenizers and embeddings treat words uniformly, improving statistics (e.g., word counts, TF–IDF).

**2. Remove URLs**
- What it does:
    - Strip out web addresses (e.g., “https://example.com/page”). Typically done via a regex like https?://\S+ or <http\S+>.

- Why it helps:

    - Reduce noise: URLs rarely contain linguistically meaningful content for most tasks (unless you’re doing specialized link analysis).

    - Prevent data leakage: Raw URLs can embed user-specific or sensitive information.

**3. Remove Numbers**
- What it does:
    - Delete or replace digit sequences (e.g., “In 2025, sales hit 1,000” → “In , sales hit ,”). Often achieved with a regex like \d+.

- Why it helps:

    - Normalize quantities: Unless your task needs exact numeric values (e.g., financial forecasting), stripping numbers keeps focus on the textual patterns.

    - Vocabulary control: Prevents proliferation of unique “words” like “2025,” “2024,” etc.

**4. Remove Punctuation**
- What it does:
    - Strip punctuation marks (.,!?;:“”()-—…) from the text.

- Why it helps:

    - Cleaner tokens: Ensures “hello,” “hello!” and “hello?” all map to “hello.”

    - Simplify modeling: Many algorithms treat punctuation as separate tokens, which often adds little semantic value in typical text classification or topic modeling.

**5. Remove Stop Words**
- What it does:
    - Eliminate very frequent, semantically-light words like “the,” “is,” “and,” “of.” Common stop‑word lists include 100–300 words.

- Why it helps:

    - Focus on content words: Drops words that carry little topic or sentiment information.

    - Dimensionality reduction: Fewer tokens → smaller vocabulary → faster training.

**6. Lemmatization**
- What it does:
    - Reduce words to their dictionary (lemma) form, taking context and part-of-speech into account (e.g., “running” → “run,” “better” → “good”).

- Why it helps:

    - Semantic grouping: “run,” “runs,” “running,” “ran” all become “run,” so your model sees them as the same concept.

    - Improved generalization: Helps with sparsity, especially in smaller datasets.



In [18]:
def preprocess_text(text: str, lemmatizer: WordNetLemmatizer, stop_words: set) -> str:
    """
    Apply normalization steps: lowercase, remove URLs, numbers, punctuation, stop words, and lemmatize.

    Args:
        text (str): Original text review.
        lemmatizer (WordNetLemmatizer): NLTK lemmatizer.
        stop_words (set): Set of stop words.

    Returns:
        str: Cleaned text.
    """
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove numbers
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Remove punctuation
    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)
    # Tokenize & remove stop words
    tokens = [w for w in text.split() if w not in stop_words]
    # Lemmatize
    lemmed = [lemmatizer.lemmatize(w) for w in tokens]
    return ' '.join(lemmed)

# Normalize the review column

In [19]:
def normalize_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the 'review' column in the dataset.

    Args:
        df (pd.DataFrame): DataFrame with a 'review' column.

    Returns:
        pd.DataFrame: DataFrame with cleaned reviews.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    df['review'] = df['review'].apply(lambda txt: preprocess_text(txt, lemmatizer, stop_words))
    logging.info("Completed text normalization.")
    return df

In [20]:
# Ensure necessary NLTK resources are available
for pkg in ['stopwords', 'wordnet', 'omw-1.4']:
    try:
        nltk.data.find(f'corpora/{pkg}')
    except LookupError:
        nltk.download(pkg, quiet=True)

In [21]:
# Step 2: Normalize text
df = normalize_dataset(df)

2025-07-30 16:19:42,748 - INFO - Completed text normalization.


In [23]:
df

Unnamed: 0,review,sentiment
361,voting option compelled use next available fig...,negative
73,grease like crack classless shameful euphoric ...,negative
374,amusing humor fall flat decent acting quite at...,negative
155,say without shadow doubt going overboard singl...,negative
104,attack force horrendous title almost certainly...,negative
...,...,...
106,like sci fi cartoon however robotboy appeared ...,negative
270,want cynical pedantic could point opening raf ...,positive
348,trapped buried alive brings u resort opened so...,positive
435,recently stumbled across tv showing passion mi...,positive


# Transform text data into features
- Maps the Sentiments into (positive:1, negative:0)

- We are using scikit‑learn’s `CountVectorizer` to turn a column of raw text (“reviews”) into a numeric feature matrix. Let’s break it down:

1. **`CountVectorizer(max_features=max_features)`**

   * **What it is:** A transformer that converts a collection of text documents into a “bag‑of‑words” count matrix.
   * **`max_features` parameter:** Limits the vocabulary to the top `max_features` most frequent tokens across your entire corpus. This helps control dimensionality by discarding all but the most common words.

2. **`.fit_transform(df['review'])`**

   * **`fit`:**

     * Learns the vocabulary dictionary: it scans all the reviews, tokenizes them (by default on whitespace and punctuation), counts word frequencies, and selects the top `max_features` words to keep.
   * **`transform`:**

     * Converts each review into a vector of word counts, where each dimension corresponds to one of the selected vocabulary words.
   * **Combining them (`fit_transform`):**

     * Efficiently does both steps in one go: first it “fits” to build the vocabulary, then immediately “transforms” the input text into the count matrix.

3. **Result `X`**

   * A **sparse matrix** of shape `(n_samples, max_features)`.

     * **`n_samples`** = number of rows in `df['review']`.
     * **`max_features`** = number of columns (the size of your truncated vocabulary).
   * Each row is a document (one review); each column is a token (one of the top `max_features` words).
   * The entries are integer counts: how many times each word appears in that review.

---

### Why use this?

* **Numerical input for ML models:** Most classical algorithms (logistic regression, SVMs, random forests, etc.) expect numeric arrays, not raw text.
* **Dimensionality control:** By capping at `max_features`, you avoid extremely high‑dimensional matrices that blow up memory and overfit on rare words.
* **Baseline representation:** Bag‑of‑words is a simple yet surprisingly effective way to capture word‑frequency information as features.

---

### Example

Suppose:

```python
df['review'] = [
    "I loved the movie, it was fantastic!",
    "Terrible acting and boring plot.",
    "Fantastic visuals but boring story."
]
max_features = 5
```

After fitting, `CountVectorizer` might pick these top 5 words:

```
['boring', 'fantastic', 'movie', 'story', 'terrible']
```

Then `X.toarray()` would look something like:

```python
array([
  [0, 1, 1, 0, 0],   # "loved the movie fantastic" → counts: boring=0, fantastic=1, movie=1, ...
  [1, 0, 0, 0, 1],   # "terrible acting boring plot" → boring=1, terrible=1
  [1, 1, 0, 1, 0],   # "fantastic visuals boring story" → boring=1, fantastic=1, story=1
])
```

Each row is a review, each column a word count for one of the top features.


In [39]:
def prepare_features(df: pd.DataFrame, max_features: int):
    """
    Transform text data into features and split into train/test sets.

    Returns:
        X_train, X_test, y_train, y_test, vectorizer
    """
    df = df[df['sentiment'].isin(['positive', 'negative'])].copy()
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

    vectorizer = CountVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(df['review'])
    y = df['sentiment'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=Config.TEST_SIZE, random_state=Config.RANDOM_STATE
    )
    logging.info("Prepared train/test splits.")
    return X_train, X_test, y_train, y_test, vectorizer

In [41]:
# Step 3: Prepare features
X_train, X_test, y_train, y_test, vectorizer = prepare_features(df, Config.MAX_FEATURES)

2025-07-30 16:39:52,268 - INFO - Prepared train/test splits.


In [42]:
print(f"The shape of X-train: {X_train.shape}")
print(f"The shape of X-test: {X_test.shape}")
print(f"The shape of y-train: {y_train.shape}")
print(f"The shape of y-test: {y_test.shape}")

The shape of X-train: (375, 100)
The shape of X-test: (125, 100)
The shape of y-train: (375,)
The shape of y-test: (125,)


In [43]:
print(X_train[0])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 8 stored elements and shape (1, 100)>
  Coords	Values
  (0, 12)	2
  (0, 62)	1
  (0, 28)	1
  (0, 44)	1
  (0, 77)	1
  (0, 26)	1
  (0, 31)	1
  (0, 19)	1


# Inspect the features and transformed data

In [44]:
# 1. Grab the feature names from your vectorizer
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['acting' 'actor' 'actually' 'almost' 'also' 'another' 'around' 'back'
 'bad' 'best' 'better' 'book' 'br' 'character' 'come' 'comedy' 'could'
 'day' 'director' 'done' 'end' 'even' 'ever' 'every' 'fact' 'family' 'fan'
 'feel' 'film' 'find' 'first' 'get' 'give' 'go' 'going' 'good' 'great'
 'guy' 'horror' 'interesting' 'know' 'life' 'like' 'line' 'little' 'long'
 'look' 'looking' 'lot' 'love' 'made' 'make' 'man' 'many' 'minute' 'movie'
 'much' 'must' 'never' 'new' 'nothing' 'old' 'one' 'part' 'people'
 'performance' 'play' 'plot' 'pretty' 'quite' 'real' 'really' 'role' 'say'
 'scene' 'see' 'seems' 'seen' 'show' 'something' 'star' 'still' 'story'
 'take' 'thing' 'think' 'though' 'time' 'two' 'want' 'watch' 'watching'
 'way' 'well' 'whole' 'woman' 'work' 'world' 'would' 'year']


In [45]:
# 2. Turn the first N rows of X_train into a dense array + DataFrame
N = 5
X_sample = X_train[:N].toarray()                # shape (N, num_features)
df_X_sample = pd.DataFrame(X_sample, columns=feature_names)

In [46]:
# 3. Attach the matching y_train labels
df_X_sample['sentiment'] = y_train[:N]

In [47]:
# 4. Display
print(f"Showing first {N} training samples (bag-of-words counts + label):")
display(df_X_sample)

Showing first 5 training samples (bag-of-words counts + label):


Unnamed: 0,acting,actor,actually,almost,also,another,around,back,bad,best,...,watching,way,well,whole,woman,work,world,would,year,sentiment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,1,0,3,1,1,0,1,0,1,0,...,1,0,1,0,1,0,0,0,0,1
3,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
import dagshub
mlflow.set_tracking_uri(Config.MLFLOW_URI)
dagshub.init(repo_owner='SurajBhar', repo_name='moviesentiment', mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=7a4ffb2f-e1fd-4606-b0eb-abd31a264241&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=1500b982910b420330bc1887c8d4a4ca1b7a8c3a888103ffddc3727896e41df6




2025-07-30 16:50:41,437 - INFO - Accessing as SurajBhar


2025-07-30 16:50:41,750 - INFO - Initialized MLflow to track repo "SurajBhar/moviesentiment"


2025-07-30 16:50:41,755 - INFO - Repository SurajBhar/moviesentiment initialized!


In [52]:
def train_and_log_model(X_train, y_train, X_test, y_test):  # noqa: C901
    """
    Train a Logistic Regression model, evaluate it, and log parameters/metrics with MLflow.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
    """
    # Configure MLflow
    mlflow.set_tracking_uri(Config.MLFLOW_URI)
    mlflow.set_experiment(Config.EXPERIMENT_NAME)

    with mlflow.start_run():
        start_time = time.time()

        # Log feature extraction parameters
        mlflow.log_param("vectorizer", "CountVectorizer")
        mlflow.log_param("max_features", Config.MAX_FEATURES)
        mlflow.log_param("test_size", Config.TEST_SIZE)

        # Initialize and train model
        model = LogisticRegression(max_iter=Config.MODEL_MAX_ITER)
        model.fit(X_train, y_train)
        mlflow.log_param("model_type", "LogisticRegression")

        # Predict and evaluate
        y_pred = model.predict(X_test)
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
        }
        for name, val in metrics.items():
            mlflow.log_metric(name, val)
        logging.info(f"Evaluation metrics: {metrics}")

        # Log model
        mlflow.sklearn.log_model(model, "model")

        elapsed = time.time() - start_time
        logging.info(f"Run completed in {elapsed:.2f}s")

In [53]:
# Step 4: Train and log model
train_and_log_model(X_train, y_train, X_test, y_test)

2025/07/30 16:51:20 INFO mlflow.tracking.fluent: Experiment with name 'Logistic Regression Baseline' does not exist. Creating a new experiment.
2025-07-30 16:51:23,473 - INFO - Evaluation metrics: {'accuracy': 0.72, 'precision': 0.7090909090909091, 'recall': 0.6724137931034483, 'f1_score': 0.6902654867256637}
2025-07-30 16:51:33,629 - INFO - Run completed in 11.89s
2025/07/30 16:51:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run silent-stork-949 at: https://dagshub.com/SurajBhar/moviesentiment.mlflow/#/experiments/0/runs/5dc250f01505466ca4489e3fa4b0d85c.
2025/07/30 16:51:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/SurajBhar/moviesentiment.mlflow/#/experiments/0.
