## IMDb Sentiment Analysis

In [1]:
# ONE TIME SETUP SCRIPT
# Obtain the dataset, unzip from dataset folder in local

# import tarfile
# with tarfile.open('dataset/aclImdb_v1.tar.gz', 'r:gz') as tar:
#     tar.extractall()

In [2]:
# Preprocess dataset into Pandas DataFrame
import pyprind
import pandas as pd
import os
import sys

basepath = 'dataset/aclImdb'

# labels = {'pos': 1, 'neg': 0} # Binary classification
# pbar = pyprind.ProgBar(50000, stream=sys.stdout) # Total number of reviews
# df = pd.DataFrame()
# for s in ('train', 'test'): # Iterate through train and test sets
#     for l in ('pos', 'neg'): # Iterate through train and test sets, positive and negative labels
#         path = os.path.join(basepath, s, l) # Path to the reviews
#         for file in sorted(os.listdir(path)): # Iterate through files in the path
#             with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
#                 txt = infile.read() # Read the review text
#             df = df.append([[txt, labels[l]]], ignore_index=True) # Append to DataFrame
#             pbar.update()
# df.columns = ['review', 'sentiment']

# Label mapping: positive review = 1, negative review = 0
labels = {'pos': 1, 'neg': 0}

# Progress bar for 50,000 total reviews
pbar = pyprind.ProgBar(50000, stream=sys.stdout)

data = []  # List to hold tuples of (review text, sentiment label)

# Loop through both 'test' and 'train' datasets
for s in ('test', 'train'):
    # Loop through 'pos' and 'neg' subfolders
    for l in ('pos', 'neg'):
        # Path to the specific sentiment folder
        path = os.path.join(basepath, s, l)
        
        # Loop through all review files in sorted order
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()  # Read review text
            
            # Append the review text and label to DataFrame
            # df = df.append([[txt, labels[l]]], ignore_index=True) # Depreciated append
            # Instead of appending, we collect data in a list for efficiency
            data.append((txt, labels[l]))
            
            # Update progress bar
            pbar.update()

# Create DataFrame once at the end
df = pd.DataFrame(data, columns=['review', 'sentiment'])

# Name the columns: review text and sentiment label
df.columns = ['review', 'sentiment']

In [3]:
import numpy as np
# Shuffle the DataFrame rows for randomness to split the dataset into train and test sets
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8') # VFor conveenience, save to CSV

In [4]:
import pandas as pd
# Make sure formatting is correct


# Load the movie review dataset from CSV
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Rename columns in case they were saved as "0" and "1" instead of proper names
df = df.rename(columns={"0": "review", "1": "sentiment"})

# Display the first 3 rows
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [5]:
# check that dataframe contains all 50000 reviews:
df.shape

(50000, 2)

### Bag of Words Model

In [6]:
# To construct a bag-of-words model, can use count vectorizer from sklearn, which takes array of text data and constructs a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer instance
count = CountVectorizer()
# example text data
text = ['The sun is shining', 'The weather is sweet', 'The sun is shining and the weather is sweet']
# Fit and transform the text data to create a bag-of-words model
bag_of_words = count.fit_transform(text)

In [7]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [8]:
print(bag_of_words.toarray()) # This is also known as 1-gram model, where each word is a feature

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


####  term frequency-inverse document frequency (tf-idf)

In [9]:
# scikit-learn also provides a tf-idf transformer that can be used in conjunction with CountVectorizer to create a tf-idf representation of the text data
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

# Initialize the TF-IDF transformer
# use_idf=True → use inverse document frequency
# norm='l2' → normalize each vector to unit length
# smooth_idf=True → add 1 to document frequencies to avoid division by zero
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

# Set NumPy print options to show only 2 decimal places
np.set_printoptions(precision=2)

# Transform 'docs' into term-frequency matrix using 'count' (CountVectorizer),
# then convert it to TF-IDF representation and print as an array
print(tfidf.fit_transform(count.fit_transform(text)).toarray())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


#### Cleaning text data

In [10]:
# Should first clean the text by stripping it of unwanted characters

# As you can see, the text contains HTML markup and punctuation that should be removed for better analysis.
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [11]:
# Use python's regular expression library to remove HTML markup and punctuation
import re

def preprocessor(text):
    # Remove HTML tags
    text = re.sub('<[^>]*>', '', text)
    
    # Extract emoticons like :) ;-) :-D etc.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    
    # Remove non-word characters, lowercase everything
    # Then append emoticons at the end without hyphens
    text = (
        re.sub(r'[\W]+', ' ', text.lower()) +
        ' '.join(emoticons).replace('-', '')
    )
    
    return text

In [12]:
# Test the prprocessor works
preprocessor(df.loc[0, 'review'][-50:])
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [13]:
# Apply preprocessor to all movie reviews in dataframe
df['review'] = df['review'].apply(preprocessor)

In [14]:
# Splits text simply into words
def tokenizer(text):
    return text.split()

In [15]:
# Another useful technique is word stemming, which is the process of transforming words to their root form
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [16]:
# Another useful trick is stop word removal, which is the process of removing common words that do not contribute to the meaning of the text, such as "the", "is", "in", etc.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# can apply english stop word set as follows:
from nltk.corpus import stopwords

stop = stopwords.words('english')  # List of common English stop words

[w for w in tokenizer_porter('a runner likes running and runs a lot')
 if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

#### Training logistic regression model for document classification

In [18]:
# First divide dataframe of cleaned text into 25000 training and 25,000 test reviews
# First 25,000 reviews for training
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values

# Remaining reviews for testing
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [None]:
# Next use GridSearchCV to find the best hyperparameters for the logistic regression model using 5-fold stratified cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TF-IDF vectorizer
# - strip_accents=None: keeps accents
# - lowercase=False: does not force lowercase (we already handle in preprocessor if needed)
# - preprocessor=None: we use our custom preprocessor/tokenizer instead
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Define a smaller parameter grid to search over
# We test different configurations for the TfidfVectorizer + Logistic Regression
small_param_grid = [
    {
        # Unigrams only
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [None],  # No stop word removal
        'vect__tokenizer': [tokenizer, tokenizer_porter],  # try raw split and stemming
        'clf__penalty': ['l2'],  # Ridge penalty
        'clf__C': [1.0, 10.0]    # Regularization strengths
    },
    {
        # Same unigram setup, but testing stop word removal and TF vs raw counts
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],  # Try removing stopwords or not
        'vect__tokenizer': [tokenizer],    # only simple split
        'vect__use_idf': [False],          # disable IDF weighting
        'vect__norm': [None],              # disable length normalization
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0]
    },
]

# Build a pipeline: TF-IDF vectorizer → Logistic Regression classifier
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(solver='liblinear'))  # liblinear works well for bigger datasets compared to lbfgs
])

# Grid search with 5-fold cross-validation
gs_lr_tfidf = GridSearchCV(lr_tfidf,
                           small_param_grid,
                           scoring='accuracy',  # evaluate by accuracy
                           cv=5,                # 5-fold CV
                           verbose=2,           # show progress
                           n_jobs=1)            # run in serial (could set to -1 for parallel)

# Train grid search on training set
gs_lr_tfidf.fit(X_train, y_train)



Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.6s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.6s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.6s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.6s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.7s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  48.1s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  47.6s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  48.0s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  47.7s




[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  47.5s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.8s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.8s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.8s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.8s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x000001308336F4C0>; total time=   1.8s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  47.9s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  48.2s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  49.4s




[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer_porter at 0x000001308336F600>; total time=  50.3s




In [None]:
# Print best parameter set
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')
# Also print best VC and Test accuracy on test dataset
print(f'Best cross-validation accuracy: {gs_lr_tfidf.best_score_:.3f}')
print(f'Test accuracy: {gs_lr_tfidf.score(X_test, y_test):.3f}')  # Evaluate on test set


Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x00000149D3D31A80>}
Best cross-validation accuracy: 0.897
Test accuracy: 0.899


#### Working on bigger data - online algos and out-of-core learning

In [None]:
# previous section's construction of feature vectors for 50000 review dataset takes a while, so we can use out-of-core learning, which allows us to fit the classifier incrementally on chunks of data
# similar to stochastic gradient descent, where we can fit the model on small batches of data instead of the entire dataset at once, we will be using partial_fit method of the classifier to stream documents from local drive and train model incrementally
# First, define tokenizer function
import numpy as np
import re
from nltk.corpus import stopwords

# Load English stop words
stop = stopwords.words('english')

def tokenizer(text):
    """
    Custom tokenizer that:
    1. Removes HTML tags
    2. Extracts emoticons
    3. Converts to lowercase and removes non-word characters
    4. Appends emoticons back to the text
    5. Splits text into tokens
    6. Removes stopwords
    """
    # Remove HTML tags
    text = re.sub('<[^>]*>', '', text)

    # Find emoticons like :-) or :D
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # Lowercase and remove non-word characters
    text = re.sub(r'[\W]+', ' ', text.lower())

    # Append emoticons to the end (without '-')
    text = text + ' '.join(emoticons).replace('-', '')

    # Tokenize and remove stopwords
    tokenized = [w for w in text.split() if w not in stop]

    return tokenized

In [None]:
# Next define a generator function that reads in and returns one document at a time
def stream_docs(path):
    """
    Lazily stream documents from a CSV file.

    Args:
        path (str): Path to the CSV file (expects 'review,sentiment' format).

    Yields:
        tuple: (text, label) where text is the review and label is the sentiment (int).
    """
    with open(path, 'r', encoding='utf-8') as csv:
        # Skip header row
        next(csv)
        for line in csv:
            # Extract text (all but last 3 chars: ",0" or ",1") and label (last char)
            text, label = line[:-3], int(line[-2])
            yield text, label

In [None]:
# Verify streamdocs function works
next(stream_docs(path='movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [None]:
# Now define function get_minibatch taht takes a document stream from stream_docs and returns particular number of documments specified by size parameter
def get_minibatch(doc_stream, size):
    """
    Retrieve a minibatch of documents from a stream.

    Args:
        doc_stream (iterator): Stream of (text, label) tuples.
        size (int): Number of documents to fetch.

    Returns:
        tuple: (docs, y)
            - docs: list of review texts
            - y: list of labels
        If stream ends, returns (None, None).
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)  # Get next document
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None  # End of stream
    return docs, y

In [None]:
# Cannot use CountVectorizer for out-of-core learning as it requires holding the complete vocab in memory. Can instead use HashingVectorizer, which uses a hash function to map tokens to feature indices, allowing it to handle large vocabularies without storing them in memory
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# HashingVectorizer for transforming text into feature vectors
vect = HashingVectorizer(
    decode_error='ignore',   # Ignore decoding errors
    n_features=2**21,        # Large number of features for hashing
    preprocessor=None,       # Use default preprocessing
    tokenizer=tokenizer      # Custom tokenizer defined earlier
)

# SGDClassifier with logistic regression loss
clf = SGDClassifier(loss='log_loss', random_state=1) # use log_loss instead of log for newer versions of scikit-learn

# Create a document stream generator from the CSV dataset
doc_stream = stream_docs(path='movie_data.csv')

In [None]:
# Now can start the out-of-core learning using following code:
import pyprind
import numpy as np

# Progress bar for 45 iterations (45 minibatches)
pbar = pyprind.ProgBar(45)

# Binary classification labels
classes = np.array([0, 1])

# Train in minibatches of 1000 documents
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    
    # Break loop if no more data
    if not X_train:
        break
    
    # Transform text into feature vectors
    X_train = vect.transform(X_train)
    
    # Incrementally train classifier
    clf.partial_fit(X_train, y_train, classes=classes)
    
    # Update progress bar
    pbar.update() # using PyPrind to show progrees of learning algorithm

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:16


In [None]:
# Get a test minibatch of 5000 documents
X_test, y_test = get_minibatch(doc_stream, size=5000)

# Transform test data using the same HashingVectorizer
X_test = vect.transform(X_test)

# Evaluate accuracy of the classifier
print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

Accuracy: 0.868


In [None]:
clf = clf.partial_fit(X_test, y_test) # use the last 5000 documents to update the model further
print(f'Updated accuracy: {clf.score(X_test, y_test):.3f}')

Updated accuracy: 0.894


## LatentDirichletAllocation (LDA) with SciKit-Learn

In [None]:
# In following code, will restrict analysis to 10 topics, which is a good number for most datasets, but can be adjusted
# First, load dataset into a DataFrame
import pandas as pd

# Load the movie review dataset from CSV
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# On some systems, pandas may default to numeric column names; rename them for clarity
df = df.rename(columns={"0": "review", "1": "sentiment"})

# Optional: check the first few rows to verify
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [None]:
# Use count vectorizer to create a bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english',  # Remove common English stop words
                        max_df=.1, # Ignore terms that appear in more than 10% of the documents # rationale is that these might be common words that do not contribute to the meaning of the text
                        max_features=5000)     # Limit to 5000 most frequent words # Limit the dimensionality of dataset to improve inference performed by LDA # This and max_df are arbritrarily chosen hyperparameters
X = count.fit_transform(df['review'].values)  # Transform reviews into feature vectors


In [None]:
# Now use a LDA estimator to fit the model to the data
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10,  # Number of topics
                                 learning_method='batch',  # Batch learning method (can also use 'online' for large datasets)
                                 random_state=123) 
# FYI: Sciki-Learn's LDA implementation uses expectation-maximization (EM) algorithm to estimate the parameters of the model, which is a common approach for LDA

In [None]:
# After fitting LDA, now have access to components_ attribute, which contains the topic-word distributions, meaning it stores matrix containing word importance for each of 10 topics in increasing order
lda.components_.shape

AttributeError: 'LatentDirichletAllocation' object has no attribute 'components_'

In [None]:
# to analyze results, print the 5 most important words for each topic
# Number of top words to display per topic
n_top_words = 5

# Get feature (word) names from the CountVectorizer
feature_names = count.get_feature_names_out()

# Loop through each topic in the LDA model
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {(topic_idx + 1)}:')
    # argsort() returns indices that would sort the topic's word importance values
    # [:-n_top_words - 1:-1] slices the top n words in descending order, as importance values are sorted in ascending order
    top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] 
    print(' '.join(top_words))  # Join top words into a string and print

In [None]:
# To confirm that categories make sense, plot three movies from horror movie category# Get indices of movies with the highest probability for topic 5 (assumed "horror")
horror = X_topics[:, 5].argsort()[::-1]  # Descending order

# Print the top 3 "horror" movie reviews
for iter_idx, movie_idx in enumerate(horror[:3]):
    print(f'\nHorror movie #{(iter_idx + 1)}:')
    # Print first 300 characters of the review as a preview
    print(df['review'][movie_idx][:300], '...')