## IMDb Sentiment Analysis

In [1]:
# ONE-TIME SETUP SCRIPT
# Download the dataset archive into the local dataset/ folder, then uncomment the lines below to extract it

# import tarfile
# with tarfile.open('dataset/aclImdb_v1.tar.gz', 'r:gz') as tar:
#     tar.extractall()

In [4]:
# Preprocess dataset into Pandas DataFrame
import pyprind
import pandas as pd
import os
import sys

# Root directory of the extracted IMDb dataset (relative to the notebook)
basepath = 'dataset/aclImdb'

# Label mapping: positive review = 1, negative review = 0
labels = {'pos': 1, 'neg': 0}

# Progress bar for 50,000 total reviews
pbar = pyprind.ProgBar(50000, stream=sys.stdout)

# Rows are collected in a plain list and converted to a DataFrame once at the
# end. This replaces the earlier row-by-row `df = df.append(...)` version,
# which relied on the deprecated DataFrame.append.
data = []  # (review text, sentiment label) tuples

# Walk both dataset splits and, within each, the 'pos' and 'neg' subfolders
for split in ('test', 'train'):
    for sentiment_dir in ('pos', 'neg'):
        path = os.path.join(basepath, split, sentiment_dir)

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order deterministic from run to run
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()  # Full review text

            data.append((txt, labels[sentiment_dir]))
            pbar.update()

# Build the DataFrame in one step; the column names are set here directly
df = pd.DataFrame(data, columns=['review', 'sentiment'])

In [5]:
import numpy as np
# Shuffle the rows (with a fixed seed, so the order is reproducible) ahead of
# splitting the dataset into train and test sets
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# For convenience, save the shuffled reviews to CSV
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [6]:
import pandas as pd
# Reload the movie reviews from CSV and make sure the formatting is correct:
# columns that were saved as "0" and "1" are renamed to "review" and
# "sentiment" (the rename has no effect when the names are already right)
df = (
    pd.read_csv('movie_data.csv', encoding='utf-8')
    .rename(columns={"0": "review", "1": "sentiment"})
)

# Preview the first 3 rows
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [7]:
# Sanity check: the DataFrame should contain all 50,000 reviews.
# The cell displays the shape as (rows, columns).
df.shape

(50000, 2)

### Bag of Words Model

In [8]:
# Bag-of-words model: scikit-learn's CountVectorizer takes an array of text
# data and constructs a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Three small example documents
text = [
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet',
]

# Build the vectorizer, then learn the vocabulary and count the tokens of the
# example documents in a single step
count = CountVectorizer()
bag_of_words = count.fit_transform(text)

In [9]:
# Vocabulary learned by the vectorizer: each token mapped to its column index
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [None]:
# Dense view of the count matrix: one row per document, one column per token
print(bag_of_words.toarray()) # Also known as a 1-gram (unigram) model: each word is one feature

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


### Term Frequency-Inverse Document Frequency (tf-idf)

In [13]:
# tf-idf: scikit-learn's TfidfTransformer is used together with
# CountVectorizer to turn raw token counts into a tf-idf representation
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

# Print NumPy arrays with 2 decimal places
np.set_printoptions(precision=2)

# TF-IDF transformer settings:
#   use_idf=True    -> use inverse document frequency
#   norm='l2'       -> normalize each vector to unit length
#   smooth_idf=True -> add 1 to document frequencies to avoid division by zero
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

# Count the tokens of `text` with `count` (the CountVectorizer), convert the
# counts to tf-idf weights, and print the result as a dense array
term_counts = count.fit_transform(text)
tfidf_weights = tfidf.fit_transform(term_counts)
print(tfidf_weights.toarray())

[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


### Cleaning text data