Movie Sentiment Analyzer by Timothy Gunn <br> July 5th, 2025 <br>
This notebook extracts the movie sentiment data provided in .tar format and uses that data to train a multilayer perceptron (MLP) that predicts whether a given movie review has an overall positive or negative sentiment. To train the model, users can download the data provided by Stanford at https://ai.stanford.edu/~amaas/data/sentiment/. At the end, an extra cell of code is provided so users can input their own review into the model and test it out themselves.


In [None]:
#Uploading the file (a .tar) that contains the data
#NOTE: this imports from google.colab, so the cell presumably only works inside Google Colab
from google.colab import files
#Whatever files.upload() returns is kept in `sentiments`; the extraction cell opens the archive by file name instead
sentiments = files.upload()


In [None]:
#Extracting the data from the .tar file
import tarfile

#Name of the uploaded archive, expected in the current working directory
filename = "aclImdb_v1.tar"

#Mode "r" opens the archive for reading with transparent compression detection
with tarfile.open(filename, "r") as tar:
    #Prefer the "data" extraction filter when this Python version provides it: it rejects
    #members that would be written outside the destination directory (absolute paths, "..", unsafe links)
    if hasattr(tarfile, "data_filter"):
        tar.extractall(filter="data")
    else:
        #Older Python without extraction filters: fall back to the unfiltered behaviour
        tar.extractall()

In [None]:
import os

#Loads data from a given path and creates a list of (text, label) pairs (label is 1 or 0 for positive and negative respectively)
def load_reviews(path, label, limit=None):
  """Read the .txt review files in a directory and pair each text with a label.

  Args:
    path: Directory containing the review files; only names ending in ".txt" are read.
    label: Value attached to every review loaded from this directory.
    limit: Optional non-negative maximum number of reviews to read. None (the default) reads all of them.

  Returns:
    A list of (text, label) tuples, ordered by file name.
  """
  #os.listdir returns names in arbitrary order, so sort them to make the result
  #(and any slice a caller takes of it) reproducible between runs and machines
  fnames = sorted(fname for fname in os.listdir(path) if fname.endswith(".txt"))

  #Apply the limit before opening anything so unneeded files are never read
  if limit is not None:
    fnames = fnames[:limit]

  data = []
  for fname in fnames:
    with open(os.path.join(path, fname), encoding="utf-8") as f:
      data.append((f.read(), label))
  return data

#Positive reviews (label 1): load them, then drop everything after the first 1000
train_pos = load_reviews("aclImdb/train/pos", 1)
del train_pos[1000:]

#Negative reviews (label 0): load them, then drop everything after the first 1000
train_neg = load_reviews("aclImdb/train/neg", 0)
del train_neg[1000:]

#One combined dataset: all kept positives followed by all kept negatives
data = [*train_pos, *train_neg]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#Unzips the (text, label) pairs into two parallel lists: review texts and their labels
texts, labels = map(list, zip(*data))

#Holds out 20% of the reviews for testing; stratifying on the labels keeps the
# positive/negative proportion the same in both sets, and the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

#TF-IDF vectorizer capped at 5000 features (words), ignoring English stop words (like a, and, but, the)
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)

#Learn the vocabulary and weights from the training text only, and turn that text into a dense feature array
X_train_vectorized = vectorizer.fit_transform(X_train).toarray()

#Reuse the already-fitted vectorizer on the test text (no refitting) to get its dense feature array
X_test_vectorized = vectorizer.transform(X_test).toarray()

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

#Convert vectorized training features into a Float Tensor
X_train_tensor = torch.tensor(X_train_vectorized, dtype=torch.float32)

#Convert the training labels into a Float Tensor and add a dimension (column vector, so they line up with the model's single output per review for BCE)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32).unsqueeze(1)

#Perform the same operations on the test data
X_test_tensor = torch.tensor(X_test_vectorized, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32).unsqueeze(1)

#Combine the features and labels into a PyTorch dataset object
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

#Creates data loaders for both sets. Only the training loader reshuffles each epoch;
#the test loader keeps a fixed order because shuffling does not help evaluation
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:
import torch.nn as nn

#Multilayer perceptron used as the binary sentiment classifier
class MLP(nn.Module):
  """Three-layer fully connected network that outputs one value in [0, 1] per input row.

  Args:
    input_size: Number of input features per sample.
  """

  def __init__(self, input_size):
    super().__init__()
    #Layer stack: two hidden layers (128 and 64 units) with ReLU for non-linearity,
    #then a single output unit squashed by Sigmoid for binary classification
    layers = [
        nn.Linear(input_size, 128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64, 1),
        nn.Sigmoid(),
    ]
    self.model = nn.Sequential(*layers)

  #Forward pass: run the input through the layer stack in order
  def forward(self, x):
    return self.model(x)

#Creates an instance of the model, sized to the actual number of TF-IDF features:
#max_features=5000 is only an upper limit, so the width is read from the feature tensor rather than hard-coded
model = MLP(input_size=X_train_tensor.shape[1])


In [None]:
import torch.optim as optim

#Define the loss function and optimizer to be used during training
#(BCELoss expects probabilities, which the model's final Sigmoid provides)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = .005)

#Make sure the model is in training mode, in case this cell is re-run after a cell that called model.eval()
model.train()

#Train the model using 5 epochs of data
for epoch in range(5):
  #Sum of the per-batch losses for this epoch
  total_loss = 0
  #Iterate through each mini batch in the data loader
  for batch_x, batch_y in train_loader:
    #Clear previous gradients
    optimizer.zero_grad()
    #Compute predicted outputs
    outputs = model(batch_x)
    #Calculate loss
    loss = criterion(outputs, batch_y)
    #Backpropagation
    loss.backward()
    #Update the weights with the optimizer (Adam)
    optimizer.step()
    #Update loss
    total_loss += loss.item()
  print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")



In [None]:
#Set the model to evaluation mode
model.eval()

#Disable gradient tracking to save memory and speed up computation
with torch.no_grad():
  #Get predicted probabilities on the test set
  predictions = model(X_test_tensor)
  #Convert probabilities to binary class predictions (threshold at 0.5)
  predicted_classes = (predictions >= .5).float()
  #Compute model accuracy by comparing predicted labels to true labels and counting correct predictions
  accuracy = (predicted_classes.eq(Y_test_tensor)).sum().item() / len(Y_test_tensor)
  #Print with four decimal places (the previous format spec only added a leading space)
  print(f"Accuracy: {accuracy:.4f}")

In [2]:
def predicted_review_sentiment(review):
  """Print the model's predicted sentiment for a single review.

  Uses the module-level `vectorizer` and `model`, so the cells that create and
  train them must have been run first.

  Args:
    review: The review text to classify.

  Returns:
    None. The review and its predicted sentiment ('Positive' or 'Negative') are printed.
  """
  #Set model to evaluation mode
  model.eval()
  #Disable gradient tracking
  with torch.no_grad():
    #Vectorize the input text (a one-element list gives a single-row feature array)
    vectorized = vectorizer.transform([review]).toarray()
    #Convert to Float Tensor
    input_tensor = torch.tensor(vectorized, dtype=torch.float32)
    #Get model prediction
    prediction = model(input_tensor)
    #Threshold this review's own probability at 0.5 (not the test-set `predictions` tensor)
    is_positive = prediction.item() >= .5

  sentiment = 'Positive' if is_positive else 'Negative'

  print(f'Review: {review}\n')
  print(f'Predicted Sentiment: {sentiment}\n')

#Prompt the user for a review on standard input, then print the model's predicted sentiment for it
user_input = input('Enter a movie review to be input into the model: ')
predicted_review_sentiment(user_input)
