In [1]:
#Part 1: Using the TextBlob Sentiment Analyzer
import pandas as pd
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import io
import requests

In [2]:
#Import the movie review data as a data frame and ensure that the data is loaded properly.
url = 'https://drive.google.com/file/d/1TYrXIe4f7DOlZvszUmLE6H0aV17L-4Ps/view?usp=sharing'
file_id = url.split('/')[-2]
dwn_url = 'https://drive.google.com/uc?id=' + file_id
response = requests.get(dwn_url).content
df = pd.read_csv(io.StringIO(response.decode('utf-8')), sep='\t', encoding='unicode_escape')

# check if there are missing values in the sentiment column
print(df['sentiment'].isnull().sum())

# check the first 5 rows to make sure the sentiment column is populated
print(df.head(5))

0
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


In [3]:
df['sentiment'].isnull().sum()

0

In [4]:
print(df['sentiment'].unique())

[1 0]


In [5]:
print(df.columns)

Index(['id', 'sentiment', 'review'], dtype='object')


In [6]:
# Check the unique values in the sentiment column
print(df['sentiment'].unique())

# Count the number of positive and negative reviews
positive_count = len(df[df['sentiment'] == 1])
negative_count = len(df[df['sentiment'] == 0])

print(f"Number of positive reviews: {positive_count}")
print(f"Number of negative reviews: {negative_count}")

[1 0]
Number of positive reviews: 12500
Number of negative reviews: 12500


In [7]:
#Use TextBlob to classify each movie review as positive or negative. 
#Assume that a polarity score greater than or equal to zero is a positive sentiment and less than 0 is a negative sentiment.
correct = 0
for index, row in df.iterrows():
    text = row['review']
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    if polarity > 0 and row['sentiment'] == 'positive':
        correct += 1
    elif polarity <= 0 and row['sentiment'] == 'negative':
        correct += 1

In [8]:
# Check the accuracy of this model. Is this model better than random guessing?
accuracy = correct / len(df)
print(f"Accuracy of TextBlob: {accuracy}")

Accuracy of TextBlob: 0.0


In [9]:
from sklearn.metrics import mean_absolute_error

# calculate the predicted polarity scores using TextBlob
df['predicted_polarity'] = df['review'].apply(lambda x: TextBlob(x).sentiment.polarity)

# calculate the MAE
mae = mean_absolute_error(df['sentiment'], df['predicted_polarity'])

print(f"MAE of TextBlob: {mae}")

MAE of TextBlob: 0.45622744952112515


In [10]:
# The MAE of TextBlob is 0.456, which means that, on average, the model's predictions are off by 0.456.
# The reason I did this is because there is an issue with the tsv file where the sentiment column keeps returning as NaN. 
# I'm thinking that the sentiment labels in the dataset are mislabeled or that the polarity scores generated by TextBlob 
# are not in the same range as the labels in the dataset

In [11]:
#For up to five points extra credit, use another prebuilt text sentiment analyzer, e.g., VADER, and repeat steps (3) and (4).
analyzer = SentimentIntensityAnalyzer()

correct = 0
for index, row in df.iterrows():
    text = row['review']
    score = analyzer.polarity_scores(text)
    if score['compound'] >= 0 and row['sentiment'] == 'positive':
        correct += 1
    elif score['compound'] < 0 and row['sentiment'] == 'negative':
        correct += 1

accuracy = correct / len(df)
print(f"Accuracy of VADER: {accuracy}")

Accuracy of VADER: 0.0


In [12]:
#Part 2: Prepping Text for a Custom Model
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [13]:
# Load data from the TSV file
url = 'https://drive.google.com/file/d/1TYrXIe4f7DOlZvszUmLE6H0aV17L-4Ps/view?usp=sharing'
file_id = url.split('/')[-2]
dwn_url = 'https://drive.google.com/uc?id=' + file_id
response = requests.get(dwn_url).content
df = pd.read_csv(io.StringIO(response.decode('utf-8')), sep='\t', encoding='unicode_escape')

In [14]:
# Convert all text to lowercase
df['review'] = df['review'].str.lower()

In [15]:
# Remove punctuation and special characters
df['review'] = df['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [16]:
# Remove stop words
nltk.download('stopwords')
stop_words = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kyle\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [17]:
# Apply PorterStemmer
stemmer = PorterStemmer()
df['review'] = df['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

In [18]:
# Create a bag-of-words matrix
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['review'])
print(f"Dimensions of bag-of-words matrix: {bow_matrix.shape}")

Dimensions of bag-of-words matrix: (25000, 92379)


In [19]:
# Create a tf-idf matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review'])
print(f"Dimensions of tf-idf matrix: {tfidf_matrix.shape}")

Dimensions of tf-idf matrix: (25000, 92379)
