# Sentiment Analysis

### Imports

In [1]:
import re
import emoji
import spacy
import string
import contractions
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, opinion_lexicon

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec
from gensim.models import FastText

### Loading Dataset
Dataset: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [2]:
df = pd.read_csv('dataset/IMDB_Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Removing for null and Deuplicated values

In [3]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [4]:
df = df.drop_duplicates()

## Lable Encoding target

In [5]:
df['sentiment'].value_counts()

sentiment
positive    24884
negative    24698
Name: count, dtype: int64

In [6]:
# LE = lable encoded
lable_map = {
    'positive' : 1,
    'negative' : 0
}
df['sentiment_LE'] = df['sentiment'].map(lable_map)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_LE'] = df['sentiment'].map(lable_map)


Unnamed: 0,review,sentiment,sentiment_LE
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [7]:
df['sentiment_LE'].value_counts()

sentiment_LE
1    24884
0    24698
Name: count, dtype: int64

## Removig
1. HTML Tags
2. Expand contractions
3. Emoji handling
4. Spelling correction
5. Lowercase + remove extra spaces
6. Tokenization + lemmatization
7. Negation handling

In [8]:
n = 2
df['review'][n]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [9]:
# Load spaCy (disable unnecessary pipeline components for speed)
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
spell = Speller()

# Default custom stopwords
custom_stop_words = {
    "movie", "film"
}

# Default negations
negations = {
    "aint", "arent", "cannot", "cant", "couldnt", "darent",
    "didnt", "doesnt", "ain't", "aren't", "can't", "couldn't",
    "daren't", "didn't", "doesn't", "dont", "hadnt", "hasnt",
    "havent", "isnt", "mightnt", "mustnt", "neither", "don't", 
    "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", 
    "neednt", "needn't", "never", "none", "nope", "nor", "not",
    "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "uhuh",
    "wasnt", "werent", "oughtn't", "shan't", "shouldn't", "uh-uh",
    "wasn't", "weren't", "without", "wont", "wouldnt", "won't",
    "wouldn't", "rarely", "seldom", "despite"
}


# Helper functions

def clean_html(text):
    return BeautifulSoup(text, "html.parser").get_text()

def expand_contractions(text):
    return contractions.fix(text)

def handle_emoji(text):
    return emoji.demojize(text)

def correct_spelling_fast(text):
    return " ".join([str(spell(w) or w) for w in text.split()])

def tokenize(text, lemmatize=True, remove_stopwords=True, remove_punc=True,
             custom_stop_words=None, remove_custom_stop_words=True):
    if custom_stop_words is None:
        custom_stop_words = set()
    tokens = []
    doc = nlp(text)
    for token in doc:
        tok = token.text
        if lemmatize:
            tok = token.lemma_.lower()
        else:
            tok = tok.lower()
        if remove_punc and token.is_punct:
            continue
        if remove_stopwords and token.is_stop:
            continue
        if remove_custom_stop_words and tok in custom_stop_words:
            continue
        tokens.append(tok)
    return tokens


def handle_negations(tokens, combine=True):
    """
    If combine=True, combines negation with next token: 'not good' -> 'not_good'
    If combine=False, keeps tokens separate: ['not', 'good']
    """
    if not combine:
        return tokens
    
    result = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if tok in negations and i < len(tokens) - 1:
            result.append(tok + "_" + tokens[i+1])
            i += 2
        else:
            result.append(tok)
            i += 1
    return result


# Main customizable preprocessing function

def preprocess_text(
    text,
    remove_html=True,
    expand_contr=True,
    handle_emoji_flag=True,
    spelling_correction=False,
    lowercase=True,
    lemmatize=True,
    remove_stopwords=True,
    remove_punc=True,
    custom_stop_words=None,
    remove_custom_stop_words=True,
    negation_combine=True
):
    # 1. HTML
    if remove_html:
        text = clean_html(text)
    
    # 2. Expand contractions
    if expand_contr:
        text = expand_contractions(text)
    
    # 3. Emoji handling
    if handle_emoji_flag:
        text = handle_emoji(text)
    
    # 4. Spelling correction
    if spelling_correction:
        text = correct_spelling_fast(text)
    
    # 5. Lowercase + remove extra spaces
    if lowercase:
        text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 6. Tokenization + lemmatization
    tokens = tokenize(text, lemmatize=lemmatize, remove_stopwords=remove_stopwords,
                      remove_punc=remove_punc, custom_stop_words=custom_stop_words, remove_custom_stop_words=remove_custom_stop_words)
    
    # 7. Negation handling
    tokens = handle_negations(tokens, combine=negation_combine)
    
    return tokens


# Example usage

sample_text = "<p>I didn't like this movie at all! 😡!!! The moovie was abolutly amaziing but the actrs were dissapointng.</p>"

tokens = preprocess_text(
    sample_text,
    remove_html=True,
    expand_contr=True,
    handle_emoji_flag=True,
    spelling_correction=True,
    lowercase=True,
    lemmatize=True,
    remove_stopwords=True,
    remove_punc=True,
    custom_stop_words=custom_stop_words,
    remove_custom_stop_words=True,
    negation_combine=True
)

tokens

['like', 'enraged_face', 'absolutely', 'amazing', 'actor', 'dissapointng']

In [13]:
%%time
preprocess_text(
    df['review'][n],
    remove_html=True,
    expand_contr=False,
    handle_emoji_flag=True,
    spelling_correction=False,
    lowercase=True,
    lemmatize=True,
    remove_stopwords=True,
    remove_punc=True,
    custom_stop_words=custom_stop_words,
    remove_custom_stop_words=True,
    negation_combine=False
)

CPU times: total: 31.2 ms
Wall time: 39.5 ms


['think',
 'wonderful',
 'way',
 'spend',
 'time',
 'hot',
 'summer',
 'weekend',
 'sit',
 'air',
 'condition',
 'theater',
 'watch',
 'light',
 'hearted',
 'comedy',
 'plot',
 'simplistic',
 'dialogue',
 'witty',
 'character',
 'likable',
 'bread',
 'suspect',
 'serial',
 'killer',
 'disappoint',
 'realize',
 'match',
 'point',
 '2',
 'risk',
 'addiction',
 'think',
 'proof',
 'woody',
 'allen',
 'fully',
 'control',
 'style',
 'grow',
 'love.this',
 'laugh',
 'woody',
 'comedy',
 'year',
 'dare',
 'decade',
 'impress',
 'scarlet',
 'johanson',
 'manage',
 'tone',
 'sexy',
 'image',
 'jump',
 'right',
 'average',
 'spirited',
 'young',
 'woman.this',
 'crown',
 'jewel',
 'career',
 'witty',
 'devil',
 'wear',
 'prada',
 'interesting',
 'superman',
 'great',
 'comedy',
 'friend']

In [None]:
df['processed_review'] = df['review'].apply(lambda review: " ".join(preprocess_text(
    review,
    remove_html=True,
    expand_contr=False,
    handle_emoji_flag=True,
    spelling_correction=False,
    lowercase=True,
    lemmatize=True,
    remove_stopwords=True,
    remove_punc=True,
    custom_stop_words=custom_stop_words,
    remove_custom_stop_words=True,
    negation_combine=False
)))

In [None]:
df['processed_review'][2]

# Word clouds

In [None]:
all_raw_text = ' '.join(df['review'].astype(str))
all_cleaned_text = ' '.join(df['processed_review'].astype(str))

all_pos_raw_text = ' '.join(df.loc[df['sentiment_LE'] == 1, 'review'].astype(str))
all_neg_raw_text = ' '.join(df.loc[df['sentiment_LE'] == 0, 'review'].astype(str))

all_pos_cleaned_text = ' '.join(df.loc[df['sentiment_LE'] == 1, 'processed_review'].astype(str))
all_neg_cleaned_text = ' '.join(df.loc[df['sentiment_LE'] == 0, 'processed_review'].astype(str))

### Raw reviwes

In [None]:
## raw text
wordcloud = WordCloud(
    width=1000, height=500,
    background_color='black',
    max_words=200
).generate(all_raw_text)

plt.figure(figsize=(12,6), facecolor='black')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
del all_raw_text

In [None]:
## all_pos_raw_text text
wordcloud = WordCloud(
    width=1000, height=500,
    background_color='black',
    max_words=200
).generate(all_pos_raw_text)

plt.figure(figsize=(12,6), facecolor='black')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
del all_pos_raw_text

In [None]:
## all_neg_raw_text text
wordcloud = WordCloud(
    width=1000, height=500,
    background_color='black',
    max_words=200
).generate(all_neg_raw_text)

plt.figure(figsize=(12,6), facecolor='black')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
del all_neg_raw_text

### Preped reviwes

In [None]:
## cleaned text
wordcloud = WordCloud(
    width=1000, height=500,
    background_color="black",
    max_words=200
).generate(all_cleaned_text)

plt.figure(figsize=(12,6), facecolor='black')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
del all_cleaned_text

In [None]:
## all_pos_cleaned_text text
wordcloud = WordCloud(
    width=1000, height=500,
    background_color='black',
    max_words=200
).generate(all_pos_cleaned_text)

plt.figure(figsize=(12,6), facecolor='black')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
del all_pos_cleaned_text

In [None]:
## all_neg_cleaned_text text
wordcloud = WordCloud(
    width=1000, height=500,
    background_color='black',
    max_words=200
).generate(all_neg_cleaned_text)

plt.figure(figsize=(12,6), facecolor='black')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
del all_neg_cleaned_text

## Countig number of words

In [None]:
df['processed_review_word_len'] = df['processed_review'].apply(lambda x: len(x.split()))
df['processed_review_word_len']

In [None]:
sns.histplot(df['processed_review_word_len'])
plt.show()

## counting positive and negative words

In [None]:
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())
def count_positive_words(sentence):
    doc = nlp(sentence)
    pos_count = 0
    for token in doc:
        word = token.text.lower()
        if word in positive_words:
            pos_count += 1
    return pos_count

def count_negative_words(sentence):
    doc = nlp(sentence)
    neg_count = 0
    for token in doc:
        word = token.text.lower()
        if word in negative_words:
            neg_count += 1
    return neg_count

In [None]:
n = 1
df['processed_review'][n], df['sentiment'][n]

In [None]:
pos = count_positive_words(df['processed_review'][n])
neg = count_negative_words(df['processed_review'][n])
pos, neg

In [None]:
pos_counts = df['processed_review'].apply(count_positive_words)
pos_counts

In [None]:
neg_counts = df['processed_review'].apply(count_negative_words)
neg_counts

In [None]:
pos_neg_ratio = pos_counts / neg_counts
pos_neg_ratio

In [None]:
df['pos_neg_ratio'] = pos_neg_ratio
del pos_neg_ratio
df['pos_counts'] = pos_counts
del pos_counts
df['neg_counts'] = neg_counts
del neg_counts

In [None]:
df[['sentiment', 'pos_neg_ratio', 'pos_counts', 'neg_counts']]

In [None]:
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.show()

In [None]:
df.to_csv('dataset/preped/preped_df.csv', index=False)

## TfidfVectorizer

In [None]:
tfid = TfidfVectorizer(
    max_features=20000, 
    min_df=10,
    ngram_range=(1,3),
    sublinear_tf=True
)
tfid_vecs = tfid.fit_transform(df['processed_review'])

In [None]:
tfid_vecs.get_shape()

In [None]:
tfid.get_feature_names_out()

In [None]:
tfid_df = pd.DataFrame(tfid_vecs.toarray(), columns=tfid.get_feature_names_out())
tfid_df['sentiment_LE'] = df['sentiment_LE']
tfid_df['pos_neg_ratio'] = df['pos_neg_ratio']
tfid_df['pos_counts'] = df['pos_counts']
tfid_df['neg_counts'] = df['neg_counts']
tfid_df

In [None]:
tfid_df.to_csv('dataset/preped/tfid_df_max_feature=20000_min_df=10_ngram_range=(1,3)_sublinear_tf=True.csv', dtype='float32', index=False)
del tfid_df

In [None]:
del tfid, tfid_vecs

In [None]:
# number of rows per chunk
chunksize = 10000
n = 20  

# accumulators
pos_sum = None
neg_sum = None

use_cols = None  # to fix column order after first chunk

for chunk in pd.read_csv('dataset/preped/tfid_df.csv', dtype='float32', chunksize=chunksize):
    if use_cols is None:
        use_cols = [c for c in chunk.columns if c not in ['sentiment_LE', 'pos_neg_ratio', 'pos_counts', 'neg_counts']]
        pos_sum = pd.Series(0, index=use_cols, dtype='float32')
        neg_sum = pd.Series(0, index=use_cols, dtype='float32')

    # positive rows
    pos_rows = chunk[chunk['sentiment_LE'] == 1][use_cols]
    pos_sum = pos_sum.add(pos_rows.sum(axis=0), fill_value=0)

    # negative rows
    neg_rows = chunk[chunk['sentiment_LE'] == 0][use_cols]
    neg_sum = neg_sum.add(neg_rows.sum(axis=0), fill_value=0)

# take top-N
pos_word_freq = pos_sum.sort_values(ascending=False).head(n)
neg_word_freq = neg_sum.sort_values(ascending=False).head(n)

In [None]:
# Plot Positive words
plt.figure(figsize=(10, 6))
sns.barplot(x=pos_word_freq.values, y=pos_word_freq.index, hue=pos_word_freq.values)
plt.title("Top Positive Words by TF-IDF")
plt.xlabel("TF-IDF Weight")
plt.ylabel("Word")
plt.show()

In [None]:
del pos_word_freq

In [None]:
# Plot Negative words
plt.figure(figsize=(10, 6))
sns.barplot(x=neg_word_freq.values, y=neg_word_freq.index, hue=neg_word_freq.values)
plt.title("Top Negative Words by TF-IDF")
plt.xlabel("TF-IDF Weight")
plt.ylabel("Word")
plt.show()

In [None]:
del neg_word_freq

## Word2vec

In [None]:
df = pd.read_csv('dataset/preped/preped_df.csv')

In [None]:
sentences = [sentence.split() for sentence in df['processed_review']]
sentences[0]

In [None]:
w2v = Word2Vec(sentences=sentences, vector_size=500, window=12, min_count=5, sg=1, epochs=20, workers=4)

In [None]:
w2v.save('models/w2v/word2vac_vector_size=500_window=12_min_count=5_sg=1_epochs=20_workers=4.model')

In [None]:
w2v.wv.most_similar('smart')

In [None]:
w2v.wv.most_similar("good")

In [None]:
w2v.wv.most_similar(positive=["king", "female"], negative=["male"])

In [None]:
w2v.wv.similarity("king", "female"), w2v.wv.similarity("queen", "female") 

In [None]:
r = w2v.wv['king'] - w2v.wv['male'] + w2v.wv['female']
w2v.wv.similar_by_vector(r)

In [None]:
rng = np.random.default_rng(42)
pad_vec = rng.normal(scale=0, size=w2v.vector_size).astype(np.float32)
oov_vec = rng.normal(scale=0.01, size=w2v.vector_size).astype(np.float32)

np.save('embeddings/pad_vec.npy', pad_vec)
np.save('embeddings/oov_vec.npy', oov_vec) 

In [None]:
def get_w2v_padded_embeddings(sentence, max_words):
    model = w2v
    embedding_dim = model.vector_size
    embeddings = []
    
    for word in sentence:
        # in vocab
        if word in model.wv:
            embeddings.append(model.wv[word])
        # out of vocab
        else:
            embeddings.append(oov_vec)

    # tuncate
    if len(embeddings) > max_words:
        embeddings = embeddings[:max_words]
    # paddinf
    else:
        while len(embeddings) < max_words:
            embeddings.append(pad_vec)

    return np.array(embeddings)

In [None]:
a = get_w2v_padded_embeddings('absolutly hate coding'.split(), max_words=4)
a

In [None]:
np.array_equal(a[2], oov_vec), np.array_equal(a[3], pad_vec)

In [None]:
get_w2v_padded_embeddings(sentences[0], max_words=100)

## FastText 

In [None]:
ft = FastText(sentences=sentences, vector_size=500, window=12, min_count=5, sg=1, epochs=20, workers=4)

In [None]:
ft.save('models/FastText/FastText_vector_size=500_window=12_min_count=5_sg=1_epochs=20_workers=4.model')

In [None]:
ft.wv.most_similar('smart')

In [None]:
ft.wv.most_similar('good')

In [None]:
ft.wv.most_similar('bitch')

In [None]:
ft.wv.most_similar(positive=["king", "female"], negative=["male"])

In [None]:
r = ft.wv['king'] - ft.wv['male'] + ft.wv['female']
ft.wv.similar_by_vector(r)

In [None]:
def get_ft_padded_embeddings(sentence, max_words):
    model = ft
    embedding_dim = model.vector_size
    embeddings = []
    
    for word in sentence:
        embeddings.append(model.wv[word])

    # tuncate
    if len(embeddings) > max_words:
        embeddings = embeddings[:max_words]
    # paddinf
    else:
        while len(embeddings) < max_words:
            embeddings.append(pad_vec)

    return np.array(embeddings)

In [None]:
a = get_ft_padded_embeddings('absolutly hate coding'.split(), max_words=4)
a

In [None]:
get_ft_padded_embeddings(sentences[0], max_words=100)

## BERT Encoder

In [None]:
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
text = ' '.join(['think', 'wonderful', 'way', 'spend', 'time', 'hot', 'summer', 'weekend', 'sit', 'air', 'condition', 'theater', 'watch', 'light', 'hearted', 'comedy', 'plot',
 'simplistic', 'dialogue', 'witty', 'character', 'liable', 'bread', 'suspect', 'serial', 'killer', 'disappoint', 'realize', 'match', 'point', '2', 'risk', 'addiction', 'think',
 'proof', 'woody', 'allen', 'fully', 'control', 'style', 'grow', 'love', 'this', 'laugh', 'woody', 'comedy', 'year', 'dare', 'decade', 'impress', 'scarlet', 'johnson', 'manage',
 'tone', 'sexy', 'image', 'jump', 'right', 'average', 'spirited', 'young', 'woman', 'this', 'crown', 'jewel', 'career', 'winter', 'devil', 'wear', 'prada', 'interesting', 'superman',
 'great', 'comedy', 'friend'])

encoded_input = tokenizer.batch_encode_plus( [text],# List of input texts
    padding=True,              # Pad to the maximum sequence length
    truncation=True,           # Truncate to the maximum sequence length if necessary
    return_tensors='pt',      # Return PyTorch tensors
    add_special_tokens=True    # Add special tokens CLS and SEP
)

input_ids = encoded_input['input_ids']  # Token IDs
# print input IDs
print(f"Input ID: {input_ids}")
attention_mask = encoded_input['attention_mask']  # Attention mask
# print attention mask
print(f"Attention mask: {attention_mask}")

In [None]:
model = BertModel.from_pretrained("bert-base-uncased")
outputs = model(**encoded_input)
# The embeddings are in the `last_hidden_state` attribute of the outputs
token_embeddings = outputs.last_hidden_state