# Sentiment Analysis with LSTM and Additional Features

### Import libraries, define functions and constants

In [1]:
# Pickle files in which the extracted train/test features are saved
train_features_path, test_features_path = (
    f"{split}_features.pkl" for split in ("train", "test")
)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.datasets.reuters import get_word_index

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy
import contractions
import spacy.cli

In [4]:
import pickle
import os

In [5]:
# Function to save data to a pickle file
def save_to_pickle(data, filename):
    """Pickle ``data`` into ``filename``, replacing the file atomically.

    The object is first written to a temporary sibling file
    (``<filename>.tmp``) and only moved over ``filename`` once pickling has
    succeeded. A failed or interrupted dump therefore never leaves a truncated
    pickle behind that a later ``os.path.exists`` check would mistake for a
    valid cache file.

    Args:
        data: Any picklable Python object.
        filename: Destination path (``str`` or ``os.PathLike``).

    Raises:
        Exception: Whatever ``open`` / ``pickle.dump`` / ``os.replace`` raise.
            An already existing ``filename`` is left untouched in that case.
    """
    tmp_filename = f"{os.fspath(filename)}.tmp"
    try:
        with open(tmp_filename, 'wb') as f:
            pickle.dump(data, f)
        # os.replace overwrites an existing destination in one step
        os.replace(tmp_filename, filename)
    except BaseException:
        # Drop the partial temp file, then let the original error propagate
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

# Function to load data from a pickle file
def load_from_pickle(filename):
    """Open ``filename`` in binary mode and return the object unpickled from it.

    Only use this on files you created yourself: unpickling can execute
    arbitrary code embedded in the file.
    """
    with open(filename, 'rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored

In [6]:
# Download the small English spaCy pipeline only when it is not installed yet,
# so re-running the notebook does not fetch and reinstall it every time.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
# Tokenizer data for NLTK (presumably what word_tokenize relies on below)
nltk.download('punkt')  # Download the punkt tokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# NOTE(review): presumably needed because newer NLTK releases look up the
# tokenizer data under 'punkt_tab' instead of 'punkt' — confirm for the NLTK
# version in use.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\chama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
# Shared spaCy pipeline; used by extract_additional_features to count named
# entities (doc.ents) in each review.
nlp = spacy.load("en_core_web_sm")

In [10]:

# Chat abbreviation -> expansion lookup table (keys are lower-case).
# Source: https://www.kaggle.com/code/gauravchhabra/nlp-twitter-sentiment-analysis-project
short_word_dict = {
    "121": "one to one",
    "a/s/l": "age, sex, location",
    "adn": "any day now",
    "afaik": "as far as I know",
    "afk": "away from keyboard",
    "aight": "alright",
    "alol": "actually laughing out loud",
    "b4": "before",
    "b4n": "bye for now",
    "bak": "back at the keyboard",
    "bf": "boyfriend",
    "bff": "best friends forever",
    "bfn": "bye for now",
    "bg": "big grin",
    "bta": "but then again",
    "btw": "by the way",
    "cid": "crying in disgrace",
    "cnp": "continued in my next post",
    "cp": "chat post",
    "cu": "see you",
    "cul": "see you later",
    "cul8r": "see you later",
    "cya": "bye",
    "cyo": "see you online",
    "dbau": "doing business as usual",
    "fud": "fear, uncertainty, and doubt",
    "fwiw": "for what it's worth",
    "fyi": "for your information",
    "g": "grin",
    "g2g": "got to go",
    "ga": "go ahead",
    "gal": "get a life",
    "gf": "girlfriend",
    "gfn": "gone for now",
    "gmbo": "giggling my butt off",
    "gmta": "great minds think alike",
    "h8": "hate",
    "hagn": "have a good night",
    "hdop": "help delete online predators",
    "hhis": "hanging head in shame",
    "iac": "in any case",
    "ianal": "I am not a lawyer",
    "ic": "I see",
    "idk": "I don't know",
    "imao": "in my arrogant opinion",
    "imnsho": "in my not so humble opinion",
    "imo": "in my opinion",
    "iow": "in other words",
    "ipn": "I’m posting naked",
    "irl": "in real life",
    "jk": "just kidding",
    "l8r": "later",
    "ld": "later, dude",
    "ldr": "long distance relationship",
    "llta": "lots and lots of thunderous applause",
    "lmao": "laugh my ass off",
    "lmirl": "let's meet in real life",
    "lol": "laugh out loud",
    "ltr": "longterm relationship",
    "lulab": "love you like a brother",
    "lulas": "love you like a sister",
    "luv": "love",
    "m/f": "male or female",
    "m8": "mate",
    "milf": "mother I would like to fuck",
    "oll": "online love",
    "omg": "oh my god",
    "otoh": "on the other hand",
    "pir": "parent in room",
    "ppl": "people",
    "r": "are",
    "rofl": "roll on the floor laughing",
    "rpg": "role playing games",
    "ru": "are you",
    "shid": "slaps head in disgust",
    "somy": "sick of me yet",
    "sot": "short of time",
    "thanx": "thanks",
    "thx": "thanks",
    "ttyl": "talk to you later",
    "u": "you",
    "ur": "you are",
    "uw": "you’re welcome",
    "wb": "welcome back",
    "wfm": "works for me",
    "wibni": "wouldn't it be nice if",
    "wtf": "what the fuck",
    "wtg": "way to go",
    "wtgp": "want to go private",
    "ym": "young man",
    "gr8": "great",
}

In [11]:
# Fetch the NLTK resources this cell depends on
nltk.download('stopwords')
nltk.download('punkt')

# English stopwords as a set, for fast membership tests in the stopword-ratio feature
stop_words = {*stopwords.words('english')}

# One shared VADER analyzer, reused for every review's sentiment score
vader_analyzer = SentimentIntensityAnalyzer()

# Function to replace abbreviations and apostrophes
def expand_text(text, short_dict):
    """Expand contractions and chat abbreviations in ``text``.

    The text is passed through ``contractions.fix``, lower-cased and split on
    whitespace. Each resulting word that is a key of ``short_dict`` is replaced
    by its mapped expansion; all other words are kept. Matching is on whole
    whitespace-delimited words, so a word with punctuation attached is not
    looked up without it.

    Args:
        text: Input string.
        short_dict: Mapping from lower-case abbreviation to its expansion.

    Returns:
        The processed words joined by single spaces.
    """
    lowered_words = contractions.fix(text).lower().split()
    return " ".join(
        short_dict[word] if word in short_dict else word
        for word in lowered_words
    )

# Function to extract additional features from the text
def extract_additional_features(texts):
    """Compute five hand-crafted features for every text in ``texts``.

    Each text is first normalised with ``expand_text`` (contractions and chat
    abbreviations expanded, lower-cased). The features are then, in order:

    1. ``text_length``       - number of ``word_tokenize`` tokens
    2. ``stopword_ratio``    - share of tokens found in ``stop_words``
    3. ``sentiment_score``   - VADER ``compound`` polarity score
    4. ``unique_word_ratio`` - distinct tokens divided by ``text_length``
    5. ``ner_count``         - number of named entities spaCy finds

    Both ratios are ``0`` when a text yields no tokens.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one ``[text_length, stopword_ratio, sentiment_score,
        unique_word_ratio, ner_count]`` list per input text, in input order.
    """
    # Normalise everything up front so spaCy can process the texts in batches
    expanded_texts = [expand_text(text, short_word_dict) for text in texts]

    features = []
    # nlp.pipe yields one Doc per text, in order, and is considerably faster
    # than calling nlp(text) separately for each document.
    for text, doc in zip(expanded_texts, nlp.pipe(expanded_texts)):
        tokens = word_tokenize(text)

        # Feature 1: length of the text in tokens
        text_length = len(tokens)

        # Features 2 and 4: stopword ratio and unique-word ratio
        if text_length > 0:
            stopword_count = sum(token in stop_words for token in tokens)
            stopword_ratio = stopword_count / text_length
            unique_word_ratio = len(set(tokens)) / text_length
        else:
            # Avoid dividing by zero for texts without tokens
            stopword_ratio = 0
            unique_word_ratio = 0

        # Feature 3: VADER compound sentiment score
        sentiment_score = vader_analyzer.polarity_scores(text)['compound']

        # Feature 5: number of named entities detected by spaCy
        ner_count = len(doc.ents)

        features.append([
            text_length,
            stopword_ratio,
            sentiment_score,
            unique_word_ratio,
            ner_count
        ])

    return features

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load and Preprocess Data

In [12]:
vocab_size = 50_000  # load the IMDB dataset with the 50,000 most frequent words
max_len = 200  # maximum length of a review

In [13]:
# Load the IMDB train/test splits; num_words limits the vocabulary to vocab_size.
# Each review comes as a sequence of integer word indices (see decode_review).
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

In [14]:
# Bring every review to exactly max_len indices. The Keras defaults are spelled
# out: shorter reviews are zero-padded at the front, longer ones lose their
# beginning.
X_train_padded = pad_sequences(
    X_train, maxlen=max_len, padding="pre", truncating="pre"
)
X_test_padded = pad_sequences(
    X_test, maxlen=max_len, padding="pre", truncating="pre"
)

In [15]:
# Load the IMDB word index (word -> integer index) that matches the reviews
# returned by imdb.load_data. The get_word_index imported at the top of the
# notebook comes from the Reuters dataset and would decode the reviews into
# unrelated words.
# NOTE: feature pickles produced with any other word index are stale — delete
# them so the features are regenerated.
word_index = imdb.get_word_index()

# Reverse the word index (integer index -> word) for easier interpretation
reverse_word_index = {index: word for word, index in word_index.items()}

# Helper function to map word indices back to actual words
def decode_review(encoded_review):
    """Turn a sequence of integer word indices back into a space-joined string.

    Every index is shifted down by 3 before being looked up in
    ``reverse_word_index`` (presumably to undo the offset that
    ``imdb.load_data`` reserves for special tokens — confirm against the
    loader's ``index_from``). Indices without an entry are rendered as ``"?"``.
    """
    decoded_words = (
        reverse_word_index.get(token_id - 3, "?") for token_id in encoded_review
    )
    return " ".join(decoded_words)

In [16]:
# Feature extraction is expensive, so reuse the pickled results when both cache
# files are present and only recompute (and re-save) them otherwise.
if not all(map(os.path.exists, (train_features_path, test_features_path))):
    print("Generating features and saving to pickle files...")
    # Decode the integer-encoded reviews to text before extracting features
    train_features = extract_additional_features(list(map(decode_review, X_train)))
    test_features = extract_additional_features(list(map(decode_review, X_test)))

    # Persist both feature sets for subsequent runs
    save_to_pickle(train_features, train_features_path)
    save_to_pickle(test_features, test_features_path)
else:
    print("Loading features from pickle files...")
    train_features = load_from_pickle(train_features_path)
    test_features = load_from_pickle(test_features_path)

Generating features and saving to pickle files...
