# **Twitter Sentiment Analysis Project 🎯**

Welcome to this Twitter Sentiment Analysis project! 🌟 This notebook demonstrates how to analyze the sentiments of tweets, classifying them into categories like positive, negative, or neutral. By leveraging the power of NLP (Natural Language Processing) techniques and machine learning models, we'll uncover patterns and insights from social media conversations.





**Key Highlights:**
- **Dataset:** Real-world tweets collected from Twitter.
- **Objective:** Analyze sentiments to understand public opinion on a topic or event.
- **Tech Stack:** Python, Pandas, NLTK, Scikit-learn, and more.
- **Steps:**
  - Data collection and preprocessing (removing noise, tokenization, etc.).
  - Sentiment classification using machine learning algorithms.

#### Import necessary packages
You may import more packages here.

In [None]:
!pip install emoji



In [None]:
!pip install keras
!pip install tensorflow==2.12




In [None]:
# Import necessary packages
import re
from os.path import join
import numpy as np
import emoji
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from copy import deepcopy
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
import os

import torch
import torch.nn as nn
from torchtext.vocab import Vectors
import torch.optim as optim






In [None]:
# Define test sets
# File names of the three provided test sets (bare names, without a directory component).
testsets = ['twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']

In [None]:
# Skeleton: Evaluation code for the test sets
def read_test(testset):
    '''
    Read a tab-separated test set and return its ground-truth labels.

    Each usable line must contain at least two tab-separated fields: the
    tweet id followed by the sentiment label. Any further fields (such as
    the tweet text) are ignored. Blank or malformed lines are skipped
    instead of raising IndexError.

    :param testset: str, the file name of the testset to read
    :return: dict mapping tweet id (str) to ground-truth sentiment (str)
    '''
    id_gts = {}
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            # Drop the line terminator first so that a two-column file does
            # not leave a trailing newline glued to the label.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                continue

            tweetid = fields[0].strip()
            gt = fields[1].strip()
            id_gts[tweetid] = gt

    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    Print a row-normalised confusion table for one test set.

    Rows are indexed by the predicted label and columns by the ground-truth
    label, both over positive / negative / neutral. Every cell is divided by
    the total of its row; a row without any prediction prints zeros. Tweets
    that are missing from id_preds are counted as predicted 'neutral'.

    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier (not used by this function)
    '''
    id_gts = read_test(testset)

    labels = ['positive', 'negative', 'neutral']
    conf = {row: dict.fromkeys(labels, 0) for row in labels}

    for tweetid, gt in id_gts.items():
        pred = id_preds[tweetid] if tweetid in id_preds else 'neutral'
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(labels))

    for row in labels:
        row_total = sum(conf[row].values())
        cells = []
        for col in labels:
            if row_total > 0:
                cells.append('%.3f     ' % (conf[row][col] / float(row_total)))
            else:
                cells.append('0.000     ')
        print(row.ljust(12) + ''.join(cells))

    print('')


def evaluate(id_preds, testset, classifier):
    '''
    Print and return the SemEval macro-F1 score of predictions on a test set.

    The score is the mean of the F1 of the 'positive' class and the F1 of the
    'negative' class. 'neutral' tweets still matter indirectly because they
    generate false positives / false negatives for the other two classes.
    Tweets that are missing from id_preds are scored as predicted 'neutral'.

    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier (only used in the printed line)
    :return: float, the SemEval macro-F1 score
    :raises KeyError: if a ground-truth or predicted label is not one of
        'positive', 'negative' or 'neutral'
    '''
    id_gts = read_test(testset)

    acc_by_class = {
        label: {'tp': 0, 'fp': 0, 'fn': 0}
        for label in ('positive', 'negative', 'neutral')
    }

    for tweetid, gt in id_gts.items():
        pred = id_preds.get(tweetid, 'neutral')

        if gt == pred:
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    def class_f1(acc):
        # Every ratio falls back to 0 when its denominator is empty, so a
        # classifier with no correct prediction (or an empty test set)
        # scores 0.0 instead of raising ZeroDivisionError.
        predicted = acc['tp'] + acc['fp']
        actual = acc['tp'] + acc['fn']
        p = float(acc['tp']) / predicted if predicted > 0 else 0
        r = float(acc['tp']) / actual if actual > 0 else 0
        return 2 * p * r / (p + r) if (p + r) > 0 else 0

    semevalmacrof1 = (class_f1(acc_by_class['positive']) + class_f1(acc_by_class['negative'])) / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)
    return semevalmacrof1

#### Load training set, dev set and testing set
Here, you need to load the training set, the development set and the test set. For better classification results, you may need to preprocess tweets before sending them to the classifiers.

Unzipping the semeval-tweets folder:

In [None]:
#!tar -xvjf /content/semeval-tweets.tar.bz2 -C /content/


semeval-tweets/
semeval-tweets/twitter-training-data.txt
semeval-tweets/twitter-dev-data.txt
semeval-tweets/twitter-test3.txt
semeval-tweets/twitter-test2.txt
semeval-tweets/twitter-test1.txt


In [None]:
# Load training set, dev set and testing set
def load_in_dataframe(file_path):
    """Read a header-less, tab-separated tweet file into a DataFrame.

    The three columns are named 'Tweet_id', 'Sentiment' and 'Tweet', in
    that order.
    """
    column_names = ['Tweet_id', 'Sentiment', 'Tweet']
    return pd.read_csv(file_path, sep='\t', header=None, names=column_names)

# All SemEval files live in ./semeval-tweets relative to the current working directory.
semeval_dir = os.path.join(os.getcwd(), 'semeval-tweets')

train_tw_path = os.path.join(semeval_dir, 'twitter-training-data.txt')
dev_tw_path = os.path.join(semeval_dir, 'twitter-dev-data.txt')
test1_tw_path = os.path.join(semeval_dir, 'twitter-test1.txt')
test2_tw_path = os.path.join(semeval_dir, 'twitter-test2.txt')
test3_tw_path = os.path.join(semeval_dir, 'twitter-test3.txt')

# Raw (unprocessed) dataframes, loaded in the same order as the paths above.
train_tw_df, dev_tw_df, test1_tw_df, test2_tw_df, test3_tw_df = (
    load_in_dataframe(path)
    for path in (train_tw_path, dev_tw_path, test1_tw_path, test2_tw_path, test3_tw_path)
)





# Data Preprocessing

Downloading necessary nltk files

In [None]:
# NLTK resources fetched for this notebook, downloaded in the listed order.
for nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to C:\Users\Nikhil
[nltk_data]     Wadhwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nikhil
[nltk_data]     Wadhwa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nikhil Wadhwa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to C:\Users\Nikhil
[nltk_data]     Wadhwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Creating a data preprocessing function to improve the quality of the extracted data and, in turn, the model's accuracy.**

In [None]:
def preprocess_data(file_path):
    """Load a tab-separated tweet file and add cleaned, tokenised and lemmatised columns.

    The file is read without a header into the columns 'TweetID', 'Sentiment'
    and 'Tweet'. The 'Tweet' column is overwritten with the cleaned text, and
    two columns are appended: 'Tokens' (word_tokenize of the cleaned text)
    and 'Lemmatized' (POS-aware WordNet lemmas of those tokens).

    :param file_path: path of the tab-separated file to load
    :return: the resulting pandas DataFrame
    """
    frame = pd.read_csv(file_path, delimiter='\t', header=None, names=['TweetID', 'Sentiment', 'Tweet'])

    def clean_text(raw_text):
        # Lowercase every whitespace-separated word and re-join with single spaces.
        text = " ".join(word.lower() for word in raw_text.split())
        # Drop URLs.
        text = re.sub(r'https?:\/\/\S+', '', text)
        # Collapse every mention to the same placeholder.
        text = re.sub(r'@\w+', '@user', text)
        # Spell emojis out as their textual names, without delimiters.
        text = emoji.demojize(text, delimiters=("", ""))
        # Keep only word characters and whitespace (this also strips '@' and '#').
        text = re.sub(r'[^\w\s]', '', text)
        # Remove tokens that consist solely of digits.
        return re.sub(r'\b\d+\b', '', text)

    frame['Tweet'] = frame['Tweet'].apply(clean_text)

    # Tokenization
    frame['Tokens'] = frame['Tweet'].apply(word_tokenize)

    lemmatizer = WordNetLemmatizer()

    def wordnet_pos_for(tag):
        """Map a Penn Treebank tag to a WordNet POS, defaulting to noun."""
        for prefix, pos_name in (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV')):
            if tag.startswith(prefix):
                return getattr(wordnet, pos_name)
        return wordnet.NOUN

    def lemmatize_tokens(tokens):
        lemmas = []
        for word, tag in nltk.pos_tag(tokens):
            # Single-character tokens and pure numbers are dropped.
            if len(word) <= 1 or word.isdigit():
                continue
            lemmas.append(lemmatizer.lemmatize(word, wordnet_pos_for(tag)))
        return lemmas

    frame['Lemmatized'] = frame['Tokens'].apply(lemmatize_tokens)

    return frame

**Applying preprocessing function to data**

In [None]:
# Run the same preprocessing over the training, dev and three test files, in that order.
train_data, dev_data, test_data_1, test_data_2, test_data_3 = (
    preprocess_data(path)
    for path in (train_tw_path, dev_tw_path, test1_tw_path, test2_tw_path, test3_tw_path)
)

Extracting x_train and y_train from the training data, and creating x_dev and y_dev, which will be used for validating the models.

In [None]:
# Cleaned tweet text is the model input; the sentiment column is the target.
x_train, y_train = train_data['Tweet'], train_data['Sentiment']
x_dev, y_dev = dev_data['Tweet'], dev_data['Sentiment']


In [None]:
# Map each test-set file path to its preprocessed dataframe, keeping the test1..test3 order.
test_datasets = dict(zip(
    (test1_tw_path, test2_tw_path, test3_tw_path),
    (test_data_1, test_data_2, test_data_3),
))

#### Build sentiment classifiers
You need to create your own classifiers (at least 3 classifiers). For each classifier, you can choose between the bag-of-word features and the word-embedding-based features. Each classifier has to be evaluated over 3 test sets. Make sure your classifier produce consistent performance across the test sets. Marking will be based on the performance over all 5 test sets (2 of them are not provided to you).

In [None]:
# Build traditional sentiment classifiers. An example classifier name 'svm' is given
# in the code below. You should replace the other two classifier names
# with your own choices. For features used for classifier training,
# the 'bow' feature is given in the code. But you could also explore the
# use of other features.
def run_bow_classifier(pipeline, classifier, features, display_name):
    """Fit a bag-of-words pipeline, report dev metrics and score every test set.

    :param pipeline: unfitted scikit-learn Pipeline (vectorizer + classifier)
    :param classifier: classifier key, used in the log lines and passed to evaluate()
    :param features: feature key, only used in the log line
    :param display_name: human-readable classifier name for the test-set log line
    """
    print('Training  '+ classifier + ' with ' + features + ' technique ')
    pipeline.fit(x_train, y_train)

    # Predicting on the development set
    y_pred_dev = pipeline.predict(x_dev)
    print(classification_report(y_dev, y_pred_dev))

    print('Time to evaluate ' + display_name + ' on test sets')
    for testset_path, testset_dataframe in test_datasets.items():
        y_pred = pipeline.predict(testset_dataframe['Tweet'])
        id_preds = {str(tweet_id): pred for tweet_id, pred in zip(testset_dataframe['TweetID'], y_pred)}
        evaluate(id_preds, testset_path, classifier)


for classifier in ['naive_bayes', 'svm', 'logistic_regression','lstm']:
    for features in ['bow']:
        if classifier == 'naive_bayes' and features == 'bow':
            naive_bow_pipeline = Pipeline([
                ('vectorizer', CountVectorizer(stop_words='english')),
                ('classifier', MultinomialNB()),
            ])
            run_bow_classifier(naive_bow_pipeline, classifier, features, 'naive bayes')

        # Skeleton: Creation and training of the classifiers
        elif classifier == 'svm' and features == 'bow':
            svm_bow_pipeline = Pipeline([
                ('vectorizer', CountVectorizer(stop_words='english')),
                ('classifier', SVC(kernel='linear')),
            ])
            run_bow_classifier(svm_bow_pipeline, classifier, features, 'Svm')

        elif classifier == 'logistic_regression' and features == 'bow':
            lr_bow_pipeline = Pipeline([
                ('vectorizer', CountVectorizer(stop_words='english')),
                ('classifier', LogisticRegression(max_iter=1000)),
            ])
            run_bow_classifier(lr_bow_pipeline, classifier, features, 'logistic regression')

        elif classifier == 'lstm':
            print(' Training '+ classifier + ' with glove embeddings technique ')

            # Size of the GloVe vectors loaded below and the fixed number of
            # time steps fed to the LSTM.
            glove_dim = 50
            max_seq_len = 242

            words = dict()

            def create_to_dict(d, filename):
                """Fill d with {word: vector} read from a GloVe text file."""
                with open(filename, 'r', encoding='utf-8') as f:
                    for line in f:
                        line = line.rstrip().split(' ')
                        try:
                            vector = np.array(line[1:], dtype=float)
                        except ValueError:
                            # Line whose values are not numeric: skip it.
                            continue
                        # A vector of the wrong size would make the stacked
                        # per-tweet arrays ragged, so it is skipped as well.
                        if vector.shape[0] != glove_dim:
                            continue
                        d[line[0]] = vector

            # Same ./glove.6B location that the PyTorch section of this
            # notebook uses for its 100d vectors.
            create_to_dict(words, os.path.join(os.getcwd(), 'glove.6B', 'glove.6B.50d.txt'))
            tokenizer = nltk.RegexpTokenizer(r"\w+")
            lemmatizer = WordNetLemmatizer()

            def tweet_to_token_list(sentence):
                """Tokenise, lowercase and lemmatise a tweet, keeping only words with a GloVe vector."""
                tokens = tokenizer.tokenize(sentence)
                tokens_lowercase = [t.lower() for t in tokens]
                lemmatized_tokens = [lemmatizer.lemmatize(t) for t in tokens_lowercase]
                return [t for t in lemmatized_tokens if t in words]

            def tweet_to_word_vectors(tweet, word_dict=words):
                """Return the (n_tokens, glove_dim) array of word vectors for a tweet."""
                vectors = [word_dict[token] for token in tweet_to_token_list(tweet) if token in word_dict]
                return np.array(vectors, dtype=float)

            # One encoder, fitted once on the training labels, is shared by
            # every split so that an integer code means the same class
            # everywhere. LabelEncoder orders classes alphabetically, so the
            # label names are always read back from the encoder rather than
            # from a hand-written mapping.
            sentiment_encoder = LabelEncoder()
            sentiment_encoder.fit(train_data['Sentiment'])

            def df_to_X_y(dataframe_conv):
                """Convert a preprocessed dataframe into word-vector sequences and encoded labels."""
                y = sentiment_encoder.transform(dataframe_conv['Sentiment'])

                all_word_vector_sequences = []
                for tweet in dataframe_conv['Tweet']:
                    tweet_as_vector_seq = tweet_to_word_vectors(tweet)
                    if tweet_as_vector_seq.shape[0] == 0:
                        # No known word at all: represent the tweet by one zero vector.
                        tweet_as_vector_seq = np.zeros(shape=(1, glove_dim))
                    all_word_vector_sequences.append(tweet_as_vector_seq)

                return all_word_vector_sequences, y

            def pad_X(X, desired_sequence_length=max_seq_len):
                """Zero-pad (at the end) or truncate every sequence to a fixed length."""
                padded = np.zeros((len(X), desired_sequence_length, glove_dim), dtype=float)
                for i, x in enumerate(X):
                    # Truncate instead of asking np.zeros for a negative size.
                    x = x[:desired_sequence_length]
                    padded[i, :x.shape[0]] = x
                return padded

            # The encoded training labels get their own name so that the
            # string-valued y_train used by the bag-of-words branches is not
            # overwritten when this cell is run again.
            X_train, y_train_encoded = df_to_X_y(train_data)
            X_val, y_val = df_to_X_y(dev_data)
            X_train = pad_X(X_train, desired_sequence_length=max_seq_len)
            X_val = pad_X(X_val)

            model = Sequential([])
            model.add(layers.Input(shape=(max_seq_len, glove_dim)))
            model.add(layers.LSTM(64, return_sequences=True))
            model.add(layers.Dropout(0.2))
            model.add(layers.LSTM(64, return_sequences=True))
            model.add(layers.Dropout(0.2))
            model.add(layers.LSTM(64, return_sequences=True))
            model.add(layers.Dropout(0.2))
            model.add(layers.Flatten())
            model.add(layers.Dense(3, activation='softmax'))
            model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
            model.fit(X_train, y_train_encoded, epochs=10, validation_data=(X_val, y_val), batch_size=32)

            predictions = model.predict(X_val)
            predicted_classes = np.argmax(predictions, axis=1)
            print(classification_report(
                y_val,
                predicted_classes,
                labels=np.arange(len(sentiment_encoder.classes_)),
                target_names=list(sentiment_encoder.classes_),
            ))

            for testset_path, testset_dataframe in test_datasets.items():
                X_test, y_test = df_to_X_y(testset_dataframe)
                X_test = pad_X(X_test)
                y_test_pred = model.predict(X_test)
                y_test_pred_class = np.argmax(y_test_pred, axis=1)
                # Decode with the encoder that produced the training codes.
                y_test_pred_labels = sentiment_encoder.inverse_transform(y_test_pred_class).tolist()
                id_preds = {str(tid): pred for tid, pred in zip(testset_dataframe['TweetID'], y_test_pred_labels)}
                evaluate(id_preds, testset_path, classifier)


Training  naive_bayes with bow technique 
              precision    recall  f1-score   support

    negative       0.61      0.47      0.53       378
     neutral       0.62      0.63      0.63       919
    positive       0.62      0.68      0.65       703

    accuracy                           0.62      2000
   macro avg       0.62      0.59      0.60      2000
weighted avg       0.62      0.62      0.62      2000

Time to evaluate naive bayes on test sets
C:\Users\Nikhil Wadhwa\semeval-tweets\twitter-test1.txt (naive_bayes): 0.438
C:\Users\Nikhil Wadhwa\semeval-tweets\twitter-test2.txt (naive_bayes): 0.450
C:\Users\Nikhil Wadhwa\semeval-tweets\twitter-test3.txt (naive_bayes): 0.445
Training  svm with bow technique 
              precision    recall  f1-score   support

    negative       0.54      0.52      0.53       378
     neutral       0.60      0.65      0.62       919
    positive       0.67      0.62      0.65       703

    accuracy                           0.61      200

# **LSTM using PyTorch and GloVe word embeddings**

In [None]:
#lstm using pytorch

In [None]:
# The 'punkt' resource is fetched here, before word_tokenize is used in this section.
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to C:\Users\Nikhil
[nltk_data]     Wadhwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LOADING GLOVE EMBEDDINGS

In [None]:
def load_embeddings_from_file(embeddings_file_path):
    """Load word embeddings from a whitespace-separated text file.

    Every non-blank line is expected to hold a word followed by the
    components of its vector. Blank or whitespace-only lines (for example a
    trailing empty line) are skipped rather than raising IndexError.

    Args:
        embeddings_file_path (str): The file path to the pre-trained word embeddings.

    Returns:
        dict: A dictionary where keys are words and values are their
        corresponding embedding vectors as float32 numpy arrays.

    Raises:
        ValueError: If a vector component cannot be parsed as a number.
    """
    embeddings_dictionary = {}
    with open(embeddings_file_path, 'r', encoding="utf8") as file:
        for line in file:
            parts = line.split()
            if not parts:
                continue
            word, *components = parts
            embeddings_dictionary[word] = np.asarray(components, dtype="float32")
    return embeddings_dictionary

In [None]:
# Load the 100-dimensional GloVe file from ./glove.6B relative to the current working directory.
glove_embeddings= load_embeddings_from_file(os.path.join(os.getcwd(), 'glove.6B', 'glove.6B.100d.txt'))


In [None]:
# Lowercase each tweet, then split it into tokens; the result is one token list per tweet.
x_train_tokens = x_train.map(str.lower).map(word_tokenize)
x_val_tokens = x_dev.map(str.lower).map(word_tokenize)

In [None]:
#mapping these words and tokenising

In [None]:
def create_word_index_mapping(tokenized_texts):
    """
    Assign a unique positive integer to every distinct token.

    Indices start at 1 and follow the order in which tokens are first seen,
    which leaves 0 unused by the mapping.

    Parameters:
    - tokenized_texts: An iterable of token sequences, one per document.

    Returns:
    - word_index: A dictionary mapping each unique token to its integer index.
    """
    word_index = {}
    for document_tokens in tokenized_texts:
        for token in document_tokens:
            # The next free index is always one past the current vocabulary size.
            word_index.setdefault(token, len(word_index) + 1)
    return word_index

# Build the vocabulary from the training tokens only.
# x_train_tokens holds one token list per training tweet (the output of word_tokenize),
# e.g. ['this', 'is', 'a', 'sentence'].

word_index_mapping = create_word_index_mapping(x_train_tokens)


converting into sequence

In [None]:
def tokens_to_indices(tokens):
    """Replace each token by its vocabulary index, dropping tokens that are not in the vocabulary."""
    return [word_index_mapping[token] for token in tokens if token in word_index_mapping]


x_train_seq = [tokens_to_indices(tokens) for tokens in x_train_tokens]
x_val_seq = [tokens_to_indices(tokens) for tokens in x_val_tokens]

In [None]:
 # Longest index sequence across the training and dev splits; used as the padded length.
 # The sequences are named x_train_seq / x_val_seq (lowercase) where they are built.
 max_length = max(max(len(seq) for seq in x_train_seq), max(len(seq) for seq in x_val_seq))

padding step

In [None]:
def pad_sequences(sequences, max_length):
    """
    Pad or truncate integer sequences to a common length.

    Shorter sequences are left-padded with zeros (the values sit at the end
    of the row); longer sequences keep their first max_length items. An
    empty sequence yields an all-zero row.

    Parameters:
    - sequences: A list of lists where each inner list is a sequence of integers.
    - max_length: The desired length of each padded sequence.

    Returns:
    - padded_sequences: An integer numpy array of shape (len(sequences), max_length).
    """
    padded_sequences = np.zeros((len(sequences), max_length), dtype=int)

    for idx, sequence in enumerate(sequences):
        kept = sequence[:max_length]
        if len(kept) == 0:
            # Nothing to copy. Without this guard an empty sequence would hit
            # the slice `-0:`, which selects the whole row and makes the
            # assignment fail with a broadcast error.
            continue
        padded_sequences[idx, max_length - len(kept):] = kept

    return padded_sequences

In [None]:
# Pad the training index sequences; the list is named x_train_seq (lowercase) where it is built.
x_train = pad_sequences(x_train_seq,max_length)

In [None]:
# Pad the dev index sequences; the list is named x_val_seq (lowercase) where it is built.
x_val = pad_sequences(x_val_seq,max_length)

In [None]:
# Width of the GloVe vectors loaded into glove_embeddings.
embedding_dimension = 100
# One extra row because vocabulary indices start at 1; row 0 stays all-zero.
vocabulary_size = len(word_index_mapping) + 1

# Row i holds the vector of the word whose vocabulary index is i.
embedding_matrix = np.zeros((vocabulary_size, embedding_dimension))

for token, row in word_index_mapping.items():
    vector = glove_embeddings.get(token)
    if vector is None:
        # Word unknown to GloVe: keep its row as zeros.
        continue
    embedding_matrix[row] = vector

In [None]:
class SentimentAnalyzerLSTM(nn.Module):
    """Single-layer LSTM classifier on top of frozen, pre-trained word embeddings.

    The forward pass embeds the token indices, runs them through the LSTM and
    feeds the final hidden state to a linear layer. The `dropout` module is
    created but is not applied in `forward`.
    """

    def __init__(self, vocabulary_size, embedding_dimension, lstm_hidden_dimension, output_dimension, pretrained_embedding_matrix):
        super().__init__()

        # Keep the hyperparameters around for inspection.
        self.vocabulary_size = vocabulary_size
        self.embedding_dimension = embedding_dimension
        self.lstm_hidden_dimension = lstm_hidden_dimension
        self.output_dimension = output_dimension

        # Embedding table whose weights are replaced by the supplied matrix
        # and excluded from gradient updates.
        self.embedding_layer = nn.Embedding(vocabulary_size, embedding_dimension)
        frozen_weights = torch.tensor(pretrained_embedding_matrix, dtype=torch.float32)
        self.embedding_layer.weight = nn.Parameter(frozen_weights, requires_grad=False)

        # batch_first=True: inputs are laid out as (batch, sequence, feature).
        self.lstm_layer = nn.LSTM(embedding_dimension, lstm_hidden_dimension, batch_first=True)

        # Projects the final hidden state to one score per class.
        self.output_layer = nn.Linear(lstm_hidden_dimension, output_dimension)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, input_text_tensor):
        """Return the output-layer scores for a batch of token-index sequences."""
        embedded = self.embedding_layer(input_text_tensor)

        # Only the hidden state is needed; per-step outputs and the cell state are discarded.
        _, (last_hidden, _) = self.lstm_layer(embedded)

        # last_hidden[-1] is the hidden state of the last LSTM layer.
        return self.output_layer(last_hidden[-1])

In [None]:
lstm_model = SentimentAnalyzerLSTM(vocabulary_size, embedding_dimension, lstm_hidden_dimension=128, output_dimension=3, pretrained_embedding_matrix=embedding_matrix)

# Encode the string sentiment labels as integer class ids.
# The labels are read straight from the preprocessed dataframes: y_val is not
# defined in this section, and y_train may have been rebound to integer codes
# by the Keras LSTM branch above, which would leave label_encoder.classes_
# without the class names needed for the classification report.
label_encoder = LabelEncoder()
y_train_encode = label_encoder.fit_transform(train_data['Sentiment'])
y_dev_encode = label_encoder.transform(dev_data['Sentiment'])

In [None]:
# Token indices and class ids are all converted to int64 (long) tensors.
x_train_tensor, y_train_tensor, x_dev_tensor, y_dev_tensor = (
    torch.tensor(values, dtype=torch.long)
    for values in (x_train, y_train_encode, x_val, y_dev_encode)
)


In [None]:
# Pair each padded training sequence with its encoded label.
train_tensor_data = TensorDataset(x_train_tensor, y_train_tensor)
# Serve the training pairs in shuffled mini-batches of 32.
train_loader = DataLoader(train_tensor_data, batch_size=32, shuffle=True)

In [None]:
#loss functions
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)

In [None]:
# Number of full passes over train_loader made by the training loop.
num_epochs = 10
# Put the model in training mode before optimisation starts.
lstm_model.train()

SentimentAnalyzerLSTM(
  (embedding_layer): Embedding(48654, 100)
  (lstm_layer): LSTM(100, 128, batch_first=True)
  (output_layer): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
# Re-enter training mode so the cell can be run again after the eval() call below.
lstm_model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    samples_seen = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        output = lstm_model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so a smaller final batch does
        # not distort the epoch mean.
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    # Report the mean loss over the whole epoch. The loss of the last
    # mini-batch alone is very noisy and is undefined for an empty loader.
    epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
    print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

lstm_model.eval()


Epoch 1, Loss: 0.9494735598564148
Epoch 2, Loss: 0.5372791290283203
Epoch 3, Loss: 1.2389754056930542
Epoch 4, Loss: 0.6985536217689514
Epoch 5, Loss: 0.6781131625175476
Epoch 6, Loss: 1.0243041515350342
Epoch 7, Loss: 0.689394474029541
Epoch 8, Loss: 0.951617419719696
Epoch 9, Loss: 0.24189789593219757
Epoch 10, Loss: 0.28844964504241943


SentimentAnalyzerLSTM(
  (embedding_layer): Embedding(48654, 100)
  (lstm_layer): LSTM(100, 128, batch_first=True)
  (output_layer): Linear(in_features=128, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
# Score the dev set without tracking gradients.
with torch.no_grad():
    # The dev inputs are stored in x_dev_tensor.
    predictions_dev_lstm = lstm_model(x_dev_tensor)
    # The index of the largest score in each row is the predicted class id.
    _, predicted_labels = torch.max(predictions_dev_lstm, 1)
    print(classification_report(y_dev_tensor.numpy(), predicted_labels.numpy(), target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       0.65      0.44      0.53       378
     neutral       0.62      0.68      0.65       919
    positive       0.65      0.69      0.67       703

    accuracy                           0.64      2000
   macro avg       0.64      0.60      0.62      2000
weighted avg       0.64      0.64      0.63      2000

