### Import libraries

In [None]:
import numpy as np
from collections import defaultdict, Counter
from scipy.sparse import lil_matrix
import pandas as pd
import nltk
from nltk.corpus import stopwords
import emoji
import re
import string

### Data pre-processing

In [None]:
class SentimentPreprocessor:
    """Clean and tokenize the ``review`` column of a CSV dataset.

    The dataset is read into ``self.df`` on construction. Every cleaning
    method rewrites ``self.df['review']`` in place, so the steps are
    cumulative and the original text is not kept.

    The stopword and tokenization steps rely on NLTK; presumably the NLTK
    ``stopwords`` corpus and tokenizer data must already be downloaded
    (TODO confirm for the target environment).
    """

    def __init__(self, dataset_path):
        """Read the dataset and set up the chat-abbreviation table.

        Args:
            dataset_path: Location of the CSV file, passed straight to
                ``pandas.read_csv``. The file must contain a ``review``
                column.
        """
        self.df = pd.read_csv(dataset_path)
        # Upper-case chat abbreviation -> expansion. Lookups upper-case the
        # candidate word first, so matching is case-insensitive.
        self.chat_words = {
            "AFAIK": "As Far As I Know",
            "AFK": "Away From Keyboard",
            "ASAP": "As Soon As Possible",
            "ATK": "At The Keyboard",
            "ATM": "At The Moment",
            "A3": "Anytime, Anywhere, Anyplace",
            "BAK": "Back At Keyboard",
            "BBL": "Be Back Later",
            "BBS": "Be Back Soon",
            "BFN": "Bye For Now",
            "B4N": "Bye For Now",
            "BRB": "Be Right Back",
            "BRT": "Be Right There",
            "BTW": "By The Way",
            "B4": "Before",
            "CU": "See You",
            "CUL8R": "See You Later",
            "CYA": "See You",
            "FAQ": "Frequently Asked Questions",
            "FC": "Fingers Crossed",
            "FWIW": "For What It's Worth",
            "FYI": "For Your Information",
            "GAL": "Get A Life",
            "GG": "Good Game",
            "GN": "Good Night",
            "GMTA": "Great Minds Think Alike",
            "GR8": "Great!",
            "IC": "I See",
            "ICQ": "I Seek you (also a chat program)",
            # Fixed: the expansion used to repeat its own key
            # ("ILU: I Love You").
            "ILU": "I Love You",
            "IMHO": "In My Honest/Humble Opinion",
            "IMO": "In My Opinion",
            "IOW": "In Other Words",
            "IRL": "In Real Life",
            "KISS": "Keep It Simple, Stupid",
            "LDR": "Long Distance Relationship",
            "LMAO": "Laugh My A.. Off",
            "LOL": "Laughing Out Loud",
            "LTNS": "Long Time No See",
            "L8R": "Later",
            "MTE": "My Thoughts Exactly",
            "M8": "Mate",
            "NRN": "No Reply Necessary",
            "OIC": "Oh I See",
            "PITA": "Pain In The A..",
            "PRT": "Party",
            "PRW": "Parents Are Watching",
            "QPSA?": "Que Pasa?",
            "ROFL": "Rolling On The Floor Laughing",
            "ROFLOL": "Rolling On The Floor Laughing Out Loud",
            "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
            "SK8": "Skate",
            "STATS": "Your sex and age",
            "ASL": "Age, Sex, Location",
            "THX": "Thank You",
            "TTFN": "Ta-Ta For Now!",
            "TTYL": "Talk To You Later",
            "U": "You",
            "U2": "You Too",
            "U4E": "Yours For Ever",
            "WB": "Welcome Back",
            "WTF": "What The F...",
            "WTG": "Way To Go!",
            "WUF": "Where Are You From?",
            "W8": "Wait...",
            "7K": "Sick:-D Laugher",
            "TFW": "That feeling when",
            "MFW": "My face when",
            "MRW": "My reaction when",
            "IFYP": "I feel your pain",
            "TNTL": "Trying not to laugh",
            "JK": "Just kidding",
            "IDC": "I don't care",
            "ILY": "I love you",
            "IMU": "I miss you",
            "ADIH": "Another day in hell",
            "ZZZ": "Sleeping, bored, tired",
            "WYWH": "Wish you were here",
            "TIME": "Tears in my eyes",
            "BAE": "Before anyone else",
            "FIMH": "Forever in my heart",
            "BSAAW": "Big smile and a wink",
            "BWL": "Bursting with laughter",
            "BFF": "Best friends forever",
            "CSL": "Can't stop laughing"
        }

    def preprocess(self):
        """Run every cleaning step in order and return the processed column.

        Order: lowercase, HTML tags, URLs, punctuation, chat-word expansion,
        stopwords, emoji, tokenization. Progress is printed after each step.

        Returns:
            ``self.df['review']`` after all steps have been applied.
        """
        # NOTE(review): punctuation is stripped before chat-word expansion,
        # so a key that itself contains punctuation ("QPSA?") can never
        # match. Expansions are inserted after lowercasing/punctuation
        # removal with their own capitalisation and punctuation, and the
        # stopword filter is an exact string comparison, so capitalised
        # words from expansions pass through it. Confirm whether this
        # ordering is intended.
        print("Starting preprocessing...")
        self.lowercase()
        print("Converted text to lowercase.")
        self.remove_html_tags()
        print("Removed HTML tags.")
        self.remove_url()
        print("Removed URLs.")
        self.remove_punctuation()
        print("Removed punctuation.")
        self.chat_conversion()
        print("Converted chat words.")
        self.remove_stopwords()
        print("Removed stopwords.")
        self.remove_emoji()
        print("Removed emojis.")
        self.tokenize()
        print("Tokenized text.")
        print("Preprocessing complete.")
        return self.df['review']

    def lowercase(self):
        """Lower-case every review."""
        self.df['review'] = self.df['review'].str.lower()

    def remove_html_tags(self):
        """Replace each ``<...>`` tag with a single space.

        A space (rather than the empty string) keeps the words on either
        side of a tag such as ``<br />`` from being fused into one token.
        The later steps split on whitespace, so the extra spaces do not
        survive into the final output.
        """
        pattern = re.compile('<.*?>')
        self.df['review'] = self.df['review'].apply(lambda x: pattern.sub(' ', x))

    def remove_url(self):
        """Delete ``http(s)://...`` and ``www....`` URLs from every review."""
        pattern = re.compile(r'https?://\S+|www\.\S+')
        self.df['review'] = self.df['review'].apply(lambda x: pattern.sub(r'', x))

    def remove_punctuation(self):
        """Delete every character listed in ``string.punctuation``."""
        # Build the translation table once instead of once per row.
        table = str.maketrans('', '', string.punctuation)
        self.df['review'] = self.df['review'].apply(lambda x: x.translate(table))

    def chat_conversion(self):
        """Expand known chat abbreviations in every review."""
        self.df['review'] = self.df['review'].apply(self._chat_conversion_helper)

    def _chat_conversion_helper(self, text):
        """Return ``text`` with chat abbreviations replaced by expansions.

        The text is split on whitespace; each word is looked up
        case-insensitively in ``self.chat_words`` and the result is joined
        back with single spaces.
        """
        # dict.get with the word as default leaves unknown words untouched.
        return " ".join(
            self.chat_words.get(word.upper(), word) for word in text.split()
        )

    def remove_stopwords(self):
        """Drop words that appear in NLTK's English stopword list."""
        # A set gives O(1) membership tests; the list form was scanned in
        # full for every word of every review.
        stopword = set(stopwords.words('english'))
        self.df['review'] = self.df['review'].apply(
            lambda x: " ".join([word for word in x.split() if word not in stopword]))

    def remove_emoji(self):
        """Delete characters matched by the emoji character class below."""
        # NOTE(review): the last range (U+24C2-U+1F251) is very wide and
        # overlaps the ranges above; it also spans many non-emoji code
        # points, so text in scripts inside that range is removed too.
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        self.df['review'] = self.df['review'].apply(lambda x: emoji_pattern.sub(r'', x))

    def tokenize(self):
        """Replace every review string with its list of NLTK word tokens."""
        # Imported here because the module-level imports never bring
        # word_tokenize into scope; without this the call raised NameError.
        from nltk.tokenize import word_tokenize

        self.df['review'] = self.df['review'].apply(word_tokenize)

    def show_before_after(self, sample_size=10 ):
        """Print the first ``sample_size`` reviews before and after cleaning.

        This runs the full ``preprocess`` pipeline, which rewrites
        ``self.df['review']``; call it on an instance that has not been
        preprocessed yet.

        Args:
            sample_size: Number of leading rows to show.
        """
        # Copy the raw text first: preprocess() overwrites the column.
        original_reviews = self.df['review'].head(sample_size).copy()
        preprocessed_reviews = self.preprocess().head(sample_size)

        comparison_df = pd.DataFrame({
            'Original Review': original_reviews,
            'Preprocessed Review': preprocessed_reviews
        })

        print("Before and After Preprocessing:")
        print(comparison_df)

### Embedding with the pre-trained GloVe model

In [None]:

class GloveEmbeddingLoader:
    """Build an embedding matrix for a corpus from a GloVe text file.

    Typical order of calls: ``load_glove_embeddings``, ``preprocess_data``,
    ``build_vocabulary``, ``create_embedding_matrix``, then optionally
    ``save_model`` / ``load_model``.
    """

    def __init__(self, glove_file_path, dataset_path, embedding_dim=200):
        """Store configuration; nothing is read from disk here.

        Args:
            glove_file_path: Path of the GloVe vectors text file.
            dataset_path: Path of the dataset handed to
                ``SentimentPreprocessor`` by ``preprocess_data``.
            embedding_dim: Number of components per word vector.
        """
        self.glove_file_path = glove_file_path
        self.dataset_path = dataset_path
        self.embedding_dim = embedding_dim
        self.embeddings = None        # word -> float32 vector
        self.corpus = None            # set by preprocess_data
        self.vocabulary = None        # list of words; index == row id
        self.word_to_id = None
        self.id_to_word = None
        self.embedding_matrix = None  # (len(vocabulary), embedding_dim)

    def load_glove_embeddings(self):
        """Read the GloVe file into ``self.embeddings``.

        Each line is expected to hold a word followed by ``embedding_dim``
        numbers, separated by whitespace. Lines with any other field count
        (blank lines, a header line, a token that itself contains
        whitespace) are skipped and counted instead of being stored with a
        wrong-length vector.

        Raises:
            ValueError: If lines were read but none had the expected field
                count, which most likely means ``embedding_dim`` does not
                match the file.
        """
        print("Loading GloVe embeddings...")
        expected_fields = self.embedding_dim + 1
        embeddings = {}
        skipped = 0
        with open(self.glove_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if len(values) != expected_fields:
                    skipped += 1
                    continue
                embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
        if skipped and not embeddings:
            raise ValueError(
                "No line in {!r} has {} values per word; check embedding_dim.".format(
                    self.glove_file_path, self.embedding_dim))
        self.embeddings = embeddings
        if skipped:
            print("Skipped {} malformed line(s).".format(skipped))
        print("GloVe embeddings loaded.")

    def preprocess_data(self):
        """Load and preprocess the dataset.

        Runs ``SentimentPreprocessor`` on ``dataset_path`` and stores the
        processed ``review`` column as a list in ``self.corpus``.
        """
        print("Starting data preprocessing...")
        preprocessor = SentimentPreprocessor(self.dataset_path)
        preprocessor.preprocess()
        self.corpus = preprocessor.df['review'].tolist()
        print("Data preprocessing complete.")

    def build_vocabulary(self, sentences, min_count=5):
        """Build the vocabulary and the word/id lookup tables.

        Args:
            sentences: Iterable of token sequences.
            min_count: Minimum number of occurrences a word needs to be
                kept; rarer words are dropped to limit vocabulary size.
        """
        print("Building vocabulary...")
        word_counts = Counter(word for sentence in sentences for word in sentence)
        self.vocabulary = [word for word, count in word_counts.items() if count >= min_count]
        self.word_to_id = {word: i for i, word in enumerate(self.vocabulary)}
        self.id_to_word = dict(enumerate(self.vocabulary))
        print("Vocabulary built.")

    def create_embedding_matrix(self):
        """Fill ``self.embedding_matrix`` with one row per vocabulary word.

        Words that have no GloVe vector keep an all-zero row.
        """
        print("Creating embedding matrix...")
        vocab_size = len(self.vocabulary)
        self.embedding_matrix = np.zeros((vocab_size, self.embedding_dim))
        for word, i in self.word_to_id.items():
            vector = self.embeddings.get(word)
            if vector is not None:
                self.embedding_matrix[i] = vector
        print("Embedding matrix created.")

    def save_model(self, save_path):
        """Save the embedding matrix and vocabulary.

        Writes ``<save_path>_embeddings.npy`` and ``<save_path>_vocab.txt``
        (one word per line).
        """
        print("Saving the model...")
        np.save(save_path + '_embeddings.npy', self.embedding_matrix)
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII words and may differ between machines.
        with open(save_path + '_vocab.txt', 'w', encoding='utf-8') as f:
            f.writelines(word + '\n' for word in self.vocabulary)
        print("Model saved.")

    def load_model(self, load_path):
        """Load the embedding matrix and vocabulary.

        Reads the two files written by ``save_model`` and rebuilds the
        word/id lookup tables. ``self.embeddings`` is not restored.
        """
        print("Loading the model...")
        self.embedding_matrix = np.load(load_path + '_embeddings.npy')
        with open(load_path + '_vocab.txt', 'r', encoding='utf-8') as f:
            # Strip only the newline so every word round-trips unchanged.
            self.vocabulary = [line.rstrip('\n') for line in f]
        self.word_to_id = {word: i for i, word in enumerate(self.vocabulary)}
        self.id_to_word = dict(enumerate(self.vocabulary))
        print("Model loaded.")

# Example usage: run the whole pipeline end to end. Both paths are relative,
# and embedding_dim is left at its default of 200.
glove_loader = GloveEmbeddingLoader('glove.twitter.27B/glove.twitter.27B.200d.txt', 'IMDB Dataset.csv')
# Read the GloVe text file into glove_loader.embeddings.
glove_loader.load_glove_embeddings()
# Clean and tokenize the dataset; the result is stored in glove_loader.corpus.
glove_loader.preprocess_data()
# Build the vocabulary and word/id lookup tables from the processed corpus.
glove_loader.build_vocabulary(glove_loader.corpus)
# One matrix row per vocabulary word; words without a GloVe vector stay zero.
glove_loader.create_embedding_matrix()
# Write glove_model_embeddings.npy and glove_model_vocab.txt ...
glove_loader.save_model('glove_model')
# ... and read them straight back into the same loader.
glove_loader.load_model('glove_model')