In [176]:
!pip install swifter
!pip install tqdm
!pip install emoji
!pip install pyspellchecker
!pip install spacy
!pip install symspellpy
!python -m spacy download en_core_web_sm

Collecting symspellpy
  Downloading symspellpy-6.7.8-py3-none-any.whl.metadata (3.9 kB)
Collecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading symspellpy-6.7.8-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading editdistpy-0.1.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.1/144.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: editdistpy, symspellpy
Successfully installed editdistpy-0.1.5 symspellpy-6.7.8
Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

# **Pytorch Sentiment Analysis**

## Import

In [None]:
import pandas as pd
import kagglehub
import re
from tqdm import tqdm
import swifter
import emoji
from spellchecker import SpellChecker
from transformers import pipeline
import string
import spacy
from transformers import AutoTokenizer

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

tqdm.pandas()

In [None]:
# Download latest version
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2


In [None]:
!mkdir data
!mv /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2 data

mkdir: cannot create directory ‘data’: File exists
mv: cannot move '/root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2' to 'data/2': Directory not empty


In [None]:
df_train = pd.read_csv("data/2/twitter_training.csv",)
df_test = pd.read_csv("data/2/twitter_validation.csv")

The columns of the dataframe is actually one dataframe row for test and val, the next function aims at formatting the table dynamically ensuring proper naming of columns and adding the row to the df

In [None]:
# Fixing Columns Name, adding new row ...
def format_table(df):
  new_row = list(df.columns) # Row to be added, mixed up with columns
  columns = list(df.columns) # Name of current columns
  rename_col_dict={}
  for i in range(len(columns)):
    if i==0:
      rename_col_dict[columns[i]] = "twitter_id"
    elif i == 1:
      rename_col_dict[columns[i]] = "category"
    elif i == 2:
      rename_col_dict[columns[i]] = "label"
    elif i == 3:
      rename_col_dict[columns[i]] = "text"
  df = df.rename(columns=rename_col_dict)
  df.loc[len(df)] = new_row
  return df

df_train = format_table(df_train)
df_test = format_table(df_test)

## **Preprocessing Steps**

- Preprocessing step is an essential step for machine learning as it allow us to clean our data, and prepare it for training.

- The steps varies depending on the data we are dealing with, NLP, Computer Vision, classification, continuous, categorical data ...

- Each type of data has it's own set of preprocessing step and they can vary depending on the data we have in front of us

## *Missing Values*



In [None]:
df_train.isnull().sum()

Unnamed: 0,0
twitter_id,0
category,0
label,0
text,686


In [None]:
df_test.isnull().sum()

Unnamed: 0,0
twitter_id,0
category,0
label,0
text,0


### Small analysis of missing values

In [None]:
df_train

Unnamed: 0,twitter_id,category,label,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...
74680,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [None]:
df_train_na = df_train[df_train.isnull().any(axis=1)]

In [None]:
df_train_na

Unnamed: 0,twitter_id,category,label,text
60,2411,Borderlands,Neutral,
552,2496,Borderlands,Neutral,
588,2503,Borderlands,Neutral,
744,2532,Borderlands,Positive,
1104,2595,Borderlands,Positive,
...,...,...,...,...
73971,9073,Nvidia,Positive,
73972,9073,Nvidia,Positive,
74420,9154,Nvidia,Positive,
74421,9154,Nvidia,Positive,


The text is missing for whatever reason, so these rows cannot be used for training since they contain no data, so we remove them

In [None]:
def drop_missing_rows(df):
  nulls = df.isnull().sum()
  for values in nulls:
    if values > 0:
      print("Null Values Found, dropping rows")
      df = df.dropna()
      df = df.reset_index(drop=True)
    else:
      print("No Null Values found")
  print(" ")
  return df


In [None]:
df_train.isnull().sum()

Unnamed: 0,0
twitter_id,0
category,0
label,0
text,686


In [None]:
df_test.isnull().sum()

Unnamed: 0,0
twitter_id,0
category,0
label,0
text,0


We can see that the rows with missing values has been drop

## *Lower text*

Lowering the text is a sort of normalization, so everything is in the same form, tokenizers are based on dictionary and **'Hello' and 'hello' are different for a computer/model/tokenizer**

In [None]:
def lower_text(df):
  df["text"] = df["text"].str.lower()
  return df


## *Handeling Contraction*

Handeling contraction, for the same reason as lowering text, **Im, I am and I'am are the same thing for us not for the model**, this will allow us to speed up the training adn improve accuracy

In [None]:
def handle_english_contractions(text):
    """
    Expands English contractions in the text using regex for strict matching.

    Args:
        text (str): The English text with contractions.

    Returns:
        str: The text with expanded contractions.
    """
    contractions = {
    # Pronoun + Verb Contractions
    r"\bi'm\b": "i am",
    r"\bi'm\b": "i am",
    r"\bim\b": "i am",
    r"\bi've\b": "i have",
    r"\bi'll\b": "i will",
    r"\bi'd\b": "i would",
    r"\byou're\b": "you are",
    r"\byou've\b": "you have",
    r"\byou'll\b": "you will",
    r"\byou'd\b": "you would",
    r"\bhe's\b": "he is",
    r"\bhe'll\b": "he will",
    r"\bhe'd\b": "he would",
    r"\bshe's\b": "she is",
    r"\bshe'll\b": "she will",
    r"\bshe'd\b": "she would",
    r"\bit's\b": "it is",
    r"\bit'll\b": "it will",
    r"\bit'd\b": "it would",
    r"\bwe're\b": "we are",
    r"\bwe've\b": "we have",
    r"\bwe'll\b": "we will",
    r"\bwe'd\b": "we would",
    r"\bthey're\b": "they are",
    r"\bthey've\b": "they have",
    r"\bthey'll\b": "they will",
    r"\bthey'd\b": "they would",

    # Negative Contractions
    r"\bisn't\b": "is not",
    r"\baren't\b": "are not",
    r"\bwasn't\b": "was not",
    r"\bweren't\b": "were not",
    r"\bcan't\b": "cannot",
    r"\bcannot\b": "cannot",
    r"\bdon't\b": "do not",
    r"\bdidn't\b": "did not",
    r"\bdoesn't\b": "does not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bshouldn't\b": "should not",
    r"\bcouldn't\b": "could not",
    r"\bmustn't\b": "must not",
    r"\bshan't\b": "shall not",
    r"\bain't\b": "is not",

    # Informal and Colloquial Contractions
    r"\bgonna\b": "going to",
    r"\bwanna\b": "want to",
    r"\bgimme\b": "give me",
    r"\blemme\b": "let me",
    r"\bkinda\b": "kind of",
    r"\bsorta\b": "sort of",
    r"\bgotta\b": "got to",
    r"\by'all\b": "you all",
    r"\bdunno\b": "do not know",
    r"\bo'er\b": "over",
    r"\bou\b": "out",

    # Miscellaneous Contractions
    r"\bo'clock\b": "of the clock",
    r"\bma'am\b": "madam",
    r"\blet's\b": "let us",
    r"\bthat's\b": "that is",
    r"\bthere's\b": "there is",
    r"\bwhat's\b": "what is",
    r"\bwhere's\b": "where is",
    r"\bwho's\b": "who is",
    r"\bhow's\b": "how is",
    r"\bhere's\b": "here is",
    r"\bs'pose\b": "suppose",
    r"\bc'mon\b": "come on",
    r"\by'know\b": "you know",

    # Slang Abbreviations
    r"\bcuz\b": "because",
    r"\bbro\b": "brother",
    r"\bsis\b": "sister",
    r"\bidk\b": "i do not know",
    r"\bikr\b": "i know right",
    r"\btho\b": "though",
    r"\btil\b": "until",
    r"\bomg\b": "oh my god",
    r"\blol\b": "laugh out loud",
    r"\bbrb\b": "be right back",
    r"\bttyl\b": "talk to you later",
    r"\bsmh\b": "shaking my head"
}

    for contraction, expanded in contractions.items():
        text = re.sub(contraction, expanded, text)
    return text


## *Handeling emojis*

- First, I want to check how many sentences contains emoji, this will be used as a baseline to see if the emojis have been correcctly removed or not

- Emojis can be used in training because they are encoded as UNICODE characters, but we will have to link each unicode/emoji to a word that best describes, and create e;bedding for it, so treat the emojis as a word that makes sense

In [None]:
# Function to check if a string contains emojis
def contains_emoji(text):
    return bool(emoji.emoji_list(text))  # Returns True if emojis are found

# Filter rows containing emojis


In [None]:
# Function to remove emojis
def remove_emojis(text):
    return emoji.replace_emoji(text, "")  # Replace emojis with an empty string

## *URLs, mentions and Hashtag and special charac*

- Remove hashtag mentions and urls since they don't have a meaning when working with sentiment analysis

- Our goal is to find if the comment is good or bad, objects mentioned above aren't really useful and can make the model more complexe for no reason

In [None]:
def clean_tweet(text):
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)           # Remove mentions
    text = re.sub(r"#", "", text)              # Remove hashtags
    text = re.sub(r"[^\w\s]", "", text)        # Remove Special Characters
    return text.strip()



## *Punctuation*





Punctuations are concidered as token, and they are not really useful for sentiment analysis so we can safely remove them

In [None]:
def remove_punctuation(text):
  text.translate(str.maketrans('', '',
                               string.punctuation))
  return text


In [None]:
df_train.iloc[0,-1]

'I am coming to the borders and I will kill you all,'

In [None]:
def remove_stopwords_spacy(text):
    doc = nlp(text)
    # Keep tokens that are not stopwords and not punctuation
    filtered_tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(filtered_tokens)

# Example usage


## *Spelling mistakes*

There are many ways to correct spelling mistakes, I will list the most common ones and describe the one I will be using:
- **Dictionary Rule Base**, use a dictionary to check if each word is spelled correctly
- **Edit distance**, see how many "deletion", "insertation", "updates" are needed for a mistaken word to be correct, the less operation it needs to form a word means this is the word we need
- **Phonetic**, we convert the word to "sound" and see to which sound/word it is closest too
- **Language Model**: we can either use statistical language model like MLM (Mask Language Model) or neural language model like chatGPT

I will implement the statistical language model technique, it follows these steps:
1. Check each of word of each sentence against a dictionary, and replace each words not in the dictionary with [MASK]
2.



In [None]:
def get_prediction(predictions):
  best_prediction=""
  predictions = list(predictions)
  predictions = predictions[-1]
  if type(predictions) == list:
    predictions = predictions[0]
    predictions = predictions["token_str"]
  else:
    predictions = predictions["token_str"]

  if predictions in string.punctuation:
    predictions = ""
  return predictions


In [142]:
from transformers import pipeline
from spellchecker import SpellChecker

# Initialize BERT MLM pipeline and SpellChecker
fill_mask = pipeline("fill-mask", model="bert-base-uncased", device="cuda")
spell = SpellChecker()

def correct_with_mlm(text):
    # Tokenize the sentence into words
    words = text.split()
    misspelled_words = spell.unknown(words)  # Identify misspelled words

    # If no misspelled words, return the original text
    if not misspelled_words:
        return text

    # Replace misspelled words with [MASK]
    masked_sentence = " ".join("[MASK]" if word in misspelled_words else word for word in words)

    # Use the MLM to predict replacements for all [MASK] tokens
    replacements = []
    for _ in misspelled_words:
        # Get MLM predictions for the first [MASK] in the sentence
        predictions = fill_mask(masked_sentence)
        # Extract the top prediction
        replacement = predictions[0]['token_str']
        replacements.append(replacement)
        # Replace only the first [MASK] with the predicted word
        masked_sentence = masked_sentence.replace("[MASK]", replacement, 1)


    # Return the corrected sentence
    return masked_sentence



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [182]:
from symspellpy import SymSpell, Verbosity

# Initialize SymSpell
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load a dictionary (prebuilt or custom)
# Download one from: https://github.com/wolfgarbe/SymSpell/tree/master/SymSpell/FrequencyDictionary
dictionary_path = "frequency_dictionary_en_82_765.txt"  # Path to your dictionary
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

def correct_spelling(text):
    words = text.split()
    corrected_words = []

    for word in words:
        # Get the most likely correction for each word
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        if suggestions:
            corrected_words.append(suggestions[0].term)  # Use the top suggestion
        else:
            corrected_words.append(word)  # Keep the original if no suggestion is found

    return " ".join(corrected_words)


## **Feature Engineering**

It involves creating, selecting, or modifying features based on domain knowledge and data characteristics to highlight patterns or relationships relevant to the task.

- Creating new column
- Tokenizing
- Encoding Labels



**We have a column category, we can merge it with our text column to give more context to our data**

In [None]:
df_train.head()

Unnamed: 0,twitter_id,category,label,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [None]:
df_train

Unnamed: 0,twitter_id,category,label,text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...
74680,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [None]:
def merge_cat_text(row):
  text = row["text"]
  category = row["category"].lower()
  if  category not in text.split(" "):
    row["text"] = category + " " + text
  return row

In [None]:
def find_max_length(df):
  max_length = df["text"].apply(len).max()
  return max_length

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize_data(text, max_length=0,mode="train"):
  if mode=="train":
      tokens = tokenizer(text, truncation=True, padding=True)
      return tokens["input_ids"], tokens["attention_mask"]
  else:
    tokens = tokenizer(text, truncation=True, padding=True, max_length=max_length)
    return tokens["input_ids"], tokens["attention_mask"]



In [None]:
dict_target = {
    "Positive":0,
    "Neutral":1,
    "Negative":2,
    "Irrelevant":3
}

def encode_target(df):
  for key,values in dict_target.items():
    df["label"]=df["label"].replace(key, values)
  return df



In [None]:
def add_padding(max_length,tokens, pad_value=0):
  length_token = len(tokens)
  if length_token < max_length:
    padding_to_add = max_length - length_token
    for i in range(length_token,max_length):
      tokens.append(pad_value)
  return tokens

## **Pipeline Data/training/testing**

- Pipeline will streamline the process , the data will pass through each steps above.

- The pipeline contains the previous function, lower, punctuation removal, spellchecking ...

- Feature Engineering such as encoding and tokenizer

- For inference we need to create another minimized version of this pipeline


In [190]:
def preprocess_pipeline(df, mode, max_length = 0):
    if not mode=="train":
      print(f"\n\033[1;34m=== Starting Preprocessing Pipeline: Mode = {mode.upper()} ===\033[0m")

      # Step 1: Dropping missing rows
      print("\n\033[1;32m> Dropping missing rows...\033[0m")
      initial_rows = len(df)
      df = drop_missing_rows(df)
      print(f"  - Dropped {initial_rows - len(df)} rows with missing values. Remaining rows: {len(df)}")

      # Step 2: Lowercasing text
      print("\n\033[1;32m> Converting text to lowercase...\033[0m")
      df = lower_text(df)

      # Step 3: Handling English contractions
      print("\n\033[1;32m> Handling English contractions...\033[0m")
      df["text"] = df["text"].swifter.apply(lambda row: handle_english_contractions(row))

      # Step 4: Checking and removing emojis
      print("\n\033[1;32m> Checking and removing emojis...\033[0m")
      emoji_count_before = len(df[df['text'].swifter.apply(contains_emoji)])
      print(f"  - Number of rows with emojis before removal: {emoji_count_before}")
      df['text'] = df['text'].swifter.apply(remove_emojis)
      emoji_count_after = len(df[df['text'].swifter.apply(contains_emoji)])
      print(f"  - Number of rows with emojis after removal: {emoji_count_after}")

      # Step 5: Cleaning text (e.g., tweet cleaning)
      print("\n\033[1;32m> Cleaning text...\033[0m")
      df["text"] = df["text"].swifter.apply(clean_tweet)
      df["text"] = df["text"].replace("  ", " ")
      print("  - Text cleaned successfully.")

      # Step 6: Removing punctuation and stopwords
      print("\n\033[1;32m> Removing punctuation and stopwords...\033[0m")
      df["text"] = df["text"].swifter.apply(lambda row: remove_punctuation(row))
      df["text"] = df["text"].swifter.apply(remove_stopwords_spacy)

    # Mode-specific preprocessing
    if mode == "train":
        print("\033[1;36m> Mode: TRAIN - Loading preprocessed training data for testing purposes.\033[0m")
        df = pd.read_csv("process/sa_data_train.csv")
    else:
        print("\033[1;36m> Mode: VAL - Correcting text with MLM...\033[0m")
        df["text"] = df["text"].swifter.apply(lambda row: correct_spelling(row))
        df.to_csv("process/sa_data_val.csv")
        print("\033[1;32m  - Validation data saved to 'process/sa_data_val.csv'.\033[0m")

    # Feature engineering
    print("\n\033[1;34m=== Feature Engineering ===\033[0m")
    df, max_length = feature_engineering(df, mode, max_length)

    # Completion message
    print("\033[1;32m> Preprocessing pipeline completed successfully!\033[0m\n")
    return df, max_length


def feature_engineering(df, mode, max_length):
    """
    Handles feature engineering tasks such as encoding targets, tokenization, and padding.
    """
    print("\033[1;33m> Encoding target labels...\033[0m")
    df = encode_target(df)

    # Drop missing rows post-encoding
    print("\n\033[1;33m> Dropping missing rows after encoding...\033[0m")
    initial_rows = len(df)
    df = drop_missing_rows(df)
    print(f"\033[1;32m  - Dropped {initial_rows - len(df)} rows. Remaining rows: {len(df)}\033[0m")

    # Merge category with text
    print("\n\033[1;33m> Merging category with text...\033[0m")
    print(df)
    df = df.swifter.apply(lambda row: merge_cat_text(row), axis=1)

    # Tokenization process
    print("\n\033[1;33m> Tokenizing text...\033[0m")
    if mode == "train":
        # Tokenize text and calculate max_length for training data
        df[["tokens", "attention_mask"]] = df["text"].swifter.apply(
            lambda row: pd.Series(tokenize_data(row)), axis=1
        )
        print("\033[1;32m  - Tokenization completed for training data.\033[0m")
        max_length = df["tokens"].apply(len).max()
        print(f"\033[1;36m  - Calculated max sequence length: {max_length}\033[0m")

        # Add padding to tokens and attention masks
        print("\033[1;33m> Padding tokens and attention masks...\033[0m")
        df["tokens"] = df["tokens"].apply(lambda tokens: add_padding(max_length, tokens))
        df["attention_mask"] = df["attention_mask"].apply(lambda mask: add_padding(max_length, mask))
        print("\033[1;32m  - Padding completed for training data.\033[0m")

    else:
        # Tokenize text using provided max_length for validation data
        df[["tokens", "attention_mask"]] = df["text"].swifter.apply(
            lambda row: pd.Series(tokenize_data(row, max_length=max_length, mode="val")), axis=1
        )
        print("\033[1;32m  - Tokenization completed for validation data.\033[0m")

    print("\033[1;34m> Feature engineering completed successfully.\033[0m")
    return df, max_length





In [191]:
df_test.head()

Unnamed: 0,twitter_id,category,label,text,tokens,attention_mask
0,352,Amazon,1,news amazon boss rejects claims company act...,"[101, 2739, 9733, 5795, 19164, 4447, 2194, 605...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,8312,Microsoft,2,microsoft pay word functions poorly,"[101, 7513, 3477, 2773, 4972, 9996, 102]","[1, 1, 1, 1, 1, 1, 1]"
2,4371,CS-GO,2,cs-go his matchmaking closet hacking truly awf...,"[101, 20116, 1011, 2175, 2010, 2674, 12614, 93...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,4433,Google,1,google president slapping ##er face commit unl...,"[101, 8224, 2343, 22021, 1001, 1001, 9413, 222...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,6273,FIFA,2,fifa hi of of of cellar past 13 years little s...,"[101, 5713, 7632, 1997, 1997, 1997, 15423, 262...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [192]:
# Example usage
print("\033[1;31m*** Running Preprocessing Pipeline for Training Data ***\033[0m")
df_final, max_length = preprocess_pipeline(df_train, "train")

# print("\033[1;31m*** Running Preprocessing Pipeline for Validation Data ***\033[0m")
# df_test, max_length = preprocess_pipeline(df_test, "val", max_length)

[1;31m*** Running Preprocessing Pipeline for Validation Data ***[0m

[1;34m=== Starting Preprocessing Pipeline: Mode = VAL ===[0m

[1;32m> Dropping missing rows...[0m
No Null Values found
No Null Values found
No Null Values found
No Null Values found
No Null Values found
No Null Values found
 
  - Dropped 0 rows with missing values. Remaining rows: 1000

[1;32m> Converting text to lowercase...[0m

[1;32m> Handling English contractions...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]


[1;32m> Checking and removing emojis...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

  - Number of rows with emojis before removal: 0


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

  - Number of rows with emojis after removal: 0

[1;32m> Cleaning text...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

  - Text cleaned successfully.

[1;32m> Removing punctuation and stopwords...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

[1;36m> Mode: VAL - Correcting text with MLM...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

[1;32m  - Validation data saved to 'process/sa_data_val.csv'.[0m

[1;34m=== Feature Engineering ===[0m
[1;33m> Encoding target labels...[0m

[1;33m> Dropping missing rows after encoding...[0m
No Null Values found
No Null Values found
No Null Values found
No Null Values found
No Null Values found
No Null Values found
 
[1;32m  - Dropped 0 rows. Remaining rows: 1000[0m

[1;33m> Merging category with text...[0m
    twitter_id         category  label  \
0          352           Amazon      1   
1         8312        Microsoft      2   
2         4371            CS-GO      2   
3         4433           Google      1   
4         6273             FIFA      2   
..         ...              ...    ...   
995       4359            CS-GO      3   
996       2652      Borderlands      0   
997       8069        Microsoft      0   
998       6960  johnson&johnson      1   
999       3364         Facebook      3   

                                                  text  \
0    news ama

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]


[1;33m> Tokenizing text...[0m




Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

[1;32m  - Tokenization completed for validation data.[0m
[1;34m> Feature engineering completed successfully.[0m
[1;32m> Preprocessing pipeline completed successfully![0m



## **Training**

After cleaning and manioulating the dataset, it's time to start training:

- Split
- Transform into pytorch format (Data Loader)
- Create Attention mechanism to find out relevent words
- Create model using nn.Module
- Create validation and training loop


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
df_final_train = df_final[["text","tokens","attention_mask","label"]]

## **Split Data**

- There are 3 types of data when training a ML model:
  1. *train*: Used to train the model, the more data the better
  2. *validation*: Used to make sure that the model don't overfit and is training well, if **val_loss increase but train_loss decrease**, this is menas that the model can't generalize well, **overfitting**
  3. *test*: Used to evaluate the model after training based on metrcis


**We already have train/test data now we need validation**

In [None]:
def split_data(df, split_size=0.33):
  df_train, df_val = train_test_split(df,random_state=42,test_size=split_size)
  df_train = df_train.reset_index(drop=True)
  df_val = df_val.reset_index(drop=True)
  return df_train, df_val

df_train, df_val = split_data(df_final_train)

## **Dataset and DataLoader Pytorch**

Pytorch expect us to have our data in a specific format for several reason:

- **Encapuslate the logic** of how our data is transformed (e.g, expend arrays, add dimension, normalize ..)
- **Standard interface** to access the data, our class needs to have three methods,` __init__, __getitem__, __len__`

**DataLoader**:  in PyTorch is a utility that helps efficiently load data from a dataset during model training or evaluation. It is designed to work seamlessly with PyTorch's Dataset class, handling batching, shuffling, and parallel data loading.


In [None]:


class TextDataset(Dataset):

    # Initialize
    def __init__(self, tokens,mask, labels=None, max_length=70, pad_value=0):
        self.tokens = tokens  # List of sentences, each as a list of word IDs
        self.mask = mask
        self.labels = labels  # List of labels, optional

    # Length of data
    def __len__(self):
        return len(self.tokens)

    # Ho to access data
    def __getitem__(self, idx):

        # Tokens
        tokens = self.tokens[idx]

        # Attention Mask
        mask = self.mask[idx]

        # Convert to tensor (pytorch data structure)
        tokens = torch.tensor(tokens, dtype=torch.long)
        mask = torch.tensor(mask, dtype=torch.long)

        # Create dict
        item = {
            "input_ids":tokens,
            "attention_mask": mask
        }

        if self.labels is not None:
            # Add Label/Target to dict
            item["label"] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item


In [None]:
train_dataset = TextDataset(df_train["tokens"],
                            df_train["attention_mask"],
                            df_train["label"])

In [None]:
val_dataset = TextDataset(df_val["tokens"],
                            df_val["attention_mask"],
                            df_val["label"])

## **Attention mehcanism**

The main goal of the attention mechanism, is to apply a degree of importance to words in a sentence, how relevant a word is a sentence.

There are two main types of attention:
1. Additive attention (used now)
2. Transformer attention (used in LLMs and transformers) based on dot product

**Additive** attention also called *Bahdanau style*, is based on a linear layer, and softmax function.
  1. Compute the score using Linear Layer, score is based on each token, and the final output of LSTM which is the "final" representation of the sentence
  2. After computing the score we get the probabilities for the softmax
  3. we compute a weighted sum which is sum((token_vector) * probabilities)
  4. We get a final vector that represent not a contextualzed word in the sentence, but the importance of each word in the sentence

In [None]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        # Linear Layer
        self.attention = nn.Linear(hidden_dim * 2, 1)

    def forward(self, lstm_output, attention_mask):
        # Calculate attention scores
        scores = self.attention(lstm_output).squeeze(-1)  # Shape: (batch_size, seq_length)
        scores = scores.masked_fill(~attention_mask.bool(), -1e9)  # Mask padding
        weights = torch.softmax(scores, dim=1)  # Shape: (batch_size, seq_length)

        # Apply attention weights
        weighted_output = torch.sum(lstm_output * weights.unsqueeze(-1), dim=1)  # Shape: (batch_size, hidden_dim*2)
        return weighted_output

In [None]:
import torch
import torch.nn as nn

class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx):
        super(SentimentModel, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

        # Bidirectional LSTM
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Attention mechanism
        self.attention = Attention(hidden_dim)

        # Dropout layer
        self.dropout = nn.Dropout(0.3)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, input_ids, attention_mask):
        # Embedding
        embedded = self.dropout(self.embedding(input_ids))

        # LSTM
        lstm_output, _ = self.lstm(embedded)  # lstm_output: (batch_size, seq_length, hidden_dim*2)

        # Attention
        weighted_output = self.attention(lstm_output, attention_mask)

        # Dropout and fully connected
        logits = self.fc(self.dropout(weighted_output))

        return logits  # Raw logits


In [None]:

# Hyperparameters
vocab_size = 30522  # Example value
embed_dim = 128
hidden_dim = 256
output_dim = 4  # Positive, Negative, Neutral
pad_idx = 0
max_length = max_length
batch_size = 32
epochs = 35

# Instantiate model
model = SentimentModel(vocab_size, embed_dim, hidden_dim, output_dim, pad_idx)
device = "cuda"
model.to(device)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.00008)
criterion = nn.CrossEntropyLoss()

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


# Training and Validation Loop
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    model.train()  # Set model to training mode

    train_loss = 0
    train_correct = 0
    train_total = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)


        # Forward pass (Finding Loss Function)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass (Applying gradient to reduce loss function and modify weights)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate training loss and accuracy
        train_loss += loss.item()
        predictions = torch.argmax(outputs, dim=1)
        train_correct += (predictions == labels).sum().item()
        train_total += labels.size(0)

    train_accuracy = train_correct / train_total
    print(f"Training Loss: {train_loss / len(train_loader):.4f}, Training Accuracy: {train_accuracy:.4f}")

    # Validation Loop
    model.eval()  # Set model to evaluation mode
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            # Accumulate validation loss and accuracy
            val_loss += loss.item()
            predictions = torch.argmax(outputs, dim=1)
            val_correct += (predictions == labels).sum().item()
            val_total += labels.size(0)

    val_accuracy = val_correct / val_total
    print(f"Validation Loss: {val_loss / len(val_loader):.4f}, Validation Accuracy: {val_accuracy:.4f}")



Epoch 1/35
Training Loss: 1.2243, Training Accuracy: 0.4452
Validation Loss: 1.1218, Validation Accuracy: 0.5093

Epoch 2/35
Training Loss: 1.1057, Training Accuracy: 0.5216
Validation Loss: 1.0426, Validation Accuracy: 0.5594

Epoch 3/35
Training Loss: 1.0329, Training Accuracy: 0.5636
Validation Loss: 0.9850, Validation Accuracy: 0.5877

Epoch 4/35
Training Loss: 0.9615, Training Accuracy: 0.6023
Validation Loss: 0.9307, Validation Accuracy: 0.6155

Epoch 5/35
Training Loss: 0.9001, Training Accuracy: 0.6324
Validation Loss: 0.8865, Validation Accuracy: 0.6428

Epoch 6/35
Training Loss: 0.8439, Training Accuracy: 0.6581
Validation Loss: 0.8372, Validation Accuracy: 0.6661

Epoch 7/35
Training Loss: 0.7893, Training Accuracy: 0.6862
Validation Loss: 0.7996, Validation Accuracy: 0.6844

Epoch 8/35
Training Loss: 0.7372, Training Accuracy: 0.7101
Validation Loss: 0.7702, Validation Accuracy: 0.6999

Epoch 9/35
Training Loss: 0.6891, Training Accuracy: 0.7305
Validation Loss: 0.7287, Va

In [None]:
df_test, max_length = preprocess_pipeline(df_test, "val", max_length)


[1;34m=== Starting Preprocessing Pipeline: Mode = VAL ===[0m

[1;32m> Dropping missing rows...[0m
No Null Values found
No Null Values found
No Null Values found
No Null Values found
 
  - Dropped 0 rows with missing values. Remaining rows: 1000

[1;32m> Converting text to lowercase...[0m

[1;32m> Handling English contractions...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]


[1;32m> Checking and removing emojis...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

  - Number of rows with emojis before removal: 181


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

  - Number of rows with emojis after removal: 0

[1;32m> Cleaning text...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

  - Text cleaned successfully.

[1;32m> Removing punctuation and stopwords...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

[1;36m> Mode: VAL - Correcting text with MLM...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

[1;32m  - Validation data saved to 'process/sa_data_val.csv'.[0m

[1;34m=== Feature Engineering ===[0m
[1;33m> Encoding target labels...[0m

[1;33m> Dropping missing rows after encoding...[0m
No Null Values found
No Null Values found
No Null Values found
No Null Values found
 
[1;32m  - Dropped 0 rows. Remaining rows: 1000[0m

[1;33m> Merging category with text...[0m


Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]


[1;33m> Tokenizing text...[0m




Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

[1;32m  - Tokenization completed for validation data.[0m
[1;34m> Feature engineering completed successfully.[0m
[1;32m> Preprocessing pipeline completed successfully![0m



### Save the model

We will use the model later for inference and load it

In [None]:
import os

# Define the folder name
folder_name = "model"

# Check if the folder exists, if not, create it
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' created.")
else:
    print(f"Folder '{folder_name}' already exists.")

torch.save(model.state_dict(), "model/sa_twitter.pt")

In [None]:
df_test.head()

Unnamed: 0,twitter_id,category,label,text,tokens,attention_mask
0,352,Amazon,1,news amazon boss rejects claims company act...,"[101, 2739, 9733, 5795, 19164, 4447, 2194, 605...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,8312,Microsoft,2,microsoft pay word functions poorly,"[101, 7513, 3477, 2773, 4972, 9996, 102]","[1, 1, 1, 1, 1, 1, 1]"
2,4371,CS-GO,2,cs-go his matchmaking closet hacking truly awf...,"[101, 20116, 1011, 2175, 2010, 2674, 12614, 93...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,4433,Google,1,google president slapping ##er face commit unl...,"[101, 8224, 2343, 22021, 1001, 1001, 9413, 222...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,6273,FIFA,2,fifa hi of of of cellar past 13 years little s...,"[101, 5713, 7632, 1997, 1997, 1997, 15423, 262...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [None]:
target_test = df_test["label"]

In [None]:
input_test = df_test["tokens"]
attention_test = df_test["attention_mask"]

In [201]:
total_correct = 0
max_test_num = len(df_test)
for i in range(0,max_test_num):
  data_index = i
  input = input_test[data_index]
  input = torch.tensor(input)
  attention = attention_test[data_index]
  attention = torch.tensor(attention).unsqueeze(0)

  input = input.to("cuda")
  attention = attention.to("cuda")
  pred = model(input, attention)
  pred = torch.argmax(pred).item()
  target = target_test[data_index]
  if pred - target == 0:
    total_correct+=1

accuracy = total_correct / max_test_num

print(f"Accuracy over {max_test_num} is {accuracy}")

Accuracy over 1000 is 0.936


## **Inference pipeline**

- Different then previous one
- Process a single text and not a whole dataframe
- some functions like feature engineering are not useful since we don't have a category
- Used to make predictions

In [114]:
def decode_target(pred):
  pred = pred.argmax()
  for key, value in dict_target.items():
    if value == pred.item():
      return key

In [185]:
def preprocess_text_for_inference(text, tokenizer, max_length=128):
    """
    Preprocesses a single text for inference.

    Args:
        text (str): The input text to preprocess.
        tokenizer: The tokenizer to use for tokenization (e.g., BERT tokenizer).
        max_length (int): The maximum sequence length for tokenization and padding.

    Returns:
        dict: A dictionary containing the tokenized input IDs and attention mask.
    """
    print("\033[1;34m=== Starting Preprocessing Pipeline for Inference ===\033[0m")




    # Step 1: Lowercasing text
    print("\033[1;32m> Lowercasing text...\033[0m")
    text = text.lower()

    # Step 2: Handling English contractions
    print("\033[1;32m> Handling English contractions...\033[0m")
    text = handle_english_contractions(text)

    # Step 3: Removing emojis
    print("\033[1;32m> Removing emojis...\033[0m")
    text = remove_emojis(text)

    # Step 4: Cleaning text (e.g., tweet cleaning)
    print("\033[1;32m> Cleaning text...\033[0m")
    text = clean_tweet(text)
    text = text.replace("  ", " ")

    # Step 5: Removing punctuation and stopwords
    print("\033[1;32m> Removing punctuation and stopwords...\033[0m")
    text = remove_punctuation(text)
    text = remove_stopwords_spacy(text)

    print("\033[1;36m> Mode: VAL - Correcting text ...\033[0m")
    text = correct_spelling(text)
    print(text)

    # Step 6: Tokenizing and Padding
    print("\033[1;32m> Tokenizing and padding text...\033[0m")
    tokenized_output = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"  # PyTorch tensors
    )
    inputs = tokenized_output["input_ids"].to("cuda")
    attention = tokenized_output["attention_mask"].to("cuda")

    pred = model(inputs, attention)
    pred = decode_target(pred)



    print("\033[1;32m> Preprocessing pipeline completed successfully!\033[0m")
    return pred

In [200]:
text = "Firstly, at a basic level, the output of an LSTM at a particular point in time is dependant on three things"
preprocess_text_for_inference(text, tokenizer)

[1;34m=== Starting Preprocessing Pipeline for Inference ===[0m
[1;32m> Lowercasing text...[0m
[1;32m> Handling English contractions...[0m
[1;32m> Removing emojis...[0m
[1;32m> Cleaning text...[0m
[1;32m> Removing punctuation and stopwords...[0m
[1;36m> Mode: VAL - Correcting text ...[0m
firstly basic level output list particular point time dependant things
[1;32m> Tokenizing and padding text...[0m
[1;32m> Preprocessing pipeline completed successfully![0m


'Neutral'