In [None]:
# Importing libraries to work with
import re
import os
import nltk
import spacy
import torch
import numpy as np
import scipy as sci
import polars as pl
import pandas as pd
import gensim as gns
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# For the sake of Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Word Embedding
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# Display the libraries' versions used in this notebook
# (label -> version string, kept in the order they are printed)
version_list = dict([
    ("NumPy Version:", np.__version__),
    ("Polars Version:", pl.__version__),
    ("MatPlotLib Version:", mpl.__version__),
    ("Seaborn Version:", sns.__version__),
    ("PyTorch Version:", torch.__version__),
    ("NLTK Version:", nltk.__version__),
    ("SpaCy Version:", spacy.__version__),
    ("Gensim Version:", gns.__version__),
    ("SciPy Version:", sci.__version__),
])

for label, version in version_list.items():
    print(f"{label} {version}")

In [None]:
# Defining path to install NLTK libraries in
NLTK_LIB_PATH = ".\\venv_nlp\\Lib\\nltk_data"

# Defining download function
def download_libs():
    """Download the NLTK packages this notebook needs, skipping any already installed.

    Each package is looked up with ``nltk.data.find``; when the lookup raises
    ``LookupError`` the package is downloaded into ``NLTK_LIB_PATH``.
    Prints a confirmation line for every package that is already present.

    Returns:
        None
    """
    # NLTK only searches the directories listed in nltk.data.path, so register
    # the custom download directory; otherwise freshly downloaded data may not
    # be found by later lookups.
    nltk_dir = os.path.abspath(NLTK_LIB_PATH)
    if nltk_dir not in nltk.data.path:
        nltk.data.path.append(nltk_dir)

    # (resource to look up, package to download, message when already present)
    # Resource names use forward slashes, which NLTK accepts on every platform.
    required_packages = (
        ("tokenizers/punkt.zip", "punkt", "Punctuation Data Exists."),     # Punkt tokenizer data
        ("corpora/stopwords.zip", "stopwords", "Stopwords Package Exists."),  # Stopwords
        ("corpora/wordnet.zip", "wordnet", "Wordnet Package Exists."),      # Corpus
    )

    for resource, package, found_message in required_packages:
        try:
            nltk.data.find(resource)
            print(found_message)
        except LookupError:
            nltk.download(package, download_dir = NLTK_LIB_PATH)

# Make sure the NLTK data directory exists, then fetch any missing packages.
# Remember whether it was already there so the status message stays accurate.
nltk_dir_existed = os.path.isdir(NLTK_LIB_PATH)

try:
    # makedirs also creates missing parent folders and tolerates an existing directory
    os.makedirs(NLTK_LIB_PATH, exist_ok=True)
except OSError as err:
    # Raise instead of calling exit(): exit() shuts the kernel down and hides the cause
    raise RuntimeError(f"Couldn't Make Directory. ({NLTK_LIB_PATH})") from err

if nltk_dir_existed:
    print("Directory Exists.")
else:
    print("Directory for NLTK Packages Created.. Installing (Hopefully)")

download_libs()

# Data Loading
Importing our csv into our workspace.

In [None]:
# Loading dataframe; the file is read without a header row, so the column
# names are supplied here
text_data = pl.read_csv(
    "datasets/twitter_training.csv",
    has_header=False,
    new_columns=[
        "tweet_id",
        "entity",
        "sentiment",
        "tweet_content",
    ],
)

In [None]:
# Viewing the first 10 rows to sanity-check the column names assigned at load time
text_data.head(10)

In [None]:
# Checking for nulls: the summary table is expected to include a per-column
# null count alongside the other statistics
text_data.describe()

Since the null count in the tweet content column is under $1\%$ of the rows (to be exact, $0.927\%$), we can safely drop those rows.

In [None]:
# Dropping rows whose tweet text is null, plus the tweet_id column
text_data = text_data.drop_nulls('tweet_content')

# Guarded so the cell can be re-run: once tweet_id is gone, dropping it again would raise
if 'tweet_id' in text_data.columns:
    text_data = text_data.drop('tweet_id')

In [None]:
# Viewing the summary again, now that null tweets and the tweet_id column have been dropped
text_data.describe()

# Preprocessing
Preprocessing the text so it's somewhat cleaner than when obtained, so that the model doesn't struggle (Instead, we will :D ).

In [None]:
# Create a stopword set: NLTK's English list plus "im"
# (presumably "i'm" as it appears once punctuation has been stripped)
stopwords = {*nltk.corpus.stopwords.words('english'), "im"}

In [None]:
# Define a regex function to remove special characters, links, etc.
def regex_cleanse(text: str, stop_words=None) -> str:
    """Strip URLs, mentions, hashtags, picture links, punctuation and stopwords.

    Args:
        text: The raw tweet text. Stopword matching is case-sensitive, so
            lower-case the text first if the stopword set is lower-case.
        stop_words: Optional collection of words to drop. Defaults to the
            notebook-level ``stopwords`` set.

    Returns:
        The cleaned text, with the remaining words joined by single spaces.
    """
    stop_set = stopwords if stop_words is None else stop_words

    # URLs (both http and https)
    text = re.sub(r'https?\S+', '', text)

    # Picture links such as pic.twitter.com/<id>. This must run while the dots
    # are still present: matching "pic" + letters after punctuation removal
    # also ate ordinary words ("picture", "topics", "spicy").
    text = re.sub(r'\bpic\.\S+', '', text)

    # @<username>
    text = re.sub(r'@\w+', '', text)

    # #<word>
    text = re.sub(r'#\w+', '', text)

    # First stopword pass, before punctuation is stripped, so contractions
    # ("don't", "you're") still match their entry in the stopword set.
    # Curly apostrophes are normalised so they match as well.
    text = text.replace('\u2019', "'")
    text = " ".join(word for word in text.split() if word not in stop_set)

    # One character that doesn't belong to word or whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Second stopword pass: catches words that were glued to punctuation ("the,")
    return " ".join(word for word in text.split() if word not in stop_set)

In [None]:
# Tokenisation Function
def tokeniser(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
# Loading language model
model = spacy.load('en_core_web_sm')

# Lemmatiser
def lemma(tokens):
    """Run the spaCy pipeline over a piece of text and return its lemmas.

    Args:
        tokens: Despite the name, this is passed straight to the spaCy
            pipeline, which does the tokenising itself. preprocess_text calls
            it with a cleaned tweet string.

    Returns:
        A list with the lemma of every non-whitespace token, in order.
    """
    doc = model(tokens)
    # spaCy turns runs of extra spaces into their own whitespace tokens; skip
    # them so blank "words" do not end up in the embedding vocabulary.
    return [token.lemma_ for token in doc if not token.is_space]

In [None]:
# Removing emojis
# Compiled once instead of on every call. The ranges are limited to emoji and
# symbol blocks: the previous catch-all range U+24C2..U+1F251 also covered
# CJK, Hangul, kana and other scripts, so ordinary text was deleted too.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
    "\U0001F100-\U0001F251"  # enclosed characters, incl. regional-indicator flags
    "\U00002600-\U000027BF"  # miscellaneous symbols & dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols & arrows
    "\U000024C2"             # circled M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+"
)

def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Only the characters are deleted; surrounding whitespace is left as is, so
    removing an emoji between two spaces leaves both spaces behind.

    Args:
        string: The text to clean.

    Returns:
        The text without any character matched by ``_EMOJI_PATTERN``.
    """
    return _EMOJI_PATTERN.sub('', string)

In [None]:
# Define a text preprocessing function to apply to all rows
def preprocess_text(text: str) -> list[str]:
    """Turn one raw tweet into a list of lemmas.

    The text is lower-cased, passed through regex_cleanse and remove_emoji,
    and the result is handed to lemma, whose output is returned.
    """
    cleaned_text = regex_cleanse(text.lower())
    emoji_free_text = remove_emoji(cleaned_text)
    return lemma(emoji_free_text)

In [None]:
# Creating a cleaned-preprocessed dataset: every tweet_content value is run
# through preprocess_text
cleaned = text_data.with_columns(
    pl.col('tweet_content').map_elements(
        preprocess_text,
        return_dtype = list[str],
    )
)

In [None]:
# Viewing the first rows of the preprocessed frame to check the output of preprocess_text
cleaned.head()

# Word Embedding
Training Word2Vec word embeddings (CBOW and Skip-gram) on the preprocessed tweets.

In [None]:
# Pulling the preprocessed tweet_content column out as a plain Python list
# (one entry per tweet) to use as the Word2Vec training corpus
model_ready_text = cleaned['tweet_content'].to_list()

# Peek at the first entry
model_ready_text[0]

In [None]:
# Training a Continuous Bag of Words, Word Embedding (sg = 0)
word2vec_cbow = Word2Vec(
    model_ready_text,
    min_count = 5,
    vector_size = 100,
    sg = 0,
    workers = 10,
)

In [None]:
# Viewing the CBOW vector for the word 'nvidia'
# NOTE(review): this lookup presumably raises KeyError if 'nvidia' did not reach min_count — confirm
word2vec_cbow.wv['nvidia']

In [None]:
# Training a Skip-gram Word Embedding (sg = 1)
word2vec_skgrm = Word2Vec(
    model_ready_text,
    min_count = 5,
    vector_size = 100,
    sg = 1,
    workers = 10,
)

In [None]:
# Viewing the Skip-gram vector for the same word, for comparison with the CBOW one
word2vec_skgrm.wv['nvidia']