In [1]:
# Importing libraries to work with
import re
import os
import nltk
import spacy
import torch
import numpy as np
import scipy as sci
import polars as pl
import pandas as pd
import gensim as gns
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# For the sake of Preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Word Embedding
from gensim.models import Word2Vec

In [2]:
# Report the version of every library the notebook depends on, for reproducibility
version_list = dict(
    [
        ("NumPy Version:", np.__version__),
        ("Polars Version:", pl.__version__),
        ("MatPlotLib Version:", mpl.__version__),
        ("Seaborn Version:", sns.__version__),
        ("PyTorch Version:", torch.__version__),
        ("NLTK Version:", nltk.__version__),
        ("SpaCy Version:", spacy.__version__),
        ("Gensim Version:", gns.__version__),
        ("SciPy Version:", sci.__version__),
    ]
)

# Each entry is a (label, version) pair; print them side by side
for entry in version_list.items():
    print(*entry)

NumPy Version: 1.26.4
Polars Version: 1.12.0
MatPlotLib Version: 3.9.2
Seaborn Version: 0.13.2
PyTorch Version: 2.5.1+cpu
NLTK Version: 3.9.1
SpaCy Version: 3.8.2
Gensim Version: 4.3.3
SciPy Version: 1.13.1


In [3]:
# Directory the NLTK data packages are downloaded into.
# The path is relative to the working directory and uses Windows separators.
NLTK_LIB_PATH = ".\\venv_nlp\\Lib\\nltk_data"

try:
    os.mkdir(NLTK_LIB_PATH)

    print("Directory for NLTK Packages Created.. Installing (Hopefully)")
except FileExistsError:
    print("Directory Exists.")
except OSError as err:
    # Catch filesystem failures only (missing parent, permissions, ...) and
    # show the reason; a bare `except:` would also hide KeyboardInterrupt.
    print("Couldn't Make Directory.", err)

# Make sure NLTK also searches the custom directory, otherwise packages
# downloaded there may not be found by `nltk.data.find` / the corpus readers.
if NLTK_LIB_PATH not in nltk.data.path:
    nltk.data.path.append(NLTK_LIB_PATH)

# (resource to look for, package id to download, message when already present)
# NOTE(review): recent NLTK releases load 'punkt_tab' instead of 'punkt' in
# word_tokenize — confirm and add it here if `tokeniser` is put back into use.
NLTK_PACKAGES = (
    ("tokenizers\\punkt.zip", 'punkt', "Punctuation Data Exists."),       # Punkt tokenizer models
    ("corpora\\stopwords.zip", 'stopwords', "Stopwords Package Exists."), # Stopword lists
    ("corpora\\wordnet.zip", 'wordnet', "Wordnet Package Exists."),       # WordNet corpus
)

# Download each package only when it is not already available locally
for resource_name, package_id, found_message in NLTK_PACKAGES:
    try:
        nltk.data.find(resource_name)
        print(found_message)
    except LookupError:
        nltk.download(package_id, download_dir = NLTK_LIB_PATH)

Directory Exists.
Punctuation Data Exists.
Stopwords Package Exists.
Wordnet Package Exists.


In [4]:
# Load spaCy's small English pipeline.
# NOTE(review): the same pipeline is loaded again as `model` in a later cell and
# `lemmer` is not referenced in the visible cells — confirm whether it is needed.
lemmer = spacy.load('en_core_web_sm')

# Data Loading
Importing our csv into our workspace.

In [5]:
# Read the header-less CSV and give its four columns explicit names
text_data = pl.read_csv(
    "datasets/twitter_training.csv",
    has_header=False,
    new_columns=["tweet_id", "entity", "sentiment", "tweet_content"],
)

In [6]:
# Preview the first 10 rows to check the column names and a sample of the content
text_data.head(10)

tweet_id,entity,sentiment,tweet_content
i64,str,str,str
2401,"""Borderlands""","""Positive""","""im getting on borderlands and …"
2401,"""Borderlands""","""Positive""","""I am coming to the borders and…"
2401,"""Borderlands""","""Positive""","""im getting on borderlands and …"
2401,"""Borderlands""","""Positive""","""im coming on borderlands and i…"
2401,"""Borderlands""","""Positive""","""im getting on borderlands 2 an…"
2401,"""Borderlands""","""Positive""","""im getting into borderlands an…"
2402,"""Borderlands""","""Positive""","""So I spent a few hours making …"
2402,"""Borderlands""","""Positive""","""So I spent a couple of hours d…"
2402,"""Borderlands""","""Positive""","""So I spent a few hours doing s…"
2402,"""Borderlands""","""Positive""","""So I spent a few hours making …"


In [7]:
# Summary statistics; the `null_count` row reports the missing values per column
text_data.describe()

statistic,tweet_id,entity,sentiment,tweet_content
str,f64,str,str,str
"""count""",74682.0,"""74682""","""74682""","""73996"""
"""null_count""",0.0,"""0""","""0""","""686"""
"""mean""",6432.586165,,,
"""std""",3740.42787,,,
"""min""",1.0,"""Amazon""","""Irrelevant""",""" """
"""25%""",3195.0,,,
"""50%""",6422.0,,,
"""75%""",9601.0,,,
"""max""",13200.0,"""johnson&johnson""","""Positive""","""🧻 at Home Depot on Hanley... I…"


The `tweet_content` column has 686 nulls out of 74,682 rows, i.e. just under $1\%$ (about $0.92\%$), so we can safely drop those rows.

In [8]:
# Remove rows without tweet text, then discard the tweet_id column
text_data = text_data.drop_nulls('tweet_content').drop('tweet_id')

In [9]:
# Re-run the summary to confirm no nulls remain and tweet_id is gone
text_data.describe()

statistic,entity,sentiment,tweet_content
str,str,str,str
"""count""","""73996""","""73996""","""73996"""
"""null_count""","""0""","""0""","""0"""
"""mean""",,,
"""std""",,,
"""min""","""Amazon""","""Irrelevant""",""" """
"""25%""",,,
"""50%""",,,
"""75%""",,,
"""max""","""johnson&johnson""","""Positive""","""🧻 at Home Depot on Hanley... I…"


# Preprocessing
Preprocessing the text so it's somewhat cleaner than when obtained, so that the model doesn't struggle (Instead, we will :D ).

In [10]:
# Stopword set: NLTK's English list extended with the apostrophe-less "im"
stopwords = {*nltk.corpus.stopwords.words('english'), "im"}

In [11]:
# Define a regex function to remove special characters, links, etc.
def regex_cleanse(text: str, stop_words=None) -> str:
    """Strip links, mentions, hashtags, punctuation and stopwords from ``text``.

    Args:
        text: The text to clean. Stopword matching is case-sensitive, so the
            caller is expected to lowercase it first.
        stop_words: Optional collection of words to drop. When omitted, the
            notebook-level ``stopwords`` set is used.

    Returns:
        The remaining words joined by single spaces ("" if nothing is left).
    """
    if stop_words is None:
        stop_words = stopwords

    # Links: both http:// and https://, plus bare "www." addresses
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Twitter picture links. These must go BEFORE punctuation is stripped: the
    # old post-strip `pic\w+` pattern also ate ordinary words such as
    # "picture" or "pick".
    text = re.sub(r'pic\.twitter\.com\S*', '', text)
    text = re.sub(r'@\w+', '', text)       # @mentions
    text = re.sub(r'#\w+', '', text)       # #hashtags
    text = re.sub(r'[^\w\s]', '', text)    # anything that is not a word char or whitespace

    return " ".join(word for word in text.split() if word not in stop_words)

In [12]:
def tokeniser(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

In [13]:
# spaCy pipeline used by lemma()
model = spacy.load('en_core_web_sm')

def lemma(tokens):
    """Lemmatise text with the spaCy pipeline held in ``model``.

    Args:
        tokens: Despite the name, this is passed straight to ``model(...)``;
            the visible caller (``preprocess_text``) supplies a single string.

    Returns:
        list[str]: one lemma per spaCy token, excluding whitespace-only tokens.
    """
    doc = model(tokens)
    # Leading or repeated whitespace comes back from spaCy as its own token;
    # skip those so the result contains words only.
    return [token.lemma_ for token in doc if not token.is_space]

In [14]:
# Compiled once at definition time instead of on every call.
# The former catch-all range U+24C2..U+1F251 also covered CJK ideographs,
# Hangul, kana and other letters, so it deleted real text; it is replaced by
# the symbol blocks inside that span that actually hold emoji.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "\U0001F900-\U0001FAFF"  # supplemental / extended pictographs
                            "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
                            "\U00002702-\U000027B0"  # dingbats
                            "\U00002600-\U000026FF"  # miscellaneous symbols
                            "\U000025A0-\U000025FF"  # geometric shapes
                            "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                            "\U000024C2"             # circled M
                            "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block emoji symbols
                            "\U0000FE0F"             # emoji variation selector
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Only the matched characters are deleted; surrounding whitespace is left
    untouched, so the result can contain repeated or trailing spaces.
    Letters of any script (including CJK and Hangul) are preserved.
    """
    return _EMOJI_PATTERN.sub(r'', string)

In [15]:
# Define a text preprocessing function to apply to all rows
def preprocess_text(text: str) -> list[str]:
    """Turn one raw tweet into a list of lemmas.

    Steps, in order: lowercase, remove emoji, strip links / mentions /
    hashtags / punctuation / stopwords, lemmatise.

    Emoji removal runs before ``regex_cleanse`` because ``regex_cleanse``
    finishes by splitting on whitespace, filtering stopwords and re-joining
    with single spaces. In the opposite order, characters deleted by
    ``remove_emoji`` leave gaps (and possibly bare stopwords) behind that are
    handed to ``lemma`` unfiltered.

    Args:
        text: The raw tweet text.

    Returns:
        The value returned by ``lemma`` for the cleaned text.
    """
    text = remove_emoji(text.lower())
    text = regex_cleanse(text)
    return lemma(text)

In [16]:
# Smoke-test the pipeline on a toy sentence; the stopwords should be dropped
txt = preprocess_text("we chungus bungus ungus in this")

print(txt)

['chungus', 'bungus', 'ungus']


In [17]:
# Run the preprocessing pipeline on every tweet; tweet_content becomes a list of strings
cleaned = text_data.with_columns(
    pl.col('tweet_content').map_elements(
        preprocess_text,
        return_dtype=list[str],
    )
)

In [18]:
# Preview the result: tweet_content now holds a list of lemmas per tweet
cleaned.head()

entity,sentiment,tweet_content
str,str,list[str]
"""Borderlands""","""Positive""","[""get"", ""borderland"", ""murder""]"
"""Borderlands""","""Positive""","[""come"", ""border"", ""kill""]"
"""Borderlands""","""Positive""","[""get"", ""borderland"", ""kill""]"
"""Borderlands""","""Positive""","[""come"", ""borderland"", ""murder""]"
"""Borderlands""","""Positive""","[""get"", ""borderland"", … ""murder""]"


In [19]:
# Re-check row and null counts after preprocessing
cleaned.describe()

statistic,entity,sentiment,tweet_content
str,str,str,f64
"""count""","""73996""","""73996""",73996.0
"""null_count""","""0""","""0""",0.0
"""mean""",,,
"""std""",,,
"""min""","""Amazon""","""Irrelevant""",
"""25%""",,,
"""50%""",,,
"""75%""",,,
"""max""","""johnson&johnson""","""Positive""",


# Word Embedding
Turning the cleaned token lists into input for a word-embedding model (Word2Vec).

In [20]:
# Wrap the per-tweet token lists in a pandas Series
# (these are lists of word strings, not numeric vectors yet)
model_ready_text = pd.Series(cleaned['tweet_content'])

# Inspect the first entry
model_ready_text.iloc[0]

array(['get', 'borderland', 'murder'], dtype=object)

In [21]:
#word2vec_cbow = Word2Vec(model_ready_text, min_count = 5, vector_size=100,  sg = 0, workers = 10,)