In [94]:
# Importing libraries to work with
import re
import os
import nltk
import spacy
import torch
import string
import scipy as sci
import numpy as np
import polars as pl
import gensim as gns
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

In [95]:
# Display the libraries' versions used in this notebook
version_list = {"NumPy Version:": np.__version__,
                "Polars Version:": pl.__version__,
                "MatPlotLib Version:": mpl.__version__,
                "Seaborn Version:": sns.__version__,
                "PyTorch Version:": torch.__version__,
                "NLTK Version:": nltk.__version__,
                "SpaCy Version:": spacy.__version__,
                "Gensim Version:": gns.__version__,
                "SciPy Version:": sci.__version__}

for (k, v) in version_list.items():
    print(k, v)

NumPy Version: 1.26.4
Polars Version: 1.12.0
MatPlotLib Version: 3.9.2
Seaborn Version: 0.13.2
PyTorch Version: 2.5.1+cpu
NLTK Version: 3.9.1
SpaCy Version: 3.8.2
Gensim Version: 4.3.3
SciPy Version: 1.13.1


In [96]:
# Defining path to install NLTK libraries in
NLTK_LIB_PATH = ".\\venv_nlp\\Lib\\nltk_data"

try:
    os.mkdir(NLTK_LIB_PATH)

    print("Directory for NLTK Packages Created.. Installing (Hopefully)")

    # Download extra parts of the library to use
    try:
        nltk.data.find("tokenizers\\punkt.zip")                     # Punctuation
        print("Punctuation Data Exists.")
    except LookupError:
        nltk.download('punkt', download_dir = NLTK_LIB_PATH)

    try:
        nltk.data.find("corpora\\stopwords.zip")                    # Stopwords
        print("Stopwords Package Exists.")
    except LookupError:
        nltk.download('stopwords', download_dir = NLTK_LIB_PATH)

    try:
        nltk.data.find("corpora\\wordnet.zip")                      # Corpus
        print("Wordnet Package Exists.")
    except LookupError:
        nltk.download('wordnet', download_dir = NLTK_LIB_PATH)
except FileExistsError:
    print("Directory Exists.")
except:
    print("Couldn't Make Directory.")

Directory Exists.


# Data Loading
Importing our csv into our workspace.

In [97]:
# Loading dataframe
text_data = pl.read_csv("datasets/twitter_training.csv", has_header=False, new_columns = ["tweet_id", "entity", "sentiment", "tweet_content"])

In [98]:
# Viewing first 10 rows
text_data.head(10)

tweet_id,entity,sentiment,tweet_content
i64,str,str,str
2401,"""Borderlands""","""Positive""","""im getting on borderlands and …"
2401,"""Borderlands""","""Positive""","""I am coming to the borders and…"
2401,"""Borderlands""","""Positive""","""im getting on borderlands and …"
2401,"""Borderlands""","""Positive""","""im coming on borderlands and i…"
2401,"""Borderlands""","""Positive""","""im getting on borderlands 2 an…"
2401,"""Borderlands""","""Positive""","""im getting into borderlands an…"
2402,"""Borderlands""","""Positive""","""So I spent a few hours making …"
2402,"""Borderlands""","""Positive""","""So I spent a couple of hours d…"
2402,"""Borderlands""","""Positive""","""So I spent a few hours doing s…"
2402,"""Borderlands""","""Positive""","""So I spent a few hours making …"


In [99]:
# Checking for nulls
text_data.describe()

statistic,tweet_id,entity,sentiment,tweet_content
str,f64,str,str,str
"""count""",74682.0,"""74682""","""74682""","""73996"""
"""null_count""",0.0,"""0""","""0""","""686"""
"""mean""",6432.586165,,,
"""std""",3740.42787,,,
"""min""",1.0,"""Amazon""","""Irrelevant""",""" """
"""25%""",3195.0,,,
"""50%""",6422.0,,,
"""75%""",9601.0,,,
"""max""",13200.0,"""johnson&johnson""","""Positive""","""🧻 at Home Depot on Hanley... I…"


Considering our Null count within the tweet content column is practically $<1\%$ (to be exact $0.927\%$), we can safely drop those rows

In [100]:
text_data = text_data.drop_nulls('tweet_content')

In [101]:
text_data.describe()

statistic,tweet_id,entity,sentiment,tweet_content
str,f64,str,str,str
"""count""",73996.0,"""73996""","""73996""","""73996"""
"""null_count""",0.0,"""0""","""0""","""0"""
"""mean""",6430.279231,,,
"""std""",3737.660027,,,
"""min""",1.0,"""Amazon""","""Irrelevant""",""" """
"""25%""",3194.0,,,
"""50%""",6418.0,,,
"""75%""",9595.0,,,
"""max""",13200.0,"""johnson&johnson""","""Positive""","""🧻 at Home Depot on Hanley... I…"


In [102]:
# Create a stopword set
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.add('im')

In [103]:
# Define a text preprocessing function to apply to all rows
def preprocess_text(text: str):
    text = text.lower()
    text = re.sub(r'https\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = " ".join([word for word in text.split() if word not in stopwords])

    return text

In [104]:
cleaned = text_data.with_columns(pl.col('tweet_content').map_elements(preprocess_text, return_dtype = str))

In [105]:
cleaned.head(10)

tweet_id,entity,sentiment,tweet_content
i64,str,str,str
2401,"""Borderlands""","""Positive""","""getting borderlands murder"""
2401,"""Borderlands""","""Positive""","""coming borders kill"""
2401,"""Borderlands""","""Positive""","""getting borderlands kill"""
2401,"""Borderlands""","""Positive""","""coming borderlands murder"""
2401,"""Borderlands""","""Positive""","""getting borderlands 2 murder"""
2401,"""Borderlands""","""Positive""","""getting borderlands murder"""
2402,"""Borderlands""","""Positive""","""spent hours making something f…"
2402,"""Borderlands""","""Positive""","""spent couple hours something f…"
2402,"""Borderlands""","""Positive""","""spent hours something fun dont…"
2402,"""Borderlands""","""Positive""","""spent hours making something f…"
