### Text Preprocessing

Load the COVID-19 tweets dataset from `covid19_tweets.csv`.

In [None]:
import pandas as pd
import numpy as np
import re

# Read the raw tweet export into a DataFrame, then preview its first five rows
tweets_covid = pd.read_csv(filepath_or_buffer="covid19_tweets.csv")
tweets_covid.head(n=5)

### Data Preparation
To clean the data we need to remove the **links**, **punctuation**, **numbers**, **emojis**, and **stop words**. We will use NLTK's English stopword list to filter out unwanted words, and its WordNet database to normalize the words that remain.

In [None]:
import nltk
from nltk.corpus import stopwords

# Download only the NLTK resources this notebook relies on: the English
# stopword list and WordNet (plus the Open Multilingual WordNet data that some
# NLTK releases load alongside WordNet). Requesting 'all' fetches every corpus
# and model NLTK distributes, which is slow and wastes disk space.
# NOTE(review): extend this tuple if other cells need additional resources.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

def remove_stopwords(txt, stopword_set=None):
    """Lower-case the input text and drop every stopword from it.

    The text is lower-cased, split on whitespace, filtered, and re-joined
    with single spaces, so the result is also whitespace-normalised.

    Args:
        txt (str): the input text to filter
        stopword_set (collection of str, optional): the words to remove.
            Entries are compared against the lower-cased tokens, so they
            should be lower-case themselves. When omitted, the
            notebook-level ``stop_words`` set is used.

    Returns:
        str: the lower-cased text with all stopwords removed
    """
    if stopword_set is None:
        # Resolved at call time, so `stop_words` only needs to exist before
        # the first call, not before this definition.
        stopword_set = stop_words
    kept_words = (word for word in txt.lower().split() if word not in stopword_set)
    return ' '.join(kept_words)

# Create a hash-set containing all stopwords
# which automatically guarantees word uniqueness
stop_words = set(stopwords.words('english'))

# The patterns are raw strings so that regex escapes such as \w, \S and \d
# reach the `re` module intact instead of being treated as string escapes.
# A link is a scheme made of word characters, "://", then everything up to
# the next whitespace.
link_pattern = re.compile(r"\w+://\S+")
# The hyphen is escaped so it is a literal '-' rather than a range operator.
# '<' is listed explicitly because the previous ';-=' range also stripped it.
punctuation_pattern = re.compile(r"[.,!?:;<=\-'@#_]")
digit_pattern = re.compile(r"\d")

# Filter out links
tweets_covid['clean_text'] = tweets_covid['text'].apply(lambda s: ' '.join(link_pattern.sub(" ", s).split()))
# Filter out punctuation
tweets_covid["clean_text"] = tweets_covid["clean_text"].apply(lambda s: ' '.join(punctuation_pattern.sub(" ", s).split()))
# Filter out numerical values (digits are deleted, not replaced by a space)
tweets_covid["clean_text"] = tweets_covid["clean_text"].apply(lambda s: ' '.join(digit_pattern.sub("", s).split()))
# Filter out emojis by dropping every character that cannot be encoded as
# ASCII; note that this also removes any other non-ASCII character
tweets_covid["clean_text"] = tweets_covid["clean_text"].apply(lambda s: s.encode('ascii', 'ignore').decode('ascii'))
# Filter out stopwords (this step also lower-cases the text)
tweets_covid["clean_text"] = tweets_covid["clean_text"].apply(remove_stopwords)
# Print sample output
tweets_covid[['text', 'clean_text']]

Tokenize the clean text

In [None]:
# Split each cleaned string on whitespace to obtain a list of tokens.
# Values that are not strings (for example token lists produced by an earlier
# run of this cell) are passed through unchanged, so re-running the cell does
# not fail with AttributeError on `list.split`.
tweets_covid['clean_text'] = tweets_covid['clean_text'].apply(
    lambda s: s.split() if isinstance(s, str) else s
)
tweets_covid[['text', 'clean_text']]

### Text Normalization

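At this stage we lemmatize the tokens, i.e. convert each word to its base (dictionary) form; every token is treated as a verb here. Collapsing inflected variants such as "running" and "ran" into one form shrinks the vocabulary, which will help our models later on during training and inference.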

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatiser = WordNetLemmatizer()
# Normalise every token list by mapping each token to its lemma, with the
# lemmatiser told to treat each token as a verb (pos='v')
tweets_covid['clean_text'] = tweets_covid['clean_text'].apply(
    lambda words: list(map(lambda word: lemmatiser.lemmatize(word, pos='v'), words))
)
tweets_covid[['text', 'clean_text']]