# GCRL2000 - NLP Functions
- This notebook contains various functions for processing tweets

In [1]:
import re
import string

import pandas as pd
import nltk
from nltk import pos_tag
from nltk import TweetTokenizer, WordNetLemmatizer
from nltk.corpus import stopwords

# `stopwords.words()` is evaluated eagerly in this cell, before
# `init_nltk_downloads()` has been defined or called, so fetch the corpus on
# demand when it is missing instead of failing with LookupError.
try:
    default_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    default_stopwords = stopwords.words('english')

# Also treat punctuation characters and a few extra words as stopwords
default_stopwords.extend(
    list(string.punctuation) + ['would', 'could', 'get',
                                'want', 'he', 'twitter']
)

# Shared default instances used as default arguments by `furnish()`
default_tokeniser = TweetTokenizer()
default_lemmatiser = WordNetLemmatizer()

- `stopwords` contains words to remove from the tweets, e.g., "the"
- `wordnet` is used by the `WordNetLemmatizer`
- `averaged_perceptron_tagger` is used by `pos_tag()`, which tags each word as a noun, verb, etc.

In [2]:
def init_nltk_downloads() -> None:
    """
    Download the nltk resources used by this notebook if they are missing.

    Looks up the stopwords corpus, the wordnet corpus and the averaged
    perceptron tagger, and downloads each one that cannot be found.
    """
    resources = ('corpora/stopwords', 'corpora/wordnet',
                 'taggers/averaged_perceptron_tagger')
    for resource in resources:
        # Look for the .zip file since nltk has problems unzipping files
        try:
            # `nltk.data.find` is the documented lookup function; it raises
            # LookupError when the resource is not on the nltk data path.
            nltk.data.find(f'{resource}.zip')
        except LookupError:
            # The downloader takes the bare package id, e.g. "stopwords"
            nltk.download(resource.split('/')[-1])

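- Regex explanation:
    - `r"(@[A-Za-z0-9_]+)"` matches a mention: "@" followed by one or more letters, digits, or underscores
    - `r"^http.+?|(\w+:\/\S+)"` is meant to match URLs: text at the very start of the tweet beginning with "http", or any `scheme:/...` run of non-whitespace characters such as `https://t.co/abc`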

In [3]:
def load_twitter_csv(data_file: str) -> pd.DataFrame:
    """
    Load a csv file produced by the Twitter scraper, return cleaned DataFrame

    Only the columns `conversation_id`, `created_at`, `user_id`, `tweet` and
    `language` are read. Rows whose language is not "en" or whose tweet text
    is missing are discarded, and the `language` column is dropped. Mentions
    and URLs are then removed from the tweet text, which is lower-cased.
    """
    df = pd.read_csv(
        data_file,
        index_col=0,
        usecols=['conversation_id', 'created_at', 'user_id', 'tweet',
                 'language']
    )
    # Only consider English tweets, ignore neutral language
    df.query('language == "en"', inplace=True)
    df.drop(columns=['language'], inplace=True)
    df.dropna(subset=['tweet'], inplace=True)

    # Clean tweet texts.
    # The previous pattern also had a `^http.+?` alternative; being lazy it
    # consumed only "http" plus one character at the start of a tweet, which
    # left "://..." behind for leading URLs. The generic scheme pattern below
    # already covers http/https links wherever they appear.
    noise = re.compile(
        r"@[A-Za-z0-9_]+|"      # Match mentions
        r"\w+:/\S+"             # Match urls (scheme:/... up to whitespace)
    )
    df['tweet'] = df['tweet'].apply(lambda txt: noise.sub('', txt).lower())
    return df

In [4]:
def furnish(
        text: str,
        tokeniser=default_tokeniser,
        lemmatiser=default_lemmatiser,
        stop_words: list = default_stopwords
) -> str:
    """
    Remove stopwords and turn each word into its lemmatised form

    `text` is tokenised with `tokeniser`, every token is part-of-speech
    tagged, lemmatised with `lemmatiser`, and kept only if the resulting
    lemma is not in `stop_words`. The surviving lemmas are returned joined
    by single spaces.
    """
    # `pos_tag` emits Penn Treebank tags (JJ*, RB*, NN*, VB*), whereas the
    # WordNet lemmatiser expects 'a', 'r', 'n' or 'v'. Adjective tags start
    # with "J", so they have to be mapped to 'a' explicitly.
    wordnet_pos = {'j': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}
    # Set membership is constant time; the list is scanned only once here
    excluded = set(stop_words)

    kept = []
    for word, tag in pos_tag(tokeniser.tokenize(text)):
        # Tag word as verb, noun, etc.; improves lemmatiser accuracy
        pos = wordnet_pos.get(tag[:1].lower())
        if pos:
            lemma = lemmatiser.lemmatize(word, pos)
        else:
            # No WordNet equivalent: fall back to the lemmatiser's default
            lemma = lemmatiser.lemmatize(word)
        if lemma not in excluded:
            kept.append(lemma)
    return ' '.join(kept)


In [5]:
def gen_n_grams(
        df: pd.DataFrame, new_col_name="tweet_n_gram", min_len=1, max_len=3
):
    """
    Add a column holding every n-gram of each tweet and return the DataFrame

    The `tweet` column of `df` is split into words and all n-grams with
    lengths from `min_len` to `max_len` are stored, as a list of tuples, in
    the column `new_col_name`. `df` itself is modified and returned.
    """
    df[new_col_name] = df['tweet'].apply(
        # split() without an argument collapses runs of whitespace, so the
        # gaps left where mentions/urls were stripped (or an empty tweet) do
        # not produce empty-string tokens in the n-grams.
        lambda text: list(
            nltk.everygrams(text.split(), min_len=min_len, max_len=max_len)
        )
    )
    return df


In [7]:
def print_topic_words(decomposer, vectoriser, n_words):
    """
    Print the highest-weighted words of every topic

    For each row of `decomposer.components_`, the `n_words` entries with the
    largest weights are looked up in `vectoriser.get_feature_names_out()` and
    printed, ordered from lowest to highest weight. Nothing is returned.
    """
    # Fetch the vocabulary once rather than once per printed word
    feature_names = vectoriser.get_feature_names_out()
    for topic_no, topic in enumerate(decomposer.components_):
        # argsort is ascending, so the tail holds the largest weights.
        # A plain [-0:] slice would select everything, hence the guard.
        ranked = topic.argsort()
        top = ranked[-n_words:] if n_words > 0 else ranked[:0]
        print(f'Top {n_words} words for topic #{topic_no}:')
        print([feature_names[idx] for idx in top])
        print('\n')
