Contains everything I've done so far, categorised for splitting into different files later

## Data Preprocessing

In [None]:
import nltk

# nltk data resources, written as '<category>/<package>' paths.
# check_nltk_resources() looks each one up as '<path>.zip' and, when it is
# missing, downloads the package named after the final '/'.
resources = ['corpora/stopwords', 'corpora/wordnet',
             'taggers/averaged_perceptron_tagger']

def check_nltk_resources(resources: list[str]) -> None:
    """
    Make sure every listed nltk resource is available, downloading any
    that cannot be found.

    Each entry is a '<category>/<package>' path (e.g. 'corpora/stopwords');
    only the part after the last '/' is passed to the downloader.
    """
    for resource_path in resources:
        # Look for the zipped archive rather than the extracted folder,
        # since nltk has trouble unzipping files
        zipped_path = f'{resource_path}.zip'
        try:
            nltk.find(zipped_path)
        except LookupError:
            package_name = resource_path.split('/')[-1]
            nltk.download(package_name)

In [28]:
from typing import Iterable, Literal
import re
import string

from nltk.corpus import stopwords
from nltk import TweetTokenizer, WordNetLemmatizer, pos_tag
import pandas as pd


def preprocess(
        df: pd.DataFrame,
        txt_col="tweet",   # Specify column to clean
        stop_words=stopwords.words('english'),
        tokeniser=TweetTokenizer(),
        lemmatiser=WordNetLemmatizer(),
        filter_regex=r'',   # Used to filter matches
        remove_punct=True,
        remove_mentions=True,
        remove_hashtags=True,
        remove_urls=True,
        casing: Literal["lower", "upper", None] = 'lower'
) -> pd.DataFrame:
    """
    Clean the text column of a DataFrame.

    For every value in ``df[txt_col]`` the function removes everything
    matched by the combined filter regex, applies the requested casing,
    then tokenises, lemmatises and drops stop words via ``furnish``.

    The column is overwritten in place on ``df``; the same DataFrame is
    also returned for convenience.

    Parameters:
        df: DataFrame holding the text to clean.
        txt_col: name of the column to clean.
        stop_words: iterable of words to drop. It is copied, never modified.
        tokeniser: object with a ``tokenize(text)`` method.
        lemmatiser: object with a ``lemmatize(word[, pos])`` method.
        filter_regex: extra pattern whose matches are removed from the text.
        remove_punct: also treat every character of ``string.punctuation``
            as a stop word.
        remove_mentions / remove_hashtags / remove_urls: add the matching
            built-in pattern to the filter regex.
        casing: 'lower', 'upper', or None to leave casing untouched.

    Raises:
        ValueError: if ``casing`` is not 'lower', 'upper' or None.
    """
    # Validate first so that a bad argument has no side effects
    if casing == 'lower':
        casing_func = str.lower
    elif casing == 'upper':
        casing_func = str.upper
    elif casing is None:
        casing_func = lambda x: x   # Don't change casing
    else:
        raise ValueError(
            "Parameter 'casing' can only have value: 'lower', 'upper', or None"
        )

    # Compile once instead of re-resolving the pattern for every row
    pattern = re.compile(build_regex(
        filter_regex, remove_mentions, remove_hashtags, remove_urls
    ))

    # Work on a private copy: the default list is created once at definition
    # time and shared between calls, so extending it in place would make it
    # grow on every call (and would alter a list passed in by the caller).
    # A set also makes the membership test in furnish() O(1).
    stop_words = set(stop_words)
    if remove_punct:
        stop_words.update(string.punctuation)

    def clean(txt):
        # Apply regex filter - remove all matched texts - then fix casing
        filtered = casing_func(pattern.sub('', txt))
        return furnish(filtered, tokeniser, lemmatiser, stop_words)

    df[txt_col] = df[txt_col].apply(clean)

    return df


def build_regex(
        regex: str, remove_mentions: bool,
        remove_hashtags: bool, remove_urls: bool
) -> str:
    """
    Combine a user-supplied regex with the enabled built-in patterns.

    The result is a single alternation ('a|b|c') made of, in order: the
    given ``regex`` (if not empty), then the mention, hashtag and URL
    patterns for whichever of the flags are true. Disabled or empty parts
    are skipped entirely, so the result never contains a leading, trailing
    or doubled '|'. An empty string is returned when nothing is enabled.
    """
    regex_mentions = r"(@[A-Za-z0-9_]+)"
    regex_hashtags = r"(#[A-Za-z0-9_]+)"
    regex_urls = (
        r"(https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\."
        r"[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*))"
    )

    # Pair each built-in pattern with its switch to reduce redundant code
    optional_patterns = (
        (remove_mentions, regex_mentions),
        (remove_hashtags, regex_hashtags),
        (remove_urls, regex_urls),
    )

    # Only non-empty, enabled parts are joined, so a '|' is only ever placed
    # between two real alternatives
    parts = [regex] if regex else []
    parts.extend(pattern for enabled, pattern in optional_patterns if enabled)

    return '|'.join(parts)

    
# Remove stopwords and turn word into lemmatised form
def furnish(text: str, tokeniser, lemmatiser, stop_words) -> str:
    """
    Tokenise ``text``, lemmatise every token and drop the stop words.

    Parameters:
        text: the string to process.
        tokeniser: object with a ``tokenize(text)`` method.
        lemmatiser: object with a ``lemmatize(word[, pos])`` method.
        stop_words: container of words to discard (anything supporting
            ``in``). A token is checked against it after lemmatisation.

    Returns:
        The surviving lemmatised tokens joined by single spaces.
    """
    # First letter of the tagger's tag -> part-of-speech code understood by
    # the lemmatiser. The default nltk tagger emits Penn Treebank tags, whose
    # adjective tags start with 'J' (JJ, JJR, JJS), so 'j' must map to 'a';
    # 'a' itself is kept for tag sets that already use it.
    pos_by_initial = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}

    final_text = []
    for word, tag in pos_tag(tokeniser.tokenize(text)):
        # Tag word as verb, nouns, etc, improves lemmatiser accuracy
        pos = pos_by_initial.get(tag[:1].lower())
        if pos is not None:
            word = lemmatiser.lemmatize(word, pos)
        else:
            word = lemmatiser.lemmatize(word)

        if word not in stop_words:
            final_text.append(word)

    return ' '.join(final_text)

## Data Loading

In [13]:
import pandas as pd

def load_twitter_csv(
        file_path: str,
        usecols=['conversation_id', 'tweet', 'language'],
        index_col=0,
        eng_only=True,
        do_preprocess=True
) -> pd.DataFrame:
    """
    Read a csv file of tweets into a DataFrame.

    The csv file MUST have a column named 'tweet'; rows where it is missing
    are dropped. With ``eng_only`` the rows whose 'language' is not "en" are
    removed and the 'language' column is then dropped. With
    ``do_preprocess`` the result is passed through preprocess() using its
    default settings.

    Call preprocess() directly for more customisation.
    """
    tweets = pd.read_csv(file_path, usecols=usecols, index_col=index_col)

    if eng_only:
        # Keep the English rows only; the language column is then redundant
        is_english = tweets['language'] == 'en'
        tweets = tweets[is_english].drop(columns=['language'])

    tweets = tweets.dropna(subset=['tweet'])

    return preprocess(tweets) if do_preprocess else tweets

---

## Text Vectorisation
- Converts textual data into numeric form as vectors, producing a **term matrix**

### Bag-of-Words (BoW)
- Given a vocabulary, count the occurrences of each term in each document and represent every document as a vector of those frequencies

#### `sklearn` - `CountVectorizer`

#### `gensim` - `Dictionary`

### TF-IDF - Term frequency-Inverse Document Frequency
- A vectoriser that takes in words and produces a matrix containing a weight for each word
- The weight reflects how *important* a word is to a *document* in a collection
- For each word, the weight is the product of the two quantities below:
    - **Term frequency**: How often the term occurs, calculated for *each* document
    $$\text{tf}(t, d) = \frac{\text{raw count of term }t\text{ in document }d}{\text{total count of all terms in document }d}$$
    - **Inverse document frequency**: number of documents divided by the number of documents the word occurred in, scaled logarithmically
    $$\text{idf}(t, D) = \log \frac{\text{total number of documents in collection }D}{\text{number of documents that term }t \text{ occurred in}}$$

#### `sklearn` - `TfidfVectorizer`

#### `gensim` - `TfidfModel`

### Latent Semantic Indexing (LSI)

---

## Topic Modelling

### Latent Dirichlet Allocation (LDA)

### K-Means Clustering

### Non-negative Matrix Factorisation (NMF)

---

## Visualisation