Contains everything I've done so far, categorised for splitting into different files later

## Data Preprocessing

In [None]:
import nltk


def check_nltk_resources(resources: list[str]) -> None:
    """
    Download every nltk resource in `resources` that is not installed yet

    resources: resource paths such as 'corpora/stopwords'; the last path
        component is the package id handed to nltk.download()
    """
    for resource in resources:
        # Find .zip file instead since nltk have problem unzipping files
        try:
            # nltk.data.find is the documented lookup; it raises LookupError
            # when the resource is missing from every nltk data directory
            nltk.data.find(f'{resource}.zip')
        except LookupError:
            nltk.download(resource.rsplit('/', 1)[-1])

In [None]:
from typing import Iterable, Literal
import re
import string

from nltk.corpus import stopwords
from nltk import TweetTokenizer, WordNetLemmatizer, pos_tag
import pandas as pd


def preprocess(
        df: pd.DataFrame,
        txt_col="tweet",   # Specify column to clean
        stop_words=None,
        tokeniser=None,
        lemmatiser=None,
        filter_regex=r'',   # Used to filter matches
        remove_punct=True,
        remove_mentions=True,
        remove_hashtags=True,
        remove_urls=True,
        casing: Literal["lower", "upper", None] = 'lower'
):
    """
    Clean, lemmatise and remove stop words from one text column of `df`

    The column is overwritten in place and the same DataFrame is returned.

    df: DataFrame holding the text
    txt_col: name of the column to clean
    stop_words: iterable of words to drop; None uses nltk's English list.
        The iterable is copied, never modified
    tokeniser: object with a tokenize(text) method; None uses TweetTokenizer
    lemmatiser: object with a lemmatize(word[, pos]) method; None uses
        WordNetLemmatizer
    filter_regex: extra pattern whose matches are deleted from the text
    remove_punct: also treat every character of string.punctuation as a
        stop word
    remove_mentions / remove_hashtags / remove_urls: delete @mentions,
        #hashtags and http(s) URLs respectively
    casing: 'lower', 'upper', or None to leave the casing untouched

    Raises ValueError if `casing` is none of the accepted values.
    """
    if casing == 'lower':
        casing_func = str.lower
    elif casing == 'upper':
        casing_func = str.upper
    elif casing is None:
        casing_func = None   # Don't change casing
    else:
        raise ValueError(
            "Parameter 'casing' can only have value: 'lower', 'upper', or None"
        )

    # Defaults are created per call: building them in the signature would
    # need the nltk data at definition time and would share one list
    # between calls
    if stop_words is None:
        stop_words = stopwords.words('english')
    if tokeniser is None:
        tokeniser = TweetTokenizer()
    if lemmatiser is None:
        lemmatiser = WordNetLemmatizer()

    # Work on a private set so the caller's list (or a shared default) is
    # never extended, and membership tests in furnish() are O(1)
    stop_set = set(stop_words)
    if remove_punct:
        stop_set.update(string.punctuation)

    pattern = re.compile(build_regex(
        filter_regex, remove_mentions, remove_hashtags, remove_urls
    ))

    def clean(txt: str) -> str:
        # Apply regex filter - remove all matched texts
        txt = pattern.sub('', txt)
        if casing_func is not None:
            txt = casing_func(txt)
        return furnish(txt, tokeniser, lemmatiser, stop_set)

    df[txt_col] = df[txt_col].apply(clean)

    return df


def build_regex(
        regex: str, remove_mentions: bool,
        remove_hashtags: bool, remove_urls: bool
):
    """
    Combine the caller's pattern with the enabled built-in patterns

    regex: custom pattern, placed first in the alternation; may be empty
    remove_mentions: include the pattern matching @mentions
    remove_hashtags: include the pattern matching #hashtags
    remove_urls: include the pattern matching http(s) URLs

    Returns the selected patterns joined with '|', or an empty string when
    nothing is selected.
    """
    regex_mentions = r"(@[A-Za-z0-9_]+)"
    regex_hashtags = r"(#[A-Za-z0-9_]+)"
    regex_urls = (
        r"(https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\."
        r"[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*))"
    )

    optional_parts = (
        (remove_mentions, regex_mentions),
        (remove_hashtags, regex_hashtags),
        (remove_urls, regex_urls),
    )

    # Collect only the fragments actually in use; joining afterwards means
    # a skipped fragment can never leave a dangling or doubled '|' behind
    parts = [regex] if regex else []
    parts.extend(pattern for enabled, pattern in optional_parts if enabled)

    return '|'.join(parts)

    
# Remove stopwords and turn word into lemmatised form
def furnish(text: str, tokeniser, lemmatiser, stop_words) -> str:
    """
    Tokenise `text`, lemmatise every token and drop the stop words

    text: raw text to process
    tokeniser: object with a tokenize(text) method returning tokens
    lemmatiser: object with a lemmatize(word[, pos]) method
    stop_words: container of words to discard; checked AFTER lemmatising

    Returns the remaining lemmas joined by single spaces.
    """
    # First letter of the tagger's tag -> WordNet part of speech.
    # Penn Treebank adjectives are JJ/JJR/JJS, so 'j' must map to 'a';
    # 'a' itself is kept so tags that already start with 'a' behave as before
    wordnet_pos = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}

    final_text = []
    for word, tag in pos_tag(tokeniser.tokenize(text)):
        # Tag word as verb, nouns, etc, improves lemmatiser accuracy
        pos = wordnet_pos.get(tag[:1].lower())
        if pos is None:
            word = lemmatiser.lemmatize(word)
        else:
            word = lemmatiser.lemmatize(word, pos)

        if word not in stop_words:
            final_text.append(word)

    return ' '.join(final_text)

## Data Loading

In [None]:
import pandas as pd

def load_twitter_csv(
        file_path: str,
        usecols=['conversation_id', 'tweet', 'language'],
        index_col=0,
        eng_only=True,
        do_preprocess=True
) -> pd.DataFrame:
    """
    Read a csv of tweets into a DataFrame, optionally filtered and cleaned

    file_path: location of the csv file; it MUST have a 'tweet' column
    usecols: columns to read from the file
    index_col: position (within the columns read) of the index column
    eng_only: keep only rows whose 'language' is "en", then drop that column
    do_preprocess: run preprocess() with its default settings on the result

    Rows without a tweet are always removed. Call preprocess() directly
    for more customisation.
    """
    tweets = pd.read_csv(file_path, usecols=usecols, index_col=index_col)

    if eng_only:
        # Once the non-English rows are gone the column carries no information
        tweets.query('language == "en"', inplace=True)
        tweets.drop(columns=['language'], inplace=True)

    tweets.dropna(subset=['tweet'], inplace=True)

    if not do_preprocess:
        return tweets
    return preprocess(tweets)

In [None]:
# nltk data used by this notebook: stop word list, WordNet (lemmatiser)
# and the perceptron POS tagger
resources = ['corpora/stopwords', 'corpora/wordnet',
             'taggers/averaged_perceptron_tagger']

# Must run before anything below touches the nltk corpora
check_nltk_resources(resources)

default_stopwords = stopwords.words('english')

# Extend the English list with punctuation and extra words
# (presumably dataset-specific noise words - confirm the selection)
default_stopwords.extend(
    list(string.punctuation) + [
        'would', 'could', 'get', 'want', 'he', 'twitter', 'elon', 'musk', 
        'well', 'need', 'come', 'really', 'take', 'say', 'go', 'use', 'make',
        'know', 'think'

    ]
)

# Load without preprocessing so the custom stop word list can be supplied
df = load_twitter_csv("../Dataset/twitter.csv", do_preprocess=False)
df = preprocess(df, stop_words=default_stopwords)

---

## Text Vectorisation
- Converts textual data into numeric form as vectors, producing a **term matrix**
- **term matrix** is in this form:
 
|       | term_1 | term_2 | term_3 | ... |
| ----- | ------ | ------ | ------ | --- |
| doc_1 |        |        |        |     |
| doc_2 |        |        |        |     |
| doc_3 |        |        |        |     |
| ...   |        |        |        |     |


- **NOTE**: An `sklearn` vectoriser produces a term matrix and **stores the feature names**
- **NOTE**: `gensim` models perform transformations lazily: a wrapper is created around the corpus and the transformation occurs on-the-fly
    - e.g. `tfidf_model[bow_corpus]` is a TF-IDF wrapper around the BoW vectors

In [None]:
# Tiny hand-made corpus; not referenced by the cells below
corpus_1 = ["dogs and cats are not allowed", 
          "cats and cats are friendly and are allowed"]
# The preprocessed tweets are the corpus every vectoriser below is fitted on
corpus = df['tweet']
# Document-frequency bounds passed to the sklearn vectorisers below
min_df = 10
max_df = 0.95

### Bag-Of-Word (BoW)
- Given a vocabulary, count up occurrence of each term in all documents and produce vector with those frequencies

#### `sklearn` - `CountVectorizer`

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vocabulary and build the document-term count matrix in one step,
# using the min_df / max_df values from the settings cell
count_vect = CountVectorizer(min_df=min_df, max_df=max_df)
bow_matrix1 = count_vect.fit_transform(corpus)

print(count_vect.get_feature_names_out())
# toarray() converts the whole matrix to a dense array before printing
print(bow_matrix1.toarray())

##### Bag-of-$n$-gram Variant

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2, 2): features are pairs of consecutive words only.
# NOTE(review): this rebinds `count_vect` and `bow_matrix1`, and no
# min_df / max_df is applied here; cells further down that read those two
# names get the bigram versions when the notebook runs top to bottom -
# confirm that is intended
count_vect = CountVectorizer(ngram_range=(2, 2))
bow_matrix1 = count_vect.fit_transform(corpus)

print(count_vect.get_feature_names_out())
print(bow_matrix1.toarray())

#### `gensim` - `Dictionary`

In [None]:
from gensim.corpora import Dictionary

# Associate each word with a unique integer ID
# (documents are split on whitespace because furnish() joins the kept
# tokens with single spaces)
vocab = Dictionary([t.split() for t in corpus])
#vocab.filter_extremes(no_below=5, no_above=0.5)

# Use vocab as feature labels, create bag-of-word vectors
# One doc2bow() result per document, in corpus order
bow_matrix2 = [vocab.doc2bow(t.split()) for t in corpus]


### TF-IDF - Term frequency-Inverse Document Frequency
- Vectoriser that takes in words and produces a matrix containing a weighting for each word
- The weight reflects how *important* a word is to a *document* in a collection
- For each word, the weight is the product of these two below:
    - **Term frequency**: How many times the term occurs, calculated for *each* document
    $$\text{tf}(t, d) = \frac{\text{raw count of term }t}{\text{sum of frequency for all terms in document }d}$$
    - **Inverse document frequency**: number of documents divided by the number of documents the word occurred in, scaled logarithmically
    $$\text{idf}(t, D) = \log \frac{\text{total number of documents in collection }D}{\text{number of documents that term }t \text{ occurred in}}$$

#### `sklearn` - `TfidfVectorizer`

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same document-frequency bounds as the CountVectorizer cell, but the
# matrix holds TF-IDF weights instead of raw counts
tfidf_vect = TfidfVectorizer(min_df=min_df, max_df=max_df)
tfidf_matrix1 = tfidf_vect.fit_transform(corpus)

print(tfidf_vect.get_feature_names_out())
# toarray() converts the whole matrix to a dense array before printing
print(tfidf_matrix1.toarray())

#### `gensim` - `TfidfModel`

In [None]:
from gensim.models import TfidfModel

# Train the model on the corpus, must be BoW
tfidf_model = TfidfModel(bow_matrix2)
# Indexing the model with a corpus gives the TF-IDF view of that corpus
tfidf_matrix2 = tfidf_model[bow_matrix2]

### Latent Semantic Analysis
- **TODO**: Explanation

In [None]:
# sklearn has no class named after LSA; TruncatedSVD is the decomposition
# used for it (the original line was a truncated, failing import)
from sklearn.decomposition import TruncatedSVD

---

## Topic Modelling

In [None]:
# Settings
# Document-term matrix fed to the sklearn LDA and NMF cells below
term_matrix = bow_matrix1
# Number of topics / components shared by every model in this section
n_topics = 10
# Iteration budget, passed as max_iter / iterations / n_iter below
iterations = 100

### Latent Dirichlet Allocation (LDA)

#### `sklearn` - `LatentDirichletAllocation`

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit LDA on the count matrix chosen in the settings cell
lda_sklearn = LatentDirichletAllocation(n_components=n_topics, max_iter=iterations)
# fit_transform() returns one row per document of term_matrix
lda_matrix = lda_sklearn.fit_transform(term_matrix)

#### `gensim.models` - `LdaModel`

In [None]:
from gensim.models import LdaModel

# Pass the Dictionary itself as id2word. `vocab.id2token` is only filled in
# lazily (which is why the old `temp = vocab[0]` access was needed); the
# Dictionary object needs no such warm-up and matches the LsiModel cell
lda_gensim = LdaModel(
    corpus=bow_matrix2,
    id2word=vocab,
    chunksize=3000,
    iterations=iterations,   # Loop over EACH document
    num_topics=n_topics,
    passes=20,               # Loop over WHOLE corpus
    eval_every=1
)

# List of (topic, coherence score) pairs, used for the average further down
lda_gensim_topics = lda_gensim.top_topics(bow_matrix2)

### Non-negative Matrix Factorisation (NMF)

#### `sklearn` - `NMF`

In [1]:
from sklearn.decomposition import NMF

# NMF accepts sparse input, so the term matrix is passed as-is: no dense
# copy of the whole matrix, and no dependency on `np`, which is only
# imported in a later cell
nmf_sklearn = NMF(n_components=n_topics)
nmf_matrix = nmf_sklearn.fit_transform(term_matrix)

NameError: name 'n_topics' is not defined

### Latent Semantic Analysis (LSA)

#### `sklearn` - `TruncatedSVD`

In [None]:
from sklearn.decomposition import TruncatedSVD

# LSA here = truncated SVD applied to the sklearn TF-IDF matrix
lsa_sklearn = TruncatedSVD(n_components=n_topics, n_iter=iterations)
lsa_matrix = lsa_sklearn.fit_transform(tfidf_matrix1)

#### `gensim` - `LsiModel`

In [None]:
from gensim.models import LsiModel

# Train on the gensim TF-IDF corpus, with vocab supplying the id -> word map
lsi_gensim = LsiModel(tfidf_matrix2, id2word=vocab, num_topics=n_topics)
# Indexing the model with a corpus projects that corpus into topic space
lsi_matrix = lsi_gensim[tfidf_matrix2]

# Last expression of the cell, so the notebook displays the returned topics
lsi_gensim.print_topics(n_topics)

### K-Means Clustering

### Optimal Cluster
- `KMeans` provides the `inertia_` attribute - sum of squared distances of samples to their cluster's centre, i.e. the **SSE**
    - SSE is a measure for how fitted the model is to the data, low SSE means the model is very fitted
    - Cluster size where the SSE starts to level off is optimal (don't want too low as that may mean overfitting)

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


def find_optimal_k_cluster_size(data, max_k: int = 20, rand_state=20):
    """
    Fit KMeans for every even cluster count from 2 up to max_k and plot the
    resulting SSE (the model's inertia_) against the cluster count

    data: samples to cluster, passed straight to KMeans.fit()
    max_k: largest cluster count to try
    rand_state: random_state given to every KMeans instance
    """
    candidate_ks = range(2, max_k + 1, 2)

    inertias = []
    for n_clusters in candidate_ks:
        model = KMeans(
            n_clusters=n_clusters, random_state=rand_state, n_init='auto'
        )
        model.fit(data)
        inertias.append(model.inertia_)
        print(f"Fitted {n_clusters} clusters!")

    # One tick per cluster count that was actually fitted
    _, axes = plt.subplots(1, 1)
    axes.plot(candidate_ks, inertias, marker='o')
    axes.set_title("Change in SSE as Number of Clusters Increase")
    axes.set_xlabel('Number of Cluster Centroids')
    axes.set_ylabel('SSE')
    axes.set_xticks(candidate_ks)
    axes.set_xticklabels(candidate_ks)

In [None]:
from sklearn.manifold import TSNE
import numpy as np
from sklearn.decomposition import PCA

import matplotlib.cm as cm

# Enable interactive graph
%matplotlib widget

from typing import Any


def plot_clusters(
    data, 
    cluster_labels, 
    n_samples=2000, 
    decomposer: Literal['pca', 'tsne'] | Any = 'pca',
    dimension: Literal[2, 3] = 2
):
    """
    Scatter-plot a random sample of `data`, coloured by cluster

    data: matrix of n data points (numpy array, or a sparse matrix
        providing .todense())
    cluster_labels: n-lengthed array with i-th value identify cluster for
        i-th data point; labels are expected to be 0-based integers
    n_samples: number of data points to plot from data; capped at the
        number of rows available
    decomposer: used to reduce data dimension to 2 or 3 for plotting;
        'pca', 'tsne', or a custom object with a fit_transform() method
    dimension: specify whether to display 2D or 3D graph

    Raises ValueError for an unsupported `dimension` or an unknown
    decomposer name.
    """
    if dimension not in [2, 3]:
        raise ValueError(
            "Parameter 'dimension' can only have value 2 or 3"
        )

    # Resolve the decomposer before any heavy work so bad input fails fast
    if isinstance(decomposer, str):
        if decomposer == 'pca':
            reducer = PCA(n_components=dimension)
        elif decomposer == 'tsne':
            reducer = TSNE(n_components=dimension)
        else:
            raise ValueError(
                "Parameter 'decomposer' can only be 'pca', 'tsne' or an "
                "object with a fit_transform() method"
            )
        decomposer_name = decomposer.upper()
    else:
        reducer = decomposer
        # Custom objects have no .upper(); label the plot with the class name
        decomposer_name = type(decomposer).__name__

    # Accept plain lists as well as arrays for the labels
    labels = np.asarray(cluster_labels)
    # Labels are 0-based, so the number of clusters is max + 1. Dividing by
    # max alone maps the last cluster to 1.0, which the cyclic hsv map
    # draws in the same red as cluster 0 (and divides by zero when there is
    # only one cluster)
    n_clusters = int(labels.max()) + 1

    # Sampling without replacement cannot draw more rows than exist
    n_samples = min(n_samples, data.shape[0])
    samples = np.random.choice(data.shape[0], size=n_samples, replace=False)

    # Convert data to the appropriate type and sample size
    if isinstance(data, np.ndarray):
        data = data[samples, :]
    else:
        data = np.asarray(data[samples, :].todense())

    transformed = reducer.fit_transform(data)

    # One RGBA row per sampled point
    cluster_colours = cm.hsv(labels[samples] / n_clusters)

    if dimension == 2:
        ax = plt.figure().add_subplot(projection=None)
        ax.scatter(transformed[:, 0], transformed[:, 1], c=cluster_colours)
    else: 
        ax = plt.figure().add_subplot(projection='3d')
        ax.scatter(
            transformed[:, 0], transformed[:, 1], transformed[:, 2], 
            c=cluster_colours
        )

    ax.set_title(f"{decomposer_name} Clusters {dimension}D")
    plt.show()


In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Reduce the TF-IDF matrix to 7 components before clustering; the sparse
# matrix is converted to a dense array before being handed to PCA
pca_matrix = PCA(n_components=7).fit_transform(
    np.asarray(tfidf_matrix1.todense())
)

In [None]:
from sklearn.cluster import SpectralClustering
# Elbow plot (KMeans SSE per cluster count) on the PCA-reduced data
find_optimal_k_cluster_size(pca_matrix, rand_state=357)

- For a dataset of $n$ samples, `clusters` is an $n$-length array whose $i$-th entry identifies the cluster that the $i$-th sample belongs to
    - e.g. `clusters = [3, 2, 2, 0]`, the 0-th data point belongs to cluster 3

In [None]:
# NOTE(review): the variable is called `kmeans` and the elbow plot above
# uses KMeans, but the estimator here is SpectralClustering - confirm
# which one is intended
kmeans = SpectralClustering(n_clusters = 8)
# One cluster label per row of pca_matrix
clusters = kmeans.fit_predict(pca_matrix)

In [None]:
# 3D view of a 4000-point sample, reduced with the default 'pca' decomposer
plot_clusters(pca_matrix, clusters, dimension=3, n_samples=4000)

### TODO: Find a way to plot the top $n$ words PER cluster - more frequent words
- NEED TO MAKE CLUSTERS USEFUL
- how to find frequency of word in each clusters?
- can we add OR multiply all dimensions for each feature/term to get some aggregate score, and then sort them by their cluster number? e.g. max(cluster['cluster_4'])

---

## Result Validation

### Topic Coherence

#### `gensim` - `Coherence Model`

In [None]:
from gensim.models import CoherenceModel

# u_mass is computed from the bag-of-words corpus
lda_u_mass = CoherenceModel(
    model=lda_gensim, corpus=bow_matrix2, dictionary=vocab, coherence='u_mass'
)

# c_v needs the tokenised documents (a list of token lists), not the raw
# strings; split on whitespace exactly as was done when building `vocab`
lda_c_v = CoherenceModel(
    model=lda_gensim,
    texts=[t.split() for t in corpus],
    dictionary=vocab,
    coherence='c_v'
)

print(lda_u_mass.get_coherence())
print(lda_c_v.get_coherence())

In [None]:
# Each entry of lda_gensim_topics is a pair whose second item is the score
# being averaged (presumably the per-topic coherence - confirm)
avg_topic_coherence = sum(score for _, score in lda_gensim_topics) / n_topics
print(f'Average topic coherence: {avg_topic_coherence}')

---

## Visualisation

In [None]:
import pyLDAvis

# Render the visualisations inline in the notebook
pyLDAvis.enable_notebook()

# Model shown by the gensim visualisation: it has to be the gensim LDA
# model, not the sklearn TF-IDF vectoriser that was bound here before
gensim_model = lda_gensim
# Count matrix that the sklearn LDA model was fitted on
term_matrix = bow_matrix1

### Visualising `sklearn`

In [None]:
import pyLDAvis.sklearn

# Expose get_feature_names_out under the old name get_feature_names
# (presumably because pyLDAvis.sklearn still calls the old method, which
# newer scikit-learn releases no longer provide - confirm)
count_vect.get_feature_names = count_vect.get_feature_names_out
pyLDAvis.sklearn.prepare(lda_sklearn, term_matrix, count_vect)

### Visualising `gensim`

In [None]:
import pyLDAvis.gensim_models

# prepare() needs the gensim topic model, the bag-of-words corpus it was
# trained on and the matching Dictionary; the names are spelled out here
# because `term_matrix` holds the sklearn sparse matrix, not a gensim corpus
pyLDAvis.gensim_models.prepare(lda_gensim, bow_matrix2, vocab)

### Plotting Term Matrices
- **TODO** - reuse cluster plotting function

---