# [Gensim NLP trials](#gensim-nlp-trials)

In [None]:
# !python -m spacy download en_core_web_sm

In [None]:
# Notebook extensions: lab_black formats cells with black; autoreload in mode 2
# re-imports modules before each cell runs, so edits to the src/ helpers are picked up.
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
import re
from itertools import combinations
from pathlib import Path
from time import time

import altair as alt
import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lemmatizer import Lemmatizer

In [None]:
%aimport src.pipe_helpers
from src.pipe_helpers import TextCleaner

%aimport src.gensim_helpers
from src.gensim_helpers import (
    compute_coherence_values,
    make_bigrams,
    remove_stopwords,
    sent_to_words,
    format_topics_sentences,
    get_bigrams_trigrams,
    plot_coherence_scores,
    compute_coherence_values,
)

%aimport src.visualization_helpers
from src.visualization_helpers import (
    altair_datetime_heatmap,
    plot_horiz_bar,
    plot_horiz_bar_gensim,
)

In [None]:
# Font sizes (in points) used for the matplotlib defaults below
SMALL_SIZE = 26
MEDIUM_SIZE = 28
BIGGER_SIZE = 30

# Apply all matplotlib defaults in a single rcParams update
plt.rcParams.update(
    {
        "font.size": SMALL_SIZE,  # default text size
        "axes.titlesize": SMALL_SIZE,  # axes title
        "axes.labelsize": MEDIUM_SIZE,  # x and y labels
        "xtick.labelsize": SMALL_SIZE,  # x tick labels
        "ytick.labelsize": SMALL_SIZE,  # y tick labels
        "legend.fontsize": SMALL_SIZE,  # legend text
        "figure.titlesize": BIGGER_SIZE,  # figure title
        "axes.facecolor": "white",
    }
)
# NOTE(review): the seaborn calls run after the matplotlib settings above and
# presumably override some of them (style/context set their own rc values) - confirm
sns.set_style("darkgrid", {"legend.frameon": False})
sns.set_context("talk", font_scale=0.95, rc={"lines.linewidth": 2.5})

In [None]:
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
%matplotlib inline

<a id="toc"></a>

## [Table of Contents](#table-of-contents)
0. [About](#about)
1. [User Inputs](#user-inputs)
2. [Load joined data](#load-joined-data)
3. [Topic modeling using Gensim NMF with TFIDF vectorization](#topic-modeling-using-gensim-nmf-with-tfidf-vectorization)
4. [Topic modeling using Gensim NMF without TFIDF vectorization](#topic-modeling-using-gensim-nmf-without-tfidf-vectorization)
   - 4.1. [Pre-processing for Gensim NMF](#pre-processing-for-gensim-nmf)
   - 4.2. [Gensim NMF](#gensim-nmf)
   - 4.3. [Exploring Gensim NMF topics combined with source data](#exploring-gensim-nmf-topics-combined-with-source-data)

<a id="about"></a>

## 0. [About](#about)

In this notebook, we will experiment with NLP models on the joined news listings data in `data/processed/*_processed.csv`

<a id="user-inputs"></a>

## 1. [User Inputs](#user-inputs)

We'll define below the variables that are to be used throughout the code.

In [None]:
# Dataset
# Used both in the processed-file name and as the key into the topic-name mapping below
publication_name = "guardian"

# Data locations
# NOTE: despite the "dir" in its name, this is the path of the processed CSV file itself
data_dir_path = str(
    Path().cwd() / "data" / "processed" / f"{publication_name}_processed.csv"
)
# NOTE(review): not referenced anywhere else in the visible notebook - confirm it is still needed
cloud_run = True

# Custom stop words to include
# Added on top of the combined sklearn/spaCy/NLTK stop-word lists
manual_stop_words = ["nt", "ll", "ve"]

# Topic naming
# Maps publication name -> {topic id: human-readable label} for the Gensim NMF topics
# NOTE(review): ids 0 and 13 carry the same label ("Space Funding Bodies") - confirm intentional
gensim_non_tfidf_mapping_dict = {
    "guardian": {
        0: "Space Funding Bodies",
        1: "Rocket Launches - Moon Landing and ISS",
        2: "Discover of Sub-Atomic particles",
        3: "Mars Exploration",
        4: "Planetary Research",
        5: "Shuttle Missions and Crashes",
        6: "Academia",
        7: "Gravity and Black Holes - Hawking",
        8: "Black Body Radiation",
        9: "Dark Matter theories",
        10: "Pseudo space-science and Humanity - Opinion",
        11: "Studying Comets and Meteors",
        12: "Global Warming",
        13: "Space Funding Bodies",
        14: "Learning and Memory",
    }
}

# General inputs
# Sweep over the number of topics: the sklearn NMF loop uses range(start, limit + 1),
# and all three values are passed to compute_coherence_values for the Gensim NMF sweep
limit = 25
start = 10
step = 1

In [None]:
def calculate_coherence(w2v_model, term_rankings):
    """Return the mean pairwise term similarity, averaged over all topics.

    For each topic, every unordered pair of its descriptor terms is scored with
    ``w2v_model.wv.similarity`` and the pair scores are averaged; the per-topic
    means are then averaged across topics.

    Parameters
    ----------
    w2v_model : object
        Model exposing ``wv.similarity(term_a, term_b)``.
    term_rankings : sequence of sequence of str
        One list of descriptor terms per topic.

    Returns
    -------
    float
        Mean coherence across topics. ``0.0`` when ``term_rankings`` is empty.
        A topic with fewer than two terms has no pairs and contributes ``0.0``.

    Notes
    -----
    Errors raised by ``wv.similarity`` are not caught here (presumably a
    ``KeyError`` for a term missing from the model vocabulary - verify against
    the gensim version in use).
    """
    # No topics at all: avoid dividing by zero below
    if len(term_rankings) == 0:
        return 0.0

    overall_coherence = 0.0
    for topic_terms in term_rankings:
        # score each pair of terms in this topic
        pair_scores = [
            w2v_model.wv.similarity(first_term, second_term)
            for first_term, second_term in combinations(topic_terms, 2)
        ]
        # a topic with fewer than two terms yields no pairs; it adds nothing
        if pair_scores:
            overall_coherence += sum(pair_scores) / len(pair_scores)
    # get the mean score across all topics
    return overall_coherence / len(term_rankings)


def get_descriptor(all_terms, H, topic_index, top):
    """Return the ``top`` highest-weighted terms of one topic.

    Row ``topic_index`` of ``H`` is ranked from largest to smallest weight and
    the leading ``top`` column indices are looked up in ``all_terms``.
    """
    # ascending argsort, reversed, gives indices ordered by decreasing weight
    ranked_indices = np.argsort(H[topic_index, :])[::-1]
    return [all_terms[term_index] for term_index in ranked_indices[0:top]]


class TokenGenerator:
    """Iterable that yields one list of tokens per document.

    Tokens are runs of two or more word characters. A token found in
    ``stopwords`` is replaced by the placeholder ``"<stopword>"`` rather than
    dropped. Matching is done on the token as written (no case folding).
    Every new iteration prints a progress message first.
    """

    def __init__(self, documents, stopwords):
        self.documents = documents
        self.stopwords = stopwords
        self.tokenizer = re.compile(r"(?u)\b\w\w+\b")

    def __iter__(self):
        print("Building Word2Vec model ...")
        for text in self.documents:
            yield [
                "<stopword>" if word in self.stopwords else word
                for word in self.tokenizer.findall(text)
                if word in self.stopwords or len(word) >= 2
            ]

In [None]:
# Collect stop words from NLTK, spaCy and sklearn, then add the manual ones

# NLTK: download its resources only when no nltk_data folder exists two
# levels above the current working directory
nltk_data_dir = Path.cwd().parents[1] / "nltk_data"
if not nltk_data_dir.exists():
    for nltk_resource in (
        "punkt",
        "wordnet",
        "stopwords",
        "averaged_perceptron_tagger",
    ):
        nltk.download(nltk_resource)
nltk_stop_words = set(stopwords.words("english"))

# spaCy and sklearn ship their lists as importable constants
spacy_stop_words = STOP_WORDS
sklearn_stop_words = stop_words.ENGLISH_STOP_WORDS

# Words the other two libraries add on top of sklearn's list
spacy_not_in_sklearn = set(spacy_stop_words) - set(sklearn_stop_words)
nltk_not_in_sklearn = set(nltk_stop_words) - set(sklearn_stop_words)

# Union of all three sources, extended with the user-supplied words
all_stop_words = set(sklearn_stop_words) | spacy_not_in_sklearn | nltk_not_in_sklearn
all_stop_words.update(manual_stop_words)

<a id="load-joined-data"></a>

## 2. [Load joined data](#load-joined-data)

We'll start by loading the joined data from a publication, stored at `data/processed/<publication-name>_processed.csv`, into a `DataFrame`

In [None]:
# Read the processed articles and keep only the two columns used below
df = pd.read_csv(Path(data_dir_path))[["text", "year"]]
print(len(df))
display(df.head())

In [None]:
# df["text"] = df["text"].str.lower()

In [None]:
# Raw article texts as a plain Python list, one entry per row of df
corpus_raw = df.loc[:, "text"].values.tolist()

In [None]:
# Earlier vectorizer configuration, kept for reference (min_df=20, no accent stripping)
# vectorizer = TfidfVectorizer(
#     tokenizer=None,
#     lowercase=True,
#     ngram_range=(1, 1),
#     stop_words=all_stop_words,
#     min_df=20,
#     max_features=None,
#     binary=False,
#     strip_accents=None,
# )
# Unigram TF-IDF over lowercase alphabetic tokens of two or more letters,
# using the combined stop-word set assembled above
vectorizer = TfidfVectorizer(
    tokenizer=None,
    preprocessor=None,
    stop_words=all_stop_words,  # "all_stop_words" or "english"
    lowercase=True,
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
    binary=False,
    strip_accents="ascii",
    token_pattern="[a-z][a-z]+",
)
# NOTE(review): this pipeline is built but never fitted in this cell; the
# vectorizer is fitted directly on corpus_raw below, so TextCleaner is not
# applied to the TF-IDF input - confirm whether pipe.fit_transform was intended
pipe = Pipeline(
    steps=[("cleaner", TextCleaner(split=False)), ("vectorizer", vectorizer),]
)

# Document-term matrix: one row per article, one column per vocabulary term
docs_terms = vectorizer.fit_transform(corpus_raw)
print(
    f"Created {docs_terms.shape[0]:0d} X {docs_terms.shape[1]:0d} TF-IDF-normalized document-term matrix"
)

In [None]:
# Vocabulary of the fitted vectorizer; indexed by get_descriptor to name topic terms
terms = vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms" % len(terms))

In [None]:
cell_st = time()

# Fixed seed so that re-running the notebook reproduces the same factorizations
nmf_random_state = 42

# One sklearn NMF fit per candidate number of topics (start..limit inclusive).
# Each entry is (num_topics, document-topic matrix, topic-term matrix); later
# cells index this list as topic_models[k - start], which relies on the
# candidates being consecutive integers beginning at `start`.
topic_models = []
for num_topics in range(start, limit + 1):
    print(f"Applying NMF with {num_topics:0d} topics...")
    model = NMF(n_components=num_topics, max_iter=200, random_state=nmf_random_state)
    model_transformed = model.fit_transform(docs_terms)
    factors_dict = model.components_
    topic_models.append((num_topics, model_transformed, factors_dict))

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

In [None]:
cell_st = time()

# Train a Word2Vec model on the raw articles; it is used below to score topic
# coherence. Stop words are fed in as the "<stopword>" placeholder token.
# This line needs the top-level name `gensim` to be bound (a plain
# `import gensim`); `import gensim.corpora as corpora` alone binds only `corpora`.
docgen = TokenGenerator(corpus_raw, all_stop_words)
w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=20, sg=1)

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

In [None]:
# Number of distinct terms kept in the Word2Vec vocabulary
print("Model has %d terms" % len(w2v_model.wv.vocab))

In [None]:
cell_st = time()

# Score every fitted NMF model by the Word2Vec coherence of its topics
k_values = []
coherences = []
for (k, W, H) in topic_models:
    # Descriptor of each topic = its 10 highest-weighted terms
    term_rankings = [
        get_descriptor(terms, H, topic_index, 10) for topic_index in range(k)
    ]
    k_values.append(k)
    # Coherence of this model according to the Word2vec similarities
    model_coherence = calculate_coherence(w2v_model, term_rankings)
    coherences.append(model_coherence)
    print("K=%02d: Coherence=%.4f" % (k, model_coherence))

elapsed_seconds = time() - cell_st
total_minutes, total_seconds = divmod(elapsed_seconds, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

In [None]:
# Mean coherence versus number of topics, with the best k annotated.
# Uses the explicit Axes interface so `ax` really is the Axes object.
fig, ax = plt.subplots(figsize=(13, 7))
# line plot with a marker on every evaluated k
ax.plot(k_values, coherences)
ax.scatter(k_values, coherences, s=120)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Mean Coherence")
# find and annotate the maximum point on the plot
ymax = max(coherences)
xpos = coherences.index(ymax)
best_k = k_values[xpos]
ax.annotate(
    "k=%d" % best_k,
    xy=(best_k, ymax),
    # with textcoords="offset points" this is a displacement in points from
    # `xy`, so it must be a small fixed offset rather than the data values
    xytext=(8, 8),
    textcoords="offset points",
    fontsize=16,
)
plt.show()

In [None]:
# Number of topics chosen for inspection (hard-coded)
k = 15
# get the model that we generated earlier.
# topic_models holds one entry per k starting at `start`, so k - start is its position
W = topic_models[k - start][1]
H = topic_models[k - start][2]

In [None]:
# Print the 10 highest-weighted terms of each topic (topics numbered from 1 in the output)
for topic_index in range(k):
    descriptor = get_descriptor(terms, H, topic_index, 10)
    str_descriptor = ", ".join(descriptor)
    print("Topic %02d: %s" % (topic_index + 1, str_descriptor))

<a id="topic-modeling-using-gensim-nmf-without-tfidf-vectorization"></a>

## 4. [Topic modeling using Gensim NMF without TFIDF vectorization](#topic-modeling-using-gensim-nmf-without-tfidf-vectorization)

Here, we'll use Gensim's implementation of NMF, without TFIDF vectorization, to retrieve topics. This will be done without TFIDF Vectorization from either [`sklearn`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn-feature-extraction-text-tfidfvectorizer) or [`gensim`](https://radimrehurek.com/gensim/models/tfidfmodel.html#gensim.models.tfidfmodel.TfidfModel) itself.

<a id="pre-processing-for-gensim-nmf"></a>

### 4.1. [Pre-processing for Gensim NMF](#pre-processing-for-gensim-nmf)

First, we'll clean the text of the articles

In [None]:
cell_st = time()

# Run every raw article through the project's TextCleaner. This rebinds `pipe`,
# replacing the cleaner + vectorizer pipeline object created earlier.
pipe = Pipeline(steps=[("cleaner", TextCleaner(split=False))])
corpus_raw_cleaned = pipe.fit_transform(corpus_raw)

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

We'll now tokenize the cleaned sentences into a list of words

In [None]:
cell_st = time()

# Tokenize the cleaned articles (the commented line is the variant on uncleaned text)
# data_words = list(sent_to_words(corpus_raw))
data_words = list(sent_to_words(corpus_raw_cleaned))

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

We'll use Gensim's `Phrases` module to build bigram and trigram models

In [None]:
cell_st = time()

# Build the phrase models from the tokenized articles.
# NOTE(review): 5 and 100 are presumably the Phrases min_count and threshold -
# confirm against src.gensim_helpers.get_bigrams_trigrams
bigram_model, trigram_model = get_bigrams_trigrams(data_words, 5, 100)

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

Next, we'll perform the following pre-processing
- remove stopwords
- (optional) create bigrams
- (optional) lemmatize

In [None]:
cell_st = time()

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words, all_stop_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Lemmatization (optional) is disabled: no `lemmatization` helper is imported
# or defined in this notebook, so calling it raised NameError on a fresh
# kernel, and the later cells build the dictionary, corpus and model from
# data_words_nostops rather than from data_lemmatized. To re-enable it, import
# a lemmatization helper in the imports cell and uncomment the call below.
# data_lemmatized = lemmatization(
#     data_words_nostops, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]
# )

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

Now, we'll create a corpus comprising an assigned ID and corresponding frequency of words from the cleaned list of words (where stopwords were removed) above

In [None]:
cell_st = time()

# Create Dictionary
# Built from the stop-word-filtered tokens; the lemmatized variant is kept commented out
# id2word = corpora.Dictionary(data_lemmatized)
id2word = corpora.Dictionary(data_words_nostops)

# Term Document Frequency for corpus
# One bag-of-words (doc2bow) representation per article
# corpus = [id2word.doc2bow(text) for text in data_lemmatized]
corpus = [id2word.doc2bow(text) for text in data_words_nostops]

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds")

<a id="gensim-nmf"></a>

### 4.2. [Gensim NMF](#gensim-nmf)

We'll now train Gensim's NMF model. A helper function below will iterate over the number of topics and compute the coherence score for each number.

In [None]:
cell_st = time()

# Sweep the number of topics with the project helper; it returns the fitted
# models (later indexed by number of topics, e.g. model_dict[15]) and their
# coherence scores.
# NOTE(review): whether `limit` is inclusive here depends on the helper - confirm
model_dict, coherence_values = compute_coherence_values(
    corpus=corpus,
    id2word=id2word,
    # texts=data_lemmatized,
    texts=data_words_nostops,
    limit=limit,
    start=start,
    step=step,
)

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

The coherence scores are graphed below by number of topics used, with an [annotation showing](https://matplotlib.org/2.0.0/users/annotations.html#annotating-with-text-with-box) the number of topics with the highest coherence score

In [None]:
# Plot coherence against number of topics; (8, 6) is presumably the figure size - confirm in the helper
plot_coherence_scores(coherence_values, start, limit, step, (8, 6))

**Observations**
1. There is evidence of an increasing trend in the coherence score with the number of topics, with a periodicity appearing approx. every 5 topics. This could be an artifact of the choice of the other hyperparameters in the Gensim NMF model. Optimization of these could be more revealing for choosing the best NMF model here.
2. The best score occurs for 20 topics and is only approx. 0.03 larger than that for the number of topics used in NMF/CorEx approaches (15).

Since the difference between the previously used 15 topics and highest coherence score here (20) is so small, we'll use 15 topics for further exploration

In [None]:
# Pick the 15-topic model (hard-coded; the topic-name mapping in the user inputs has 15 entries)
best_model = model_dict[15]

Now, we'll print out all the topics found from the Gensim NMF model

In [None]:
# Show as many topics as there are labels defined for this publication
n_labelled_topics = len(gensim_non_tfidf_mapping_dict[publication_name])
topic_strings = best_model.show_topics(num_topics=n_labelled_topics, num_words=10)

# Keep only letters and spaces from each topic string
twords = {topic: re.sub("[^A-Za-z ]+", "", word) for topic, word in topic_strings}
for topic, words_cleaned in twords.items():
    print(f"Topic {topic}:", words_cleaned.replace("  ", " "))

```
Topic 0: science scientific research scientists public people uk world technology new
Topic 1: people like says think dont time things going thats know
Topic 2: ice climate change energy carbon global sea particles water warming
Topic 3: time theory work physics scientific world human way quantum new
Topic 4: space said satellites satellite rocket earth launch orbit says company
Topic 5: universe matter dark particles light stars black gravitational particle theory
Topic 6: said dawkins university research years people work like memory black
Topic 7: says brain new research cells human work use university body
Topic 8: moon lunar earth surface said mission apollo time moons spacecraft
Topic 9: planet earth planets solar sun star asteroid orbit years astronomers
Topic 10: comet new solar mission spacecraft sun rosetta launch comets philae
Topic 11: stars life planets telescope light memory star cells way years
Topic 12: space station shuttle astronauts nasa mission astronaut crew flight russian
Topic 13: life water mars planet surface scientists said martian space atmosphere
Topic 14: mars nasa mission missions nasas landing going launch earth astronauts
```

In [None]:
cell_st = time()

# Horizontal bar charts for the selected model, labelled with the topic names
# for this publication; the return value is discarded
_ = plot_horiz_bar_gensim(
    best_model,
    id2word,
    gensim_non_tfidf_mapping_dict[publication_name],
    fig_size=(40, 35),
)

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds")

We'll append the topic to the same row as each document in the original data

In [None]:
cell_st = time()

# Combine the model's topics with the source rows of df. Later cells read the
# "most_popular_topic" and "year" columns from the result.
df_with_topics = format_topics_sentences(
    best_model, corpus, df, gensim_non_tfidf_mapping_dict[publication_name]
)
display(df_with_topics.head(2))

# Report wall-clock time for this cell
total_minutes, total_seconds = divmod(time() - cell_st, 60)
print(
    f"Cell exection time: {int(total_minutes):d} minutes, {total_seconds:.2f} seconds"
)

<a id="exploring-gensim-nmf-topics-combined-with-source-data"></a>

### 4.3. [Exploring Gensim NMF topics combined with source data](#exploring-gensim-nmf-topics-combined-with-source-data)

Here, we will show a heatmap of the most popular topic by year, found by Gensim's implementation of NMF (recall this was done above without TFIDF Vectorization)

In [None]:
# Number of articles per (most popular topic, year), sorted in descending
# order of topic, then count, then year
topic_year_counts = (
    df_with_topics.groupby(["most_popular_topic", "year"])
    .size()
    .reset_index(name="count")
)
topics_by_timeframe = topic_year_counts.sort_values(
    by=["most_popular_topic", "count", "year"], ascending=False
)
topics_by_timeframe.head()

In [None]:
# Heatmap of article counts: year on x (ordinal), most popular topic on y
# (nominal), colored by count. save_to_html is False, so file_path is only
# passed through here.
altair_datetime_heatmap(
    topics_by_timeframe,
    x="year:O",
    y="most_popular_topic:N",
    xtitle="Year",
    ytitle="Most popular topic",
    tooltip=[
        {"title": "Year", "field": "year", "type": "ordinal",},
        {
            "title": "Most popular topic",
            "field": "most_popular_topic",
            "type": "nominal",
        },
        {
            "title": "Number of occurrences as main topic",
            "field": "count",
            "type": "quantitative",
        },
    ],
    cmap="yelloworangered",
    legend_title="",
    color_by_col="count:Q",
    yscale="log",
    axis_tick_font_size=12,
    axis_title_font_size=16,
    title_font_size=20,
    legend_fig_padding=10,  # default is 18
    y_axis_title_alignment="left",
    fwidth=700,
    fheight=450,
    file_path=Path().cwd() / "reports" / "figures" / "my_heatmap.html",
    save_to_html=False,
    sort_y=[],
    sort_x=[],
)

Next, we will show a bar chart of the number of occurrences of `"Space Funding Bodies"` as the most popular topic, relative to the count in the earliest year in the data
- this will approximate the public interest in changes in this topic over the years investigated

In [None]:
# Yearly article counts for topic 0, indexed by year in ascending order.
# The filter uses the topic id; the label-based variant would be
# topics_by_timeframe["most_popular_topic"] == "Space Funding Bodies"
funds = (
    topics_by_timeframe[topics_by_timeframe["most_popular_topic"] == 0]
    .set_index("year")["count"]
    .sort_index()
)
# Express every year relative to the count in the earliest year present
funds = funds / funds.loc[funds.index.min()]

In [None]:
# Bar chart of the relative yearly counts held in `funds`
fig, ax = plt.subplots(figsize=(15, 5))
funds.plot(kind="bar", ax=ax, rot=45, align="edge", width=0.8)
ax.set_title(
    "Cyclic variation in funding as main topic in article",
    fontsize=18,
    fontweight="bold",
)
# Empty string rather than None, which older matplotlib renders as the text "None"
ax.set_xlabel("")
# The baseline is the earliest year in the index (the one `funds` was divided
# by), so the label is derived from the data instead of being hard-coded
baseline_year = funds.index.min()
h = ax.set_ylabel(
    f"Funding\n(rel. to {baseline_year})", labelpad=65, fontweight="bold"
)
h.set_rotation(0)

**Observations**
1. Both the `sklearn` NMF and CorEx implementations show a broadened peak centered at 2014 and a weaker+narrower peak in articles published under this topic in 2004. The latter appears here, and is the strongest peak, while the former (2014 peak) is not evident.