In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the pre-split, processed datasets (paths are relative to the notebook).
X_train = pd.read_csv("../data/processed/train.csv")
X_val = pd.read_csv("../data/processed/val.csv")
X_test = pd.read_csv("../data/processed/test.csv")

# Last 100 training rows.
# NOTE(review): presumably a small sample for quick experiments; it is not
# referenced by any of the cells visible in this notebook -- confirm it is needed.
X = X_train.tail(100)

In [None]:
# Extra stop words handed to the pipelines below through ``user_stopwords``.
# NOTE(review): "fig.", "al." and the capitalised entries ("Elsevier", "PMC",
# "CZI") can only match if the project Vectorizer keeps punctuation and case
# when tokenising -- confirm against src.model.Vectorizer.
STOP_WORDS = [
    "doi", "preprint", "copyright", "peer", "reviewed", "org",
    "https", "et", "al", "author", "figure", "rights",
    "reserved", "permission", "used", "using", "arxiv", "license",
    "fig", "fig.", "al.", "Elsevier", "PMC", "CZI",
]


In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

from src.abstract import FormatText
from src.model import Vectorizer, SparsePCA, LDACluster

In [None]:
def kmeans_pipeline(
    vectorizer_kwargs, user_stopwords, svd_kwargs, kmeans_kwargs, random_state=0
):
    """Build an unfitted text-clustering pipeline ending in k-means.

    The returned pipeline has two named steps:

    * ``"features"`` -- a nested pipeline of ``formatter`` (``FormatText``),
      ``vectorizer`` (project ``Vectorizer`` in ``"tf-idf"`` mode), ``svd``
      (``sklearn.decomposition.TruncatedSVD``) and ``normalizer``
      (``Normalizer(copy=False)``).
    * ``"kmeans"`` -- the ``KMeans`` estimator.

    Keeping the feature steps in their own sub-pipeline lets callers obtain
    the reduced feature matrix via ``pipeline["features"].transform(...)``.

    Parameters
    ----------
    vectorizer_kwargs : dict
        Forwarded to ``Vectorizer`` as ``vectorizer_kwargs``.
    user_stopwords : collection of str
        Forwarded to ``Vectorizer`` as ``user_stopwords``.
    svd_kwargs : dict
        Keyword arguments for ``TruncatedSVD``. Must not contain
        ``random_state``; it is supplied separately and a duplicate keyword
        raises ``TypeError``.
    kmeans_kwargs : dict
        Keyword arguments for ``KMeans``. Must not contain ``random_state``
        for the same reason.
    random_state : int, default 0
        Seed shared by the SVD and k-means steps.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    feature_steps = [
        ("formatter", FormatText()),
        (
            "vectorizer",
            Vectorizer(
                vectorizer="tf-idf",
                vectorizer_kwargs=vectorizer_kwargs,
                user_stopwords=user_stopwords,
            ),
        ),
        ("svd", TruncatedSVD(**svd_kwargs, random_state=random_state)),
        ("normalizer", Normalizer(copy=False)),
    ]
    kmeans = KMeans(**kmeans_kwargs, random_state=random_state)

    return Pipeline([("features", Pipeline(feature_steps)), ("kmeans", kmeans)])


def optimize_k_clusters(
    X, cluster_range, vectorizer_kwargs, svd_kwargs, user_stopwords
):
    """Compute the mean k-means distortion of ``X`` for each candidate ``k``.

    For every value in ``cluster_range`` a fresh ``kmeans_pipeline`` is fitted
    on ``X``; the distortion is the average Euclidean distance from each
    sample (in the reduced feature space) to its nearest cluster centre.

    Parameters
    ----------
    X : array-like
        Input accepted by the pipeline built by ``kmeans_pipeline``.
    cluster_range : iterable of int
        Candidate numbers of clusters.
    vectorizer_kwargs : dict
        Forwarded to ``kmeans_pipeline``.
    svd_kwargs : dict
        Forwarded to ``kmeans_pipeline``.
    user_stopwords : collection of str
        Forwarded to ``kmeans_pipeline``.

    Returns
    -------
    list of float
        One distortion value per entry of ``cluster_range``, in order.
    """
    from scipy.spatial.distance import cdist

    distortions = []
    for n_clusters in cluster_range:
        pipeline = kmeans_pipeline(
            vectorizer_kwargs=vectorizer_kwargs,
            svd_kwargs=svd_kwargs,
            kmeans_kwargs={"n_clusters": n_clusters},
            user_stopwords=user_stopwords,
        )
        pipeline.fit(X)

        # Score the data that was passed in (and fitted on), so the result
        # does not depend on any notebook-level variable.
        X_reduced = pipeline["features"].transform(X)
        nearest_centre_dist = cdist(
            X_reduced, pipeline["kmeans"].cluster_centers_, "euclidean"
        ).min(axis=1)
        distortions.append(nearest_centre_dist.sum() / X_reduced.shape[0])
    return distortions


def plot_k_cluster_elbow(
    X, cluster_range, vectorizer_kwargs=None, svd_kwargs=None, user_stopwords=None
):
    """Draw an elbow plot of k-means distortion against the number of clusters.

    Distortions come from ``optimize_k_clusters``. They are drawn as a blue
    line, together with a red straight line joining the first and last points
    as a visual reference.

    Parameters
    ----------
    X : array-like
        Data forwarded to ``optimize_k_clusters``.
    cluster_range : iterable of int
        Candidate numbers of clusters; must contain at least one value.
    vectorizer_kwargs : dict, optional
        Forwarded to ``optimize_k_clusters``; an empty dict when omitted.
    svd_kwargs : dict, optional
        Forwarded to ``optimize_k_clusters``; an empty dict when omitted.
    user_stopwords : collection of str, optional
        Forwarded to ``optimize_k_clusters``; an empty collection when omitted.

    Raises
    ------
    ValueError
        If ``cluster_range`` is empty.
    """
    # ``None`` sentinels avoid sharing one mutable default object across calls.
    vectorizer_kwargs = {} if vectorizer_kwargs is None else vectorizer_kwargs
    svd_kwargs = {} if svd_kwargs is None else svd_kwargs
    user_stopwords = {} if user_stopwords is None else user_stopwords

    # Materialise once: the values are both iterated over and indexed below,
    # which a one-shot iterator or generator could not support.
    ks = list(cluster_range)
    if not ks:
        raise ValueError("cluster_range must contain at least one value")

    # Compute before creating the figure so a failure leaves no empty plot.
    distortions = optimize_k_clusters(
        X,
        ks,
        vectorizer_kwargs=vectorizer_kwargs,
        svd_kwargs=svd_kwargs,
        user_stopwords=user_stopwords,
    )

    fig, ax = plt.subplots(1, 1, figsize=(6, 4))

    # Plot the elbow
    ax.plot(ks, distortions, "b-")
    # Straight reference line between the two end points.
    ax.plot([ks[0], ks[-1]], [distortions[0], distortions[-1]], "r")
    ax.set_xlabel("k")
    ax.set_ylabel("Distortion")


In [None]:
# Elbow scan over k = 2..29 on the full training set.
cluster_range = range(2, 30)
plot_k_cluster_elbow(
    X=X_train,
    cluster_range=cluster_range,
    vectorizer_kwargs={
        "max_df": 0.95,
        "min_df": 3,
        "ngram_range": (1, 1),
    },
    svd_kwargs={"n_components": 100},
    user_stopwords=STOP_WORDS,
)

In [None]:
def tsne_pipeline(
    user_stopwords=None,
    vectorizer_kwargs=None,
    pca_kwargs=None,
    kmeans_kwargs=None,
    tsne_kwargs=None,
    random_state=0,
    verbose=0,
):
    """Build an unfitted pipeline that clusters text and embeds it with t-SNE.

    The returned pipeline has two named steps:

    * ``"features"`` -- nested pipeline of ``formatter``, ``vectorizer``,
      ``pca`` (project ``SparsePCA``), ``normalizer`` and ``kmeans``. Because
      ``kmeans`` is its last step, ``pipeline["features"].predict(X)`` yields
      cluster labels, and the next step receives the output of
      ``KMeans.transform`` (distances to the cluster centres).
    * ``"tsne"`` -- the ``TSNE`` estimator.

    Parameters
    ----------
    user_stopwords : collection of str, optional
        Forwarded to ``Vectorizer``; an empty collection when omitted.
    vectorizer_kwargs : dict, optional
        Forwarded to ``Vectorizer`` as ``vectorizer_kwargs``.
    pca_kwargs : dict, optional
        Keyword arguments for ``SparsePCA`` (must not contain ``random_state``).
    kmeans_kwargs : dict, optional
        Keyword arguments for ``KMeans`` (must not contain ``random_state``).
    tsne_kwargs : dict, optional
        Keyword arguments for ``TSNE`` (must not contain ``verbose``). A
        ``random_state`` entry here overrides the shared ``random_state``.
    random_state : int, default 0
        Seed for the PCA, k-means and t-SNE steps.
    verbose : int, default 0
        Verbosity passed to ``TSNE``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    # ``None`` sentinels avoid sharing one mutable default object across calls.
    user_stopwords = {} if user_stopwords is None else user_stopwords
    vectorizer_kwargs = {} if vectorizer_kwargs is None else vectorizer_kwargs
    pca_kwargs = {} if pca_kwargs is None else pca_kwargs
    kmeans_kwargs = {} if kmeans_kwargs is None else kmeans_kwargs
    tsne_kwargs = {} if tsne_kwargs is None else tsne_kwargs

    formatter = FormatText()
    # NOTE(review): no ``vectorizer=`` mode is given, so the project
    # Vectorizer's default applies -- confirm it is the intended one.
    vectorizer = Vectorizer(
        vectorizer_kwargs=vectorizer_kwargs, user_stopwords=user_stopwords
    )
    pca = SparsePCA(**pca_kwargs, random_state=random_state)
    normalizer = Normalizer(copy=False)
    kmeans = KMeans(**kmeans_kwargs, random_state=random_state)
    # Seed t-SNE as well so the embedding is reproducible; an explicit
    # ``random_state`` in ``tsne_kwargs`` still takes precedence.
    tsne = TSNE(**{"random_state": random_state, **tsne_kwargs}, verbose=verbose)

    pipeline = Pipeline(
        [
            (
                "features",
                Pipeline(
                    [
                        ("formatter", formatter),
                        ("vectorizer", vectorizer),
                        ("pca", pca),
                        ("normalizer", normalizer),
                        ("kmeans", kmeans),
                    ]
                ),
            ),
            ("tsne", tsne),
        ]
    )
    return pipeline


def plot_tsne_results(X, user_stopwords=None, n_clusters=20):
    """Scatter-plot a 2-D t-SNE embedding of ``X`` coloured by k-means cluster.

    Builds a ``tsne_pipeline`` with fixed vectoriser, PCA and t-SNE settings,
    fits it on ``X``, and draws the embedding with one colour per cluster.

    Parameters
    ----------
    X : array-like
        Input accepted by the pipeline built by ``tsne_pipeline``.
    user_stopwords : collection of str, optional
        Forwarded to ``tsne_pipeline``; an empty collection when omitted.
    n_clusters : int, default 20
        Number of k-means clusters, also used as the number of palette colours
        so the two cannot drift apart.

    Notes
    -----
    ``sns.set`` changes the global seaborn/matplotlib style, so plots drawn
    after this call are affected too.
    """
    # ``None`` sentinel avoids sharing one mutable default object across calls.
    if user_stopwords is None:
        user_stopwords = {}

    sns.set(rc={"figure.figsize": (13, 9)})

    palette = sns.hls_palette(n_clusters, l=0.4, s=0.9)

    pipeline = tsne_pipeline(
        user_stopwords=user_stopwords,
        vectorizer_kwargs={"max_df": 0.95, "min_df": 3, "ngram_range": (1, 1)},
        pca_kwargs={"n_components": 100},
        kmeans_kwargs={"n_clusters": n_clusters},
        tsne_kwargs={"perplexity": 50, "init": "pca", "learning_rate": "auto"},
    )

    X_tsne = pipeline.fit_transform(X)
    # The "features" sub-pipeline ends in k-means, so predict() gives the
    # cluster label of every sample.
    hue = pipeline["features"].predict(X)
    ax = sns.scatterplot(
        x=X_tsne[:, 0], y=X_tsne[:, 1], hue=hue, legend="full", palette=palette
    )
    ax.set_title("t-SNE with Kmeans Labels")

In [None]:
# t-SNE view of the full training set, coloured by k-means cluster.
plot_tsne_results(X=X_train, user_stopwords=STOP_WORDS)

In [None]:
def lda_pipeline(
    X, n_clusters, user_stopwords, vectorizer_kwargs, lda_kwargs, random_state
):
    """Build an unfitted pipeline that labels clusters with ``LDACluster``.

    Steps, in order: ``"formatter"`` (``FormatText``), ``"vectorizer"``
    (project ``Vectorizer`` in ``"counts"`` mode) and ``"labeller"``
    (``LDACluster``).

    Parameters
    ----------
    X : array-like
        Not used by this function; kept for interface compatibility.
    n_clusters : int
        Forwarded to ``LDACluster``.
    user_stopwords : collection of str
        Forwarded to ``Vectorizer``.
    vectorizer_kwargs : dict
        Forwarded to ``Vectorizer`` as ``vectorizer_kwargs``.
    lda_kwargs : dict
        Forwarded to ``LDACluster`` as ``lda_kwargs``.
    random_state : int
        Forwarded to ``LDACluster``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    steps = [
        ("formatter", FormatText()),
        (
            "vectorizer",
            Vectorizer(
                vectorizer="counts",
                vectorizer_kwargs=vectorizer_kwargs,
                user_stopwords=user_stopwords,
            ),
        ),
        (
            "labeller",
            LDACluster(
                n_clusters=n_clusters,
                random_state=random_state,
                lda_kwargs=lda_kwargs,
            ),
        ),
    ]
    return Pipeline(steps)


def lda_labels(pipeline, words_per_topic):
    """Collect the top topic words of every cluster labeller in ``pipeline``.

    Parameters
    ----------
    pipeline : mapping-like
        Must provide ``pipeline["vectorizer"].vectorizer.get_feature_names_out()``
        and ``pipeline["labeller"].labellers``, an iterable of objects that
        each expose a 2-D ``components_`` array (one row per topic).
    words_per_topic : int
        Number of highest-weighted words taken from each topic.

    Returns
    -------
    list of numpy.ndarray
        One array per labeller holding the sorted, de-duplicated union of the
        top words across that labeller's topics.
    """
    vocabulary = pipeline["vectorizer"].vectorizer.get_feature_names_out()

    def top_words(weights):
        # argsort is ascending, so walk the last ``words_per_topic`` indices
        # backwards to get the heaviest words first.
        return [vocabulary[j] for j in weights.argsort()[: -words_per_topic - 1 : -1]]

    labels = []
    for labeller in pipeline["labeller"].labellers:
        per_topic = [top_words(weights) for weights in labeller.components_]
        labels.append(np.unique(per_topic))
    return labels


def get_lda_labels(
    X,
    words_per_topic,
    user_stopwords=None,
    vectorizer_kwargs=None,
    svd_kwargs=None,
    kmeans_kwargs=None,
    lda_kwargs=None,
    random_state=0,
):
    """Cluster ``X`` with k-means, then describe each cluster by LDA keywords.

    The data is clustered with ``kmeans_pipeline``. The resulting cluster
    assignments are passed as ``y`` when fitting ``lda_pipeline`` on the same
    data, and the keywords are extracted with ``lda_labels``.

    Parameters
    ----------
    X : array-like
        Input accepted by both pipelines.
    words_per_topic : int
        Number of top words taken from each topic.
    user_stopwords : collection of str, optional
        Forwarded to both pipelines; an empty collection when omitted.
    vectorizer_kwargs : dict, optional
        Forwarded to both pipelines.
    svd_kwargs : dict, optional
        Forwarded to ``kmeans_pipeline``.
    kmeans_kwargs : dict, optional
        Forwarded to ``kmeans_pipeline``.
    lda_kwargs : dict, optional
        Forwarded to ``lda_pipeline``.
    random_state : int, default 0
        Seed forwarded to both pipelines.

    Returns
    -------
    list of numpy.ndarray
        The result of ``lda_labels`` for the fitted labelling pipeline.
    """
    # ``None`` sentinels avoid sharing one mutable default object across calls.
    user_stopwords = {} if user_stopwords is None else user_stopwords
    vectorizer_kwargs = {} if vectorizer_kwargs is None else vectorizer_kwargs
    svd_kwargs = {} if svd_kwargs is None else svd_kwargs
    kmeans_kwargs = {} if kmeans_kwargs is None else kmeans_kwargs
    lda_kwargs = {} if lda_kwargs is None else lda_kwargs

    clustering_pipeline = kmeans_pipeline(
        user_stopwords=user_stopwords,
        vectorizer_kwargs=vectorizer_kwargs,
        svd_kwargs=svd_kwargs,
        kmeans_kwargs=kmeans_kwargs,
        random_state=random_state,
    )

    # Take the cluster count from the estimator itself, so the labeller
    # matches KMeans even when ``kmeans_kwargs`` omits "n_clusters" and KMeans
    # falls back to its own default.
    n_clusters = clustering_pipeline["kmeans"].n_clusters

    labelling_pipeline = lda_pipeline(
        X,
        n_clusters=n_clusters,
        user_stopwords=user_stopwords,
        vectorizer_kwargs=vectorizer_kwargs,
        lda_kwargs=lda_kwargs,
        random_state=random_state,
    )
    y = clustering_pipeline.fit_predict(X)
    labelling_pipeline.fit(X, y)
    return lda_labels(labelling_pipeline, words_per_topic)


In [None]:
# Keyword labels for 20 k-means clusters of the training set
# (5 LDA topics per cluster, 5 words per topic).
labels = get_lda_labels(
    X=X_train,
    words_per_topic=5,
    user_stopwords=STOP_WORDS,
    vectorizer_kwargs={
        "max_df": 0.95,
        "min_df": 3,
        "ngram_range": (1, 1),
    },
    svd_kwargs={"n_components": 100},
    kmeans_kwargs={"n_clusters": 20},
    lda_kwargs={
        "n_components": 5,
        "learning_method": "online",
    },
)
