# Bullet ArXiv
 
Analysis of astronomy sub-fields with ArXiv paper abstracts.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the pre-split CSV files; the paths are listed in the same order as the
# names they are bound to (train, validation, test).
X_train, X_val, X_test = map(
    pd.read_csv,
    (
        '../data/processed/train.csv',
        '../data/processed/val.csv',
        '../data/processed/test.csv',
    ),
)

In [3]:
# Seed handed to the pipeline builders below as `random_state`, so the SVD,
# KMeans and t-SNE steps are all seeded from one place.
RANDOM_STATE = 0

## Document Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer

from src.abstract import FormatText
from src.model import Tokenizer

# Custom stop words passed as `user_stopwords` to the project Tokenizer and to
# get_lda_labels in the cells below.
# NOTE(review): "Elsevier", "PMC" and "CZI" are capitalised, and "fig." / "al."
# carry trailing punctuation. If the tokenizer lowercases text or strips
# punctuation before filtering, these entries can never match -- confirm
# against src.model.Tokenizer.
STOP_WORDS = [
    "doi",
    "preprint",
    "copyright",
    "peer",
    "reviewed",
    "org",
    "https",
    "et",
    "al",
    "author",
    "figure",
    "rights",
    "reserved",
    "permission",
    "used",
    "using",
    "arxiv",
    "license",
    "fig",
    "fig.",
    "al.",
    "Elsevier",
    "PMC",
    "CZI",
]

In [None]:
def kmeans_pipeline(
    vectorizer_kwargs, svd_kwargs, kmeans_kwargs, random_state=0
):
    """Build an unfitted text-clustering pipeline.

    The returned pipeline has two named steps:

    * ``"preprocessing"`` -- a nested pipeline of ``FormatText`` ->
      ``TfidfVectorizer`` -> ``TruncatedSVD`` -> ``Normalizer``;
    * ``"kmeans"`` -- the ``KMeans`` estimator.

    Args:
        vectorizer_kwargs: Keyword arguments for ``TfidfVectorizer``.
        svd_kwargs: Keyword arguments for ``TruncatedSVD``.
        kmeans_kwargs: Keyword arguments for ``KMeans``.
        random_state: Seed given to both ``TruncatedSVD`` and ``KMeans``.

    Returns:
        The assembled (not yet fitted) ``Pipeline``.
    """
    preprocessing_steps = [
        ("formatter", FormatText()),
        ("vectorizer", TfidfVectorizer(**vectorizer_kwargs)),
        ("svd", TruncatedSVD(**svd_kwargs, random_state=random_state)),
        ("normalizer", Normalizer()),
    ]
    clusterer = KMeans(**kmeans_kwargs, random_state=random_state)

    return Pipeline(
        [
            ("preprocessing", Pipeline(preprocessing_steps)),
            ("kmeans", clusterer),
        ]
    )


def kmeans_distortions(
    X,
    k_range,
    vectorizer_kwargs=None,
    svd_kwargs=None,
    kmeans_kwargs=None,
    **kwargs
):
    """Compute the KMeans distortion for every cluster count in ``k_range``.

    For each ``k`` a fresh pipeline is built with ``kmeans_pipeline`` and
    fitted on ``X``. The distortion is the mean Euclidean distance from each
    preprocessed sample to its nearest cluster centre.

    Args:
        X: Raw documents accepted by the pipeline's preprocessing step.
        k_range: Iterable of cluster counts to evaluate.
        vectorizer_kwargs: Keyword arguments for ``TfidfVectorizer``.
        svd_kwargs: Keyword arguments for ``TruncatedSVD``.
        kmeans_kwargs: Keyword arguments for ``KMeans``. Any ``n_clusters``
            entry is overridden by the current ``k``; the dict itself is
            never modified.
        **kwargs: ``random_state`` is honoured when given (falling back to
            the module-level ``RANDOM_STATE``); other keys are ignored.

    Returns:
        A list of distortions, one per value of ``k``, in iteration order.
    """
    from scipy.spatial.distance import cdist

    vectorizer_kwargs = {} if vectorizer_kwargs is None else vectorizer_kwargs
    svd_kwargs = {} if svd_kwargs is None else svd_kwargs
    base_kmeans_kwargs = {} if kmeans_kwargs is None else kmeans_kwargs
    random_state = kwargs.get("random_state", RANDOM_STATE)

    distortions = []
    for k in k_range:
        pipeline = kmeans_pipeline(
            vectorizer_kwargs=vectorizer_kwargs,
            svd_kwargs=svd_kwargs,
            # Build a new dict per k so the caller's dict (or a shared
            # default) is never mutated.
            kmeans_kwargs={**base_kmeans_kwargs, "n_clusters": k},
            random_state=random_state,
        )
        # Fit the two stages explicitly: this is what Pipeline.fit does, but
        # it keeps the preprocessed features so the corpus is not pushed
        # through the preprocessing a second time to measure distances.
        X_svd = pipeline["preprocessing"].fit_transform(X)
        kmeans = pipeline["kmeans"].fit(X_svd)

        nearest = cdist(X_svd, kmeans.cluster_centers_, "euclidean").min(axis=1)
        distortions.append(nearest.sum() / X_svd.shape[0])

    return distortions

def plot_kmeans_elbow(
    X,
    k_range,
    vectorizer_kwargs=None,
    svd_kwargs=None,
    kmeans_kwargs=None,
    **kwargs
):
    """Plot KMeans distortion against the number of clusters.

    Distortions come from ``kmeans_distortions``. A dashed chord joining the
    first and last points is drawn as a visual reference for the elbow.

    Args:
        X: Raw documents to cluster.
        k_range: Iterable of cluster counts; it is materialised once, so
            one-shot iterables are accepted.
        vectorizer_kwargs: Keyword arguments for ``TfidfVectorizer``.
        svd_kwargs: Keyword arguments for ``TruncatedSVD``.
        kmeans_kwargs: Keyword arguments for ``KMeans``.
        **kwargs: Forwarded unchanged to ``kmeans_distortions``.

    Returns:
        The matplotlib ``Figure`` containing the plot.
    """
    ks = list(k_range)

    # Pass shallow copies so nothing done downstream can leak back into the
    # caller's dictionaries.
    distortions = kmeans_distortions(
        X,
        ks,
        vectorizer_kwargs=dict(vectorizer_kwargs or {}),
        svd_kwargs=dict(svd_kwargs or {}),
        kmeans_kwargs=dict(kmeans_kwargs or {}),
        **kwargs
    )

    # Create the figure only after the (slow, fallible) fitting succeeded.
    fig, ax = plt.subplots(1, 1, figsize=(6, 4))
    ax.plot(ks, distortions, "C0-")
    if ks:
        # Reference chord between the end points; skipped for an empty range.
        ax.plot([ks[0], ks[-1]], [distortions[0], distortions[-1]], "k--")
    ax.set_xlabel("k")
    ax.set_ylabel("Distortion")
    ax.set_title('KMeans Elbow Plot')
    return fig

In [None]:
# Elbow plot on the training split for k = 5, 10, ..., 45, saved as a PNG.
fig = plot_kmeans_elbow(
    X_train,
    k_range=range(5, 50, 5),
    vectorizer_kwargs=dict(
        tokenizer=Tokenizer(user_stopwords=STOP_WORDS, language="english"),
        max_df=0.95,
        min_df=3,
        ngram_range=(1, 1),
        analyzer="word",
    ),
    svd_kwargs=dict(n_components=100),
    kmeans_kwargs=dict(),
    # Accepted by plot_kmeans_elbow through its **kwargs.
    random_state=RANDOM_STATE,
    verbose=1,
)
fig.savefig(
    '../figures/kmeans_elbow_plot.png',
    bbox_inches='tight',
    facecolor='white',
    edgecolor='none',
    transparent=False,
)

## Visualizing Clustering

In [None]:
from sklearn.manifold import TSNE

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


def tsne_pipeline(
    vectorizer_kwargs, svd_kwargs, kmeans_kwargs, tsne_kwargs, random_state=0
):
    """Build an unfitted pipeline: preprocessing -> KMeans -> t-SNE.

    Named steps:

    * ``"preprocessing"`` -- nested pipeline of ``FormatText`` ->
      ``TfidfVectorizer`` -> ``TruncatedSVD`` -> ``Normalizer(copy=False)``;
    * ``"kmeans"`` -- the ``KMeans`` estimator;
    * ``"tsne"`` -- the ``TSNE`` embedding.

    NOTE(review): "kmeans" is an intermediate step, so the pipeline hands
    t-SNE the output of ``KMeans.transform`` (cluster-distance space) rather
    than the normalised SVD features -- confirm this is intended.

    Args:
        vectorizer_kwargs: Keyword arguments for ``TfidfVectorizer``.
        svd_kwargs: Keyword arguments for ``TruncatedSVD``.
        kmeans_kwargs: Keyword arguments for ``KMeans``.
        tsne_kwargs: Keyword arguments for ``TSNE``.
        random_state: Seed given to ``TruncatedSVD``, ``KMeans`` and ``TSNE``.

    Returns:
        The assembled (not yet fitted) ``Pipeline``.
    """
    preprocessing_steps = [
        ("formatter", FormatText()),
        ("vectorizer", TfidfVectorizer(**vectorizer_kwargs)),
        ("svd", TruncatedSVD(**svd_kwargs, random_state=random_state)),
        ("normalizer", Normalizer(copy=False)),
    ]
    clusterer = KMeans(**kmeans_kwargs, random_state=random_state)
    embedder = TSNE(**tsne_kwargs, random_state=random_state)

    return Pipeline(
        [
            ("preprocessing", Pipeline(preprocessing_steps)),
            ("kmeans", clusterer),
            ("tsne", embedder),
        ]
    )


def plot_tsne_results(X, vectorizer_kwargs, svd_kwargs, kmeans_kwargs, tsne_kwargs, random_state=0):
    """Scatter-plot the t-SNE embedding of ``X`` coloured by KMeans cluster.

    Builds the pipeline with ``tsne_pipeline``, fits it on ``X`` and plots the
    two embedding columns with one hue per cluster label.

    Side effect: calls ``sns.set`` to change the global seaborn figure size.

    Args:
        X: Raw documents to cluster and embed.
        vectorizer_kwargs: Keyword arguments for ``TfidfVectorizer``.
        svd_kwargs: Keyword arguments for ``TruncatedSVD``.
        kmeans_kwargs: Keyword arguments for ``KMeans``. ``n_clusters`` may
            be omitted; the palette is sized from the estimator itself.
        tsne_kwargs: Keyword arguments for ``TSNE``.
        random_state: Seed forwarded to ``tsne_pipeline``.

    Returns:
        The matplotlib ``Axes`` returned by ``sns.scatterplot``.
    """
    sns.set(rc={"figure.figsize": (13, 9)})

    pipeline = tsne_pipeline(
        vectorizer_kwargs=vectorizer_kwargs,
        svd_kwargs=svd_kwargs,
        kmeans_kwargs=kmeans_kwargs,
        tsne_kwargs=tsne_kwargs,
        random_state=random_state
    )
    X_tsne = pipeline.fit_transform(X)

    kmeans = pipeline["kmeans"]
    # Size the palette from the estimator rather than from the kwargs dict, so
    # omitting "n_clusters" (KMeans then uses its default) no longer passes
    # None to hls_palette.
    palette = sns.hls_palette(kmeans.n_clusters, l=0.4, s=0.9)
    # labels_ already holds the training-set assignments computed during the
    # fit, so X does not need to go through preprocessing and predict again.
    kmeans_labels = kmeans.labels_

    axes = sns.scatterplot(
        x=X_tsne[:, 0], y=X_tsne[:, 1], hue=kmeans_labels, legend="full", palette=palette
    )
    plt.title("t-SNE with Kmeans Labels")
    return axes


In [None]:
# t-SNE view of the training split with 30 KMeans clusters.
axes = plot_tsne_results(
    X_train,
    vectorizer_kwargs=dict(
        tokenizer=Tokenizer(user_stopwords=STOP_WORDS, language="english"),
        max_df=0.95,
        min_df=3,
        ngram_range=(1, 1),
        analyzer="word",
    ),
    svd_kwargs=dict(n_components=100),
    kmeans_kwargs=dict(n_clusters=30),
    tsne_kwargs=dict(perplexity=50, init="random", learning_rate="auto"),
    random_state=RANDOM_STATE,
)

## Cluster Labelling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from src.model import LDACluster

In [None]:
def lda_pipeline(
    X, n_clusters, user_stopwords, vectorizer_kwargs, lda_kwargs, random_state
):
    """Build the unfitted cluster-labelling pipeline.

    Named steps: ``"formatter"`` (``FormatText``), ``"vectorizer"``
    (``Vectorizer`` configured with ``vectorizer="counts"``) and
    ``"labeller"`` (``LDACluster``).

    NOTE(review): ``Vectorizer`` is not imported in any visible cell; only
    ``FormatText``, ``Tokenizer`` and ``LDACluster`` are. Confirm where it
    lives and import it, otherwise this raises ``NameError``.

    Args:
        X: Accepted for signature compatibility; not used here.
        n_clusters: Forwarded to ``LDACluster``.
        user_stopwords: Forwarded to ``Vectorizer``.
        vectorizer_kwargs: Forwarded to ``Vectorizer``.
        lda_kwargs: Forwarded to ``LDACluster``.
        random_state: Forwarded to ``LDACluster``.

    Returns:
        The assembled (not yet fitted) ``Pipeline``.
    """
    steps = [
        ("formatter", FormatText()),
        (
            "vectorizer",
            Vectorizer(
                vectorizer="counts",
                vectorizer_kwargs=vectorizer_kwargs,
                user_stopwords=user_stopwords,
            ),
        ),
        (
            "labeller",
            LDACluster(
                n_clusters=n_clusters,
                random_state=random_state,
                lda_kwargs=lda_kwargs,
            ),
        ),
    ]
    return Pipeline(steps)


def lda_labels(pipeline, words_per_topic):
    """Collect keyword labels for every cluster of a fitted labelling pipeline.

    For each entry of ``pipeline["labeller"].labellers``, the
    ``words_per_topic`` highest-weighted features of every row of its
    ``components_`` are looked up in the vectorizer's feature names, and the
    de-duplicated, sorted union of those words is kept.

    Args:
        pipeline: Fitted pipeline exposing the ``"vectorizer"`` and
            ``"labeller"`` steps.
        words_per_topic: Number of top-weighted words taken from each topic.

    Returns:
        A list with one array of unique keywords per cluster labeller.
    """
    vocabulary = pipeline["vectorizer"].vectorizer.get_feature_names_out()

    labels = []
    for cluster_model in pipeline["labeller"].labellers:
        # One list of top words per topic, strongest weight first.
        top_words = [
            [vocabulary[j] for j in np.argsort(weights)[::-1][:words_per_topic]]
            for weights in cluster_model.components_
        ]
        # np.unique flattens the per-topic lists and sorts the unique words.
        labels.append(np.unique(top_words))
    return labels


def get_lda_labels(
    X,
    words_per_topic,
    user_stopwords=None,
    vectorizer_kwargs=None,
    pca_kwargs=None,
    kmeans_kwargs=None,
    lda_kwargs=None,
    random_state=0,
):
    """Cluster ``X`` with KMeans, then derive keyword labels per cluster.

    Steps:

    1. Fit the ``kmeans_pipeline`` on ``X`` to get a cluster id per document.
    2. Fit the ``lda_pipeline`` on ``X`` with those ids as ``y``.
    3. Extract the top words of every topic with ``lda_labels``.

    Args:
        X: Raw documents.
        words_per_topic: Number of top words taken from each topic.
        user_stopwords: Extra stop words. They are given to the labelling
            vectorizer and, through a ``Tokenizer``, to the clustering
            TF-IDF vectorizer. Defaults to an empty list.
        vectorizer_kwargs: Keyword arguments shared by both vectorizers. An
            explicit ``"tokenizer"`` entry takes precedence over the
            stop-word tokenizer for the clustering step.
        pca_kwargs: Keyword arguments for ``TruncatedSVD``.
        kmeans_kwargs: Keyword arguments for ``KMeans``.
        lda_kwargs: Forwarded to ``LDACluster``.
        random_state: Seed used by both pipelines.

    Returns:
        The list produced by ``lda_labels``: one keyword array per cluster.
    """
    user_stopwords = [] if user_stopwords is None else user_stopwords
    vectorizer_kwargs = {} if vectorizer_kwargs is None else vectorizer_kwargs
    pca_kwargs = {} if pca_kwargs is None else pca_kwargs
    kmeans_kwargs = {} if kmeans_kwargs is None else kmeans_kwargs
    lda_kwargs = {} if lda_kwargs is None else lda_kwargs

    # kmeans_pipeline has no `user_stopwords` parameter (passing one raises
    # TypeError). Supply the stop words through a Tokenizer instead, the same
    # way the elbow and t-SNE cells configure their TfidfVectorizer.
    tfidf_kwargs = {
        "tokenizer": Tokenizer(user_stopwords=user_stopwords, language="english"),
        **vectorizer_kwargs,
    }
    clustering_pipeline = kmeans_pipeline(
        vectorizer_kwargs=tfidf_kwargs,
        svd_kwargs=pca_kwargs,
        kmeans_kwargs=kmeans_kwargs,
        random_state=random_state,
    )

    # Read the cluster count from the estimator so it is still correct when
    # "n_clusters" is omitted and KMeans uses its own default.
    n_clusters = clustering_pipeline["kmeans"].n_clusters

    labelling_pipeline = lda_pipeline(
        X,
        n_clusters=n_clusters,
        user_stopwords=user_stopwords,
        vectorizer_kwargs=vectorizer_kwargs,
        lda_kwargs=lda_kwargs,
        random_state=random_state,
    )

    y = clustering_pipeline.fit_predict(X)
    labelling_pipeline.fit(X, y)
    return lda_labels(labelling_pipeline, words_per_topic)


In [None]:
# Keyword labels for 20 KMeans clusters on the training split, taking the
# top 5 words from each of 5 LDA topics per cluster.
labels = get_lda_labels(
    X_train,
    words_per_topic=5,
    user_stopwords=STOP_WORDS,
    vectorizer_kwargs=dict(max_df=0.95, min_df=3, ngram_range=(1, 1)),
    pca_kwargs=dict(n_components=100),
    kmeans_kwargs=dict(n_clusters=20),
    lda_kwargs=dict(n_components=5, learning_method="online"),
)
