# Clustering Papers based on contents of their abstracts

Author: Rafael Ballestiero

In [None]:
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Stretch the notebook's `.container` element to the full browser width.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

# Global matplotlib defaults applied to every figure created below.
plt.rcParams['figure.figsize']=[50,30]
plt.rcParams['font.size']=22
plt.rcParams['font.weight']='bold'
plt.rcParams['axes.titlesize'] = 28
plt.rcParams['axes.labelsize'] = 24

# NOTE(review): newer matplotlib releases renamed this style to
# 'seaborn-v0_8-whitegrid' - confirm against the installed version.
plt.style.use('seaborn-whitegrid')

## Data Cleaning

Construct preprocessed abstracts with custom filters.

In [None]:
from gensim.parsing import preprocessing
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, \
                                         strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text

from gensim.utils import has_pattern

import pattern.en as en

# Stop here unless gensim reports that the `pattern` package is usable
# (its `pattern.en` module is what `lemmatize` relies on).
assert has_pattern()

In [None]:
# Load the corpus table (the first CSV column becomes the index) and drop rows without an abstract.
df = pd.read_csv("./data/text_allyears.csv", header=0, index_col=0).dropna(subset=["Abstract"])

In [None]:
abbreviations = ["pss", "iot"]
def lemmatize(s):
    """Lemmatize each whitespace-separated word of `s` with `pattern.en`.

    Words listed in `abbreviations` are passed through unchanged. The result
    is the processed words joined by single spaces.
    """
    lemmas = []
    for word in s.split():
        if word in abbreviations:
            lemmas.append(word)
        else:
            lemmas.append(en.lemma(word))
    return " ".join(lemmas)

# Curly quotes, the em dash and the question mark. A raw-string character
# class avoids the invalid "\?" escape sequence of a plain string literal.
cp1252_pattern = re.compile(r"[“”’‘—?]")
def strip_cp1252_punctuation(s):
    """Return `s` with each curly quote, em dash and question mark replaced by a space."""
    return cp1252_pattern.sub(" ", s)

first_exclusion_common_terms = ["service", "innovation", "design", "customer", "services", "research", "study", "paper"]
second_exclusion_common_terms = ["service", "services", "research", "study", "paper", "result", "based", "literature", "article", "focus"]
def remove_common_terms(s, exclusion_terms=second_exclusion_common_terms):
    """Drop every whitespace-separated word of `s` that appears in `exclusion_terms`.

    The surviving words are returned joined by single spaces.
    """
    kept = []
    for token in s.split():
        if token in exclusion_terms:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
def abstract_preprocessing(df, name="Abstract"):
    """Run the cleaning pipeline over the "Abstract" column of `df`.

    Each abstract is cast to `str` and passed through gensim's
    `preprocess_string` with the filters below, applied in order. The
    resulting Series is re-indexed from 0 and renamed to `name`; note that
    `name` only labels the output, the input column is always "Abstract".
    """
    cleaning_filters = [
        str.lower,
        strip_tags,
        strip_cp1252_punctuation,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
        remove_stopwords,
        strip_short,
        lemmatize,
        remove_common_terms,
    ]

    raw_text = df["Abstract"].apply(str)
    tokenized = raw_text.apply(preprocess_string, filters=cleaning_filters)

    return tokenized.reset_index(drop=True).rename(name)

In [None]:
# Cleaned abstracts from `df`, re-indexed from 0 (see abstract_preprocessing).
preprocessed_abstracts = abstract_preprocessing(df)

## Clustering Methods

In [None]:
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as shc

In [None]:
def agglomerative_cluster(X, n_clusters):
    """Ward-linkage hierarchical clustering cut into at most `n_clusters` groups.

    `X` is handed to `scipy.cluster.hierarchy.linkage` as is. The returned
    labels are shifted to start at 0 (fcluster numbers clusters from 1).
    """
    ward_tree = shc.linkage(X, method='ward')
    one_based_labels = shc.fcluster(ward_tree, n_clusters, criterion='maxclust')
    return one_based_labels - 1

def k_means_cluster(X, n_clusters, random_state=0):
    """Fit k-means on `X` and return the cluster label of every sample.

    `random_state` seeds the centroid initialisation so repeated calls give
    the same labels.
    """
    # `n_jobs` is deliberately not passed: scikit-learn deprecated it for
    # KMeans in 0.23 and removed it in 1.0, where it raises a TypeError.
    km_model = KMeans(n_clusters=n_clusters, random_state=random_state)
    km_model.fit(X)
    return km_model.labels_

def cluster_algo_name(func):
    """Return the short label of a known clustering function, or None otherwise."""
    known_algorithms = (
        (agglomerative_cluster, "agglomerative"),
        (k_means_cluster, "k_means"),
    )
    for candidate, label in known_algorithms:
        if func == candidate:
            return label
    return None

## Cluster Evaluation

### CH/DB score

In [None]:
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

def univariate_method_evaluation(X, name, n_clusters_limit, score_metric, score_metric_name, agglomerative):
    """Plot a clustering score against k and save the figure as a PDF.

    k runs from 2 up to, but not including, `n_clusters_limit`. With
    `agglomerative` set, `X` is clustered by `agglomerative_cluster` and
    scored on `squareform(X)`; otherwise `X` is clustered by
    `k_means_cluster` and scored on `X.toarray()`. The plot is written to
    ``plots/<name>/<score_metric_name>_scores.pdf``.

    NOTE(review): in the agglomerative branch the metric receives the
    squareform-expanded matrix in the position of a feature matrix -
    confirm this is intended.
    """
    assert n_clusters_limit >= 2

    candidate_ks = range(2, n_clusters_limit)

    plt.title(f'{score_metric_name} Scores (max_k={n_clusters_limit})')

    # Pick the algorithm and the representation the metric is scored on.
    if not agglomerative:
        X_n = X.toarray()
        cluster_algo = k_means_cluster
    else:
        X_n = spatial.distance.squareform(X)
        cluster_algo = agglomerative_cluster

    scores = [score_metric(X_n, cluster_algo(X, k)) for k in candidate_ks]

    plt.plot(candidate_ks, scores, label=name)

    plt.legend()
    plt.savefig(f'plots/{name}/{score_metric_name}_scores.pdf')
    plt.show()
    
def calinski_harabasz_evaluation(X, name, n_clusters_limit, agglomerative=False):
    """Plot Calinski-Harabasz scores for k in [2, n_clusters_limit)."""
    univariate_method_evaluation(
        X,
        name,
        n_clusters_limit,
        score_metric=calinski_harabasz_score,
        score_metric_name="CalinskiHarabasz",
        agglomerative=agglomerative,
    )
    
def davies_bouldin_evaluation(X, name, n_clusters_limit, agglomerative=False):
    """Plot Davies-Bouldin scores for k in [2, n_clusters_limit)."""
    univariate_method_evaluation(
        X,
        name,
        n_clusters_limit,
        score_metric=davies_bouldin_score,
        score_metric_name="DaviesBouldin",
        agglomerative=agglomerative,
    )

### Silhouette

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

def silhouette_evaluation(X, n_clusters, labels, name=None, cluster_algo=k_means_cluster, config_name="default", squareform=False):
    """Cluster `X`, draw the per-sample silhouette plot and save it as a PDF.

    Parameters
    ----------
    X : feature matrix, or a condensed distance vector when `squareform`
        is True.
    n_clusters : number of clusters requested from `cluster_algo`.
    labels : legacy positional slot. It is never used as cluster labels
        (those are always computed by `cluster_algo`). When `name` is
        omitted, this value is taken as the plot name, which supports the
        three-positional call style ``(X, k, "tfidf", ...)``.
    name : sub-directory of ``plots/`` the figure is written to.
    cluster_algo : callable ``(X, n_clusters) -> labels``.
    config_name : suffix used in the title and the output file name.
    squareform : when True, `X` is expanded to a square matrix after
        clustering and scored as a precomputed distance matrix.

    The figure is saved to
    ``plots/<name>/<n_clusters>/silhouette_<config_name>.pdf``.
    """
    if name is None:
        # Called as (X, n_clusters, name, ...): the third argument is the name.
        name = labels

    fig, ax = plt.subplots()

    cluster_labels = cluster_algo(X, n_clusters)

    metric = "euclidean"
    if squareform:
        X = spatial.distance.squareform(X)
        # X now holds pairwise distances, not features, so the silhouette
        # must not recompute Euclidean distances between its rows.
        metric = "precomputed"

    samples = silhouette_samples(X, cluster_labels, metric=metric)
    # The silhouette score is the mean of the per-sample coefficients, so
    # there is no need to compute all pairwise distances a second time.
    silhouette_avg = np.mean(samples)

    max_silhouette_score = np.max(samples)

    y_lower = 10
    for i in range(0, n_clusters):
        cluster_silhouette_scores = samples[cluster_labels == i]
        cluster_silhouette_scores.sort()

        cluster_size = cluster_silhouette_scores.shape[0]
        y_upper = y_lower + cluster_size

        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_scores)

        # Cluster id, placed left of the zero line at the band's mid-height.
        ax.text(-0.1 * max_silhouette_score, y_lower + 0.5 * cluster_size, str(i))

        y_lower = y_upper + 10  # leave a 10-unit gap before the next band

    plt.title(f'Silhouette Graph (k={n_clusters}) - {name} - {config_name}')

    ax.set_yticks([])
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.text(silhouette_avg + 0.01,20,f'silhouette_avg={np.round(silhouette_avg, 4)}')
    plt.savefig(f'plots/{name}/{n_clusters}/silhouette_{config_name}.pdf')
    plt.show()

#### TSNE

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

def tsne_evaluation(X, n_clusters, name, random_state=0, config_name="default", **kwargs):
    """Scatter-plot a 2-d t-SNE embedding of `X`, coloured by k-means cluster.

    `X` is first reduced to `n_clusters` components with TruncatedSVD and
    then embedded in two dimensions with t-SNE; extra keyword arguments are
    forwarded to `TSNE`. `random_state` seeds k-means, the SVD and t-SNE.
    The figure is saved to
    ``plots/<name>/<n_clusters>/tsne_<config_name>.pdf``.
    """
    # Forward the seed so a non-default random_state also governs the labels.
    labels = k_means_cluster(X, n_clusters, random_state=random_state)
    X_truncated = TruncatedSVD(n_components=n_clusters, random_state=random_state).fit_transform(X)
    X_repr = TSNE(n_components=2, random_state=random_state, **kwargs).fit_transform(X_truncated)

    plt.subplot()
    # x/y are passed by keyword: newer seaborn releases reject positional data.
    sns.scatterplot(x=X_repr[:, 0], y=X_repr[:, 1], s=1000, hue=labels, palette="Set3", legend="full")
    plt.title(f'TSNE 2-d Representation (k={n_clusters}) - {name} - {config_name}')

    plt.savefig(f'plots/{name}/{n_clusters}/tsne_{config_name}.pdf')
    plt.show()

## Cluster Themes

### TextRank

In [None]:
from gensim.summarization import keywords

def stemmed_keyword_mean_score(keywords):
    """Average the scores of keywords that share the same (double) stem.

    `keywords` is an iterable of ``(keyword, score)`` pairs. Keywords are
    grouped by ``stem_text(stem_text(keyword))``; each group is represented
    by the first keyword seen for it. Returns ``(keyword, mean_score)``
    pairs sorted by score, then keyword, in descending order.
    """
    representative = {}
    scores_by_stem = {}

    for kwd, score in keywords:
        stem = stem_text(stem_text(kwd))

        if stem not in scores_by_stem:
            representative[stem] = kwd

        scores_by_stem.setdefault(stem, []).append(score)

    mean_scores = {}
    for stem, scores in scores_by_stem.items():
        mean_scores[representative[stem]] = np.mean(scores)

    ranked = list(mean_scores.items())
    ranked.sort(key=lambda item: (item[1], item[0]), reverse=True)
    return ranked

def text_rank_keyword_scores(clusters, word_count=6, debug=False, abstracts=preprocessed_abstracts, **kwds):
    """Extract TextRank keywords for every cluster.

    The token lists in `abstracts` are joined into strings, grouped by
    `clusters`, concatenated per group with ". " and passed to gensim's
    `keywords(..., scores=True, **kwds)`. For each cluster, keywords are
    taken in the order gensim returns them until `word_count` distinct
    (double-)stems have been collected; later keywords with an already seen
    stem are skipped. With `debug` set, every candidate is printed.

    Returns a DataFrame with columns "cluster", "keyword" and "score".
    """
    result = []

    scored_keywords = abstracts.apply(" ".join)\
                               .apply(lambda s: re.sub(u"–", " ", s))\
                               .groupby(clusters)\
                               .apply(". ".join)\
                               .apply(lambda x: keywords(x, scores=True, **kwds))

    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for group, kw_list in scored_keywords.items():
        stemmed_keywords_seen = set()

        for keyword, score in kw_list:
            if len(stemmed_keywords_seen) == word_count:
                break

            stemmed_keyword = stem_text(stem_text(keyword))

            if debug:
                print(", ".join([keyword, stemmed_keyword, str(score), str(stemmed_keywords_seen)]))

            if stemmed_keyword not in stemmed_keywords_seen:
                stemmed_keywords_seen.add(stemmed_keyword)
                result.append((group, keyword, score))

    return pd.DataFrame(result, columns=["cluster", "keyword", "score"])

### Tf-idf Transformer Scores

In [None]:
from gensim import corpora
from gensim.sklearn_api import TfIdfTransformer

def sum_tfidf_scores(dct, corpus, n=None):
    """Sum per-document scores of each term across `corpus`.

    `corpus` is an iterable of documents, each a sequence of
    ``(term_id, score)`` pairs; `dct` maps a term id to its term. Only the
    `n` highest-scoring pairs of each document contribute (all of them when
    `n` is None). Returns ``(term, total_score)`` pairs sorted by total,
    then term, in descending order.
    """
    def by_score_then_key(pair):
        return pair[1], pair[0]

    totals = {}

    for document in corpus:
        top_terms = sorted(document, key=by_score_then_key, reverse=True)[:n]
        for term_id, weight in top_terms:
            term = dct[term_id]
            totals[term] = totals.get(term, 0) + weight

    return sorted(totals.items(), key=by_score_then_key, reverse=True)

def tfidf_transformer_keyword_scores(clusters, word_count=6, debug=False, exclude_words=None, abstracts=preprocessed_abstracts, **kwds):
    """Rank each cluster's keywords by summed TF-IDF weight.

    A gensim `TfIdfTransformer` is fitted on the bag-of-words of all
    `abstracts`; each cluster's documents are then transformed and their
    per-term weights summed with `sum_tfidf_scores`. Terms found in
    `exclude_words` are skipped, and collection for a cluster stops once
    `word_count` keywords have been kept.

    `debug` and `**kwds` are accepted but unused.

    Returns a DataFrame with columns "cluster", "keyword" and "score".
    """
    # None instead of a shared mutable default list.
    if exclude_words is None:
        exclude_words = []

    dct = corpora.Dictionary(abstracts)
    model = TfIdfTransformer(dictionary=dct)

    # train model on all documents
    all_docs_corpus = abstracts.apply(dct.doc2bow).tolist()
    model.fit(all_docs_corpus)

    # create corpus per cluster
    cluster_corpus = abstracts.groupby(clusters).apply(lambda x: [dct.doc2bow(abstract) for abstract in x])

    result = []

    for cluster_id, corpus in cluster_corpus.items():
        tfidf_corpus = model.transform(corpus)

        words_in_cluster = 0
        for keyword, score in sum_tfidf_scores(dct, tfidf_corpus):
            if keyword in exclude_words:
                continue

            result.append((cluster_id, keyword, score))
            words_in_cluster += 1

            if words_in_cluster == word_count:
                break

    return pd.DataFrame(result, columns=["cluster", "keyword", "score"])

### Tf-idf Vectorizer Scores

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def sorted_keyword_scores(model):
    """Pair each feature name of `model` with its idf and sort ascending by (idf, name)."""
    term_idf_pairs = list(zip(model.get_feature_names(), model.idf_))
    term_idf_pairs.sort(key=lambda pair: (pair[1], pair[0]))
    return term_idf_pairs

# Fitted vectorizers, keyed by the name of the abstracts Series they were
# fitted on. Two different Series with the same name share one entry.
tfidf_vectorizer_keyword_scores_cache = {}

def tfidf_vectorizer_keyword_scores(clusters, word_count=6, abstracts=preprocessed_abstracts):
    """Rank each cluster's unigrams and bigrams by summed, idf-weighted TF-IDF.

    A `TfidfVectorizer` with ``ngram_range=(1, 2)`` is fitted on all
    `abstracts` (joined into strings) and cached under ``abstracts.name``.
    For every cluster, its abstracts are transformed, each column is
    multiplied by the model's idf vector and summed, and the `word_count`
    highest totals are kept.

    Returns a DataFrame with columns "cluster", "keyword" and "score".
    """
    result = []

    abstract_strings = abstracts.apply(" ".join)

    model = tfidf_vectorizer_keyword_scores_cache.get(abstracts.name)
    if model is None:
        model = TfidfVectorizer(tokenizer=None, ngram_range=(1, 2))
        model.fit(abstract_strings.values)

        tfidf_vectorizer_keyword_scores_cache[abstracts.name] = model

    # The vocabulary is fixed once fitted, so look the names up only once.
    feature_names = model.get_feature_names()

    for cluster_id, cluster_idx in abstracts.groupby(clusters).groups.items():
        # Distinct local names: the loop no longer rebinds the `abstracts`
        # parameter or shadows the module-level `sorted_keyword_scores`.
        cluster_strings = abstract_strings.loc[cluster_idx]

        cluster_tfidf = model.transform(cluster_strings.values)
        cluster_tfidf_df = pd.DataFrame.sparse.from_spmatrix(cluster_tfidf, columns=feature_names)

        ranked_scores = cluster_tfidf_df.mul(model.idf_).sum(0).sort_values(ascending=False)

        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        for keyword, score in ranked_scores.iloc[:word_count].items():
            result.append((cluster_id, keyword, score))

    return pd.DataFrame(result, columns=["cluster", "keyword", "score"])

### Plotting

In [None]:
def keyword_scores_algo_name(func):
    """Return the short label of a known keyword-scoring function, or None otherwise."""
    known_algorithms = (
        (text_rank_keyword_scores, "text_rank"),
        (tfidf_transformer_keyword_scores, "tfidf_transformer"),
        (tfidf_vectorizer_keyword_scores, "tfidf_vectorizer"),
    )
    for candidate, label in known_algorithms:
        if func == candidate:
            return label
    return None
    
def plot_keyword_scores(X_scores, n_clusters, name, keyword_algo, config_name):
    """Draw one keyword/score bar chart per cluster and save the figure as a PDF.

    `X_scores` is a DataFrame with columns "cluster", "keyword" and
    "score"; row i of the figure shows the rows whose "cluster" equals i.
    The figure is written to
    ``plots/<name>/<n_clusters>/keywords_<algo>_<config_name>.pdf`` where
    ``<algo>`` is ``keyword_scores_algo_name(keyword_algo)``.
    """
    algo_name = keyword_scores_algo_name(keyword_algo)

    # squeeze=False always returns a 2-d array of Axes, so a single-cluster
    # figure can be iterated like any other.
    fig, axes = plt.subplots(nrows=n_clusters, squeeze=False)

    fig.suptitle(f"Cluster Themes (k={n_clusters}) - {name} - {algo_name} - {config_name}", fontsize=35)
    fig.set_figheight(n_clusters * 5)

    for cluster_num, ax in enumerate(axes.ravel()):
        sns.barplot(
            x='keyword',
            y='score',
            data=X_scores[X_scores['cluster'] == cluster_num],
            ax=ax
        )

        ax.set_title(f"cluster={cluster_num}")
        ax.set_xlabel(None)
        ax.tick_params(axis='x', labelsize=40)

    fig.savefig(f'plots/{name}/{n_clusters}/keywords_{algo_name}_{config_name}.pdf', format='pdf')

    plt.show()

def keyword_evaluation(X, n_clusters, name, cluster_algo=k_means_cluster, keyword_algo=tfidf_transformer_keyword_scores, config_name="default", **kwds):
    """Cluster `X`, score keywords per cluster and plot them.

    Labels come from ``cluster_algo(X, n_clusters)``; they are passed,
    together with `**kwds`, to `keyword_algo`, and the resulting table is
    drawn by `plot_keyword_scores`.
    """
    keyword_table = keyword_algo(cluster_algo(X, n_clusters), **kwds)

    plot_keyword_scores(keyword_table, n_clusters, name, keyword_algo, config_name)

## Graph Synthesis

In [None]:
from PyPDF2 import PdfFileMerger

def merge_graphs(basename, pdfname, configname, n_cluster_limit=20):
    """Concatenate the per-k PDFs of one plot type into a single PDF.

    For k from 2 up to, but not including, `n_cluster_limit`, the file
    ``./plots/<basename>/<k>/<pdfname>_<configname>.pdf`` is appended, and
    the result is written to
    ``./plots/<basename>/all_<pdfname>_<configname>.pdf``.
    """
    merger = PdfFileMerger()

    try:
        for i in range(2, n_cluster_limit):
            pdf_path = f"./plots/{basename}/{i}/{pdfname}_{configname}.pdf"
            merger.append(pdf_path)

        merger.write(f"./plots/{basename}/all_{pdfname}_{configname}.pdf")
    finally:
        # Release the merger's file handles even if a page file is missing
        # or the write fails.
        merger.close()

# Perform Clustering

### Tf-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from nltk import word_tokenize, PorterStemmer

In [None]:
# Join each abstract's tokens back into one space-separated string for TfidfVectorizer.
preprocessed_abstracts_strings = preprocessed_abstracts.apply(" ".join)

In [None]:
stemmer = PorterStemmer()

def tokenizer(text):
    """Split `text` with NLTK's `word_tokenize` and Porter-stem every token."""
    return list(map(stemmer.stem, word_tokenize(text)))

#### Unigram

In [None]:
# TF-IDF over single stemmed tokens only.
unigram_model = TfidfVectorizer(
    ngram_range=(1, 1),
    tokenizer=tokenizer
)
# Fit the vocabulary and vectorise all abstracts in one step.
unigram_scores = unigram_model.fit_transform(preprocessed_abstracts_strings.values)

In [None]:
for i in range(2, 22):
    # One output directory per k. makedirs also creates ./plots/tfidf itself
    # when it is missing and does nothing if the directory already exists.
    os.makedirs(f"./plots/tfidf/{i}", exist_ok=True)

#     silhouette_evaluation(unigram_scores, i, "tfidf", config_name="only_unigrams")
    tsne_evaluation(unigram_scores, i, "tfidf", config_name="only_unigrams")
#     keyword_evaluation(unigram_scores, i, "tfidf", config_name="only_unigrams")
#     keyword_evaluation(unigram_scores, i, "tfidf", config_name="only_unigrams", keyword_algo=tfidf_vectorizer_keyword_scores)

In [None]:
# Combine the per-k PDFs (k = 2..21) into one file per plot type.
# Only the t-SNE plots are merged; the other plot types are disabled.
# merge_graphs("tfidf", "silhouette", "only_unigrams", n_cluster_limit=22)
merge_graphs("tfidf", "tsne", "only_unigrams", n_cluster_limit=22)
# merge_graphs("tfidf", "keywords", "tfidf_transformer_only_unigrams", n_cluster_limit=22)
# merge_graphs("tfidf", "keywords", "tfidf_vectorizer_only_unigrams", n_cluster_limit=22)
# merge_graphs("tfidf", "keywords", "text_rank_only_unigrams", n_cluster_limit=22)

In [None]:
# Score k-means clusterings of the unigram matrix for k = 2..29 (the limit of 30 is exclusive).
calinski_harabasz_evaluation(unigram_scores, "tfidf", 30)
davies_bouldin_evaluation(unigram_scores, "tfidf", 30)

#### Unigrams and Bigrams

In [None]:
# TF-IDF over stemmed unigrams and bigrams.
unigram_and_bigram_model = TfidfVectorizer(
    ngram_range=(1, 2), # search for unigrams and bigrams
    tokenizer=tokenizer
)

# Fit the vocabulary and vectorise all abstracts in one step.
unigram_and_bigram_scores = unigram_and_bigram_model.fit_transform(preprocessed_abstracts_strings.values)

In [None]:
for i in range(2, 20):
    # One output directory per k. makedirs also creates ./plots/tfidf itself
    # when it is missing and does nothing if the directory already exists.
    os.makedirs(f"./plots/tfidf/{i}", exist_ok=True)
#     silhouette_evaluation(unigram_and_bigram_scores, i, "tfidf", config_name="unigrams_bigrams")
    tsne_evaluation(unigram_and_bigram_scores, i, "tfidf", config_name="unigrams_bigrams")
#     keyword_evaluation(unigram_and_bigram_scores, i, "tfidf", config_name="unigrams_bigrams")
#     keyword_evaluation(unigram_and_bigram_scores, i, "tfidf", config_name="unigrams_bigrams", keyword_algo=tfidf_vectorizer_keyword_scores)

In [None]:
# Combine the per-k t-SNE PDFs; merge_graphs' default limit of 20 covers k = 2..19.
# merge_graphs("tfidf", "silhouette", "unigrams_bigrams")
merge_graphs("tfidf", "tsne", "unigrams_bigrams")
# merge_graphs("tfidf", "keywords", "tfidf_transformer_unigrams_bigrams")
# merge_graphs("tfidf", "keywords", "tfidf_vectorizer_unigrams_bigrams")

#### Multigrams

In [None]:
# TF-IDF over stemmed n-grams of length 1 to 5.
multigram_model = TfidfVectorizer(
    ngram_range=(1, 5),
    tokenizer=tokenizer
)

# Fit the vocabulary and vectorise all abstracts in one step.
multigram_scores = multigram_model.fit_transform(preprocessed_abstracts_strings.values)

In [None]:
for i in range(2, 20):
    # One output directory per k. makedirs also creates ./plots/tfidf itself
    # when it is missing and does nothing if the directory already exists.
    os.makedirs(f"./plots/tfidf/{i}", exist_ok=True)
#     silhouette_evaluation(multigram_scores, i, "tfidf", config_name="multigrams")
    tsne_evaluation(multigram_scores, i, "tfidf", config_name="multigrams")
#     keyword_evaluation(multigram_scores, i, "tfidf", config_name="multigrams")

In [None]:
# Combine the per-k t-SNE PDFs; merge_graphs' default limit of 20 covers k = 2..19.
# merge_graphs("tfidf", "silhouette", "multigrams")
merge_graphs("tfidf", "tsne", "multigrams")
# merge_graphs("tfidf", "keywords", "multigrams")

### Word Mover's Distance on GloVe Embeddings

In [None]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile

from scipy import spatial

In [None]:
def convert_glove_2_w2v():
    """Convert the 50-d GloVe file to word2vec format and load it as KeyedVectors.

    The converted copy is written to the path gensim's `get_tmpfile` returns
    for "test_word2vec.txt".
    """
    w2v_path = get_tmpfile("test_word2vec.txt")
    glove2word2vec("./data/glove.6B/glove.6B.50d.txt", w2v_path)

    vectors = KeyedVectors.load_word2vec_format(w2v_path)
    return vectors
    
# model = convert_glove_2_w2v()

In [None]:
def calculate_distance_matrix():
    """Build the symmetric matrix of pairwise `model.wmdistance` values.

    Reads the module-level `preprocessed_abstracts` and `model`. Only the
    upper triangle (diagonal included) is computed; each value is mirrored
    into the lower triangle. Progress and every distance are printed.
    """
    total = preprocessed_abstracts.size
    # Uninitialised buffer: every cell is assigned in the loops below.
    distances = np.ndarray((total, total))

    for row, first in enumerate(preprocessed_abstracts):
        print(f"({row}/{total}) Calculating distance for abstract...")
        remaining = preprocessed_abstracts.iloc[row:]
        for col, second in enumerate(remaining, start=row):
            wmd = model.wmdistance(first, second)
            distances[row][col] = wmd
            distances[col][row] = wmd
            print(f"D_({row}, {col})={wmd}")

    return distances

# abstract_distance_matrix = calculate_distance_matrix()

In [None]:
# store distance matrix (disabled: the matrix is loaded from the CSV below instead)
# pd.DataFrame(abstract_distance_matrix).to_csv('data/second_exclusion_distance_matrix.csv')
# load the previously stored distance matrix from disk (first CSV column is the index)
abstract_distance_matrix = pd.read_csv('data/second_exclusion_distance_matrix.csv', index_col=0).values
# squareform applied to a square matrix yields the condensed distance vector.
# NOTE: despite its name, abstract_df is therefore a NumPy array, not a DataFrame.
abstract_df = spatial.distance.squareform(abstract_distance_matrix)

#### Dendrogram

In [None]:
# Ward-linkage dendrogram over the condensed distance vector, saved as a PDF.
plt.title("All Years - Second Exlusion - Dendrogram")
dend = shc.dendrogram(shc.linkage(abstract_df, method='ward'))
plt.savefig('plots/wmd_glove/dendrogram.pdf', format='pdf')

#### Evaluation

In [None]:
for i in range(2, 12):
    # One output directory per k. makedirs also creates ./plots/wmd_glove
    # itself when it is missing and does nothing if the directory exists.
    os.makedirs(f"./plots/wmd_glove/{i}", exist_ok=True)
    silhouette_evaluation(abstract_df, i, "wmd_glove", cluster_algo=agglomerative_cluster, config_name="second_exclusion", squareform=True)
    keyword_evaluation(abstract_df, i, "wmd_glove", cluster_algo=agglomerative_cluster, config_name="second_exclusion")

### Second Exclusion

In [None]:
# Combine the per-k silhouette and keyword PDFs for k = 2..11 (the limit of 12 is exclusive).
merge_graphs("wmd_glove", "silhouette", "second_exclusion", n_cluster_limit=12)
merge_graphs("wmd_glove", "keywords", "tfidf_transformer_second_exclusion", n_cluster_limit=12)

In [None]:
# Score agglomerative clusterings of the condensed distances for k = 2..11 (the limit of 12 is exclusive).
calinski_harabasz_evaluation(abstract_df, "wmd_glove", 12, agglomerative=True)
davies_bouldin_evaluation(abstract_df, "wmd_glove", 12, agglomerative=True)

## Save Clusters

In [None]:
# Attach selected cluster assignments to the original table, one column per configuration.
# NOTE(review): the label arrays are assigned positionally, so this assumes
# the rows of df line up with the rows the clusterings were computed on - confirm.
df["wmd_glove_3"] = agglomerative_cluster(abstract_df, 3)
df["wmd_glove_5"] = agglomerative_cluster(abstract_df, 5)
# k-means on the unigram TF-IDF matrix for k = 12..21
for i in range(12, 22):
    df[f"tfidf_only_unigrams_{i}"] = k_means_cluster(unigram_scores, i)

In [None]:
# index=False: the DataFrame index (the first CSV column at load time) is not written out.
# NOTE(review): confirm that dropping it is intended.
df.to_csv('results/tfidf_unigrams.csv', index=False)

## Alternatives

### Topic Modeling

It seems that there is not enough data available in our dataset (only ~300 paragraphs) to provide interesting results for topic modeling algorithms.

In [None]:
from gensim.models import LdaMulticore

In [None]:
# Bag-of-words corpus and a 6-topic LDA model over the preprocessed abstracts.
# `Dictionary` is reached through `gensim.corpora`, the name this notebook
# imports; a bare `Dictionary` is never imported and would raise NameError.
dct = corpora.Dictionary(preprocessed_abstracts)
corpus = [dct.doc2bow(abstract) for abstract in preprocessed_abstracts]
lda = LdaMulticore(corpus, id2word=dct, num_topics=6)

In [None]:
# Display the fitted topics as the cell output.
lda.show_topics()

### People-Centric papers

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [None]:
# Load the people-centric abstracts table (the first CSV column becomes the index).
people_centric_df = pd.read_csv("./archive/people_centric/data/abstracts.csv", index_col=0)

In [None]:
# abstract_preprocessing always reads the "Abstract" column; `name` only labels the resulting Series.
people_centric_abstracts = abstract_preprocessing(people_centric_df, name="people_centric")

In [None]:
# TF-IDF over single stemmed tokens of the people-centric abstracts.
people_centric_model = TfidfVectorizer(
    ngram_range=(1, 1),
    tokenizer=tokenizer
)
# Token lists are joined back into strings before vectorising.
people_centric_scores = people_centric_model.fit_transform(people_centric_abstracts.apply(" ".join).values)

In [None]:
# t-SNE plots for k = 2..11 with the default config name.
# NOTE(review): directory creation is commented out below, so
# ./plots/people_centric/<k>/ presumably has to exist already for the
# figure to be saved - confirm.
for i in range(2, 12):
#     if str(i) not in os.listdir("./plots/people_centric"):
#         os.mkdir(f"./plots/people_centric/{i}")
#     silhouette_evaluation(people_centric_scores, i, "people_centric")
    tsne_evaluation(people_centric_scores, i, "people_centric")
#     keyword_evaluation(people_centric_scores, i, "people_centric", abstracts=people_centric_abstracts)
#     keyword_evaluation(people_centric_scores, i, "people_centric", abstracts=people_centric_abstracts, keyword_algo=tfidf_vectorizer_keyword_scores)
In [None]:
# Combine the per-k t-SNE PDFs for k = 2..11 (the limit of 12 is exclusive).
# merge_graphs("people_centric", "silhouette", "default", n_cluster_limit=12)
merge_graphs("people_centric", "tsne", "default", n_cluster_limit=12)
# merge_graphs("people_centric", "keywords", "tfidf_transformer_default", n_cluster_limit=12)
# merge_graphs("people_centric", "keywords", "tfidf_vectorizer_default", n_cluster_limit=12)

In [None]:
# Store the 6-cluster k-means assignment (labels are assigned to rows by position).
people_centric_df["cluster_6"] = k_means_cluster(people_centric_scores, 6)

In [None]:
# index=False: the DataFrame index (the first CSV column at load time) is not written out.
# NOTE(review): confirm that dropping it is intended.
people_centric_df.to_csv("./results/people_centric_clusters.csv", index=False)