# Clustering Papers based on contents of their abstracts

Author: Rafael Ballestiero

In [306]:
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from IPython.core.display import display, HTML
# Inject a CSS rule that makes the notebook's `.container` element use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
%matplotlib inline

# Global matplotlib defaults: a very large canvas with bold, oversized text.
plt.rcParams['figure.figsize']=[50,30]
plt.rcParams['font.size']=22
plt.rcParams['font.weight']='bold'
plt.rcParams['axes.titlesize'] = 28
plt.rcParams['axes.labelsize'] = 24

# Switch to the 'seaborn-whitegrid' style sheet.
plt.style.use('seaborn-whitegrid')

## Data Cleaning

Construct preprocessed abstracts with custom filters.

In [None]:
from gensim.parsing import preprocessing
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, \
                                         strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text

from gensim.utils import has_pattern

import pattern.en as en

# Stop here unless gensim reports that the `pattern` package is available (lemmatize below uses pattern.en).
assert has_pattern()

In [None]:
# Load the paper table (first CSV column becomes the index) and drop rows without an abstract.
df = pd.read_csv("./data/text_allyears.csv", header=0, index_col=0).dropna(subset=["Abstract"])

### Remove Duplicate Rows

In [None]:
# Fragments of the "Reference" text that identify the rows to be treated as duplicates.
duplicate_references = ["Tong, C., Nagarajan, M., & Cheng, Y. (2016). Operational impact of service innovations in ",  "Easton, F. F., & Pullman, M. E. (2001). Optimizing service attributes: The seller\'s utility problem", "Customer efficiency, channel usage, and firm performance in retail banking"]

# For every fragment, remember the index label of the FIRST row whose reference contains it
# (raises IndexError if no row matches).
bad_indexes = []
for ref in duplicate_references:
    bad_indexes.append(df[df["Reference"].apply(lambda x: ref in x)].index[0])
    
# One-off clean-up that rewrote the CSV without those rows; kept commented out.
# df.drop(bad_indexes).reset_index(drop=True).to_csv("./data/text_allyears.csv")

### Clean References

In [None]:
# Skips everything up to a "(YYYY)" year marker, then captures the text that
# follows it up to and including the first period (the title).
# Raw string: the pattern contains backslash escapes that are invalid in a
# normal string literal.
title_pattern = re.compile(r"[^\d]*\([0-9]{4}\)\.?\s*([^\.]+\.?).*")


def parse_title(reference):
    """Extract the title from a citation string.

    Args:
        reference: citation text containing a four-digit year in parentheses
            followed by the title.

    Returns:
        The text captured after the year, ending at the first period (the
        period is included when present).

    Raises:
        ValueError: if ``reference`` does not match ``title_pattern``. The
            offending reference is included in the message.
    """
    match = title_pattern.match(reference)

    if match is None:
        raise ValueError(f"Bad pattern: {reference!r}")

    return match.group(1)

def reference_preprocessing(df):
    """Return a Series holding the title parsed from each row's ``Reference``."""
    references = df["Reference"]
    return references.apply(parse_title)

In [None]:
# title before year
df.loc[12, "Reference"] = "(2002). Director's forum. Laboratory Equipment, 38(12), 8. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=6395811&site=ehost-live"

# 2.0 cuts title short
df.loc[30, "Reference"] = 'Mohamed, A. (2007). Switch to web 2-0 boosts business agility for internet services firm. Computer Weekly, , 12-12. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=25040565&site=ehost-live'

# title before year
df.loc[32, "Reference"] = '(2007). Translucent green: Environmentally-friendly manufacturing processes are key concern of retailers and brands. Textile World, 157(6), 49-49. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=27662701&site=ehost-live'

# title before year
df.loc[38, "Reference"] = '(2008). Spiegel expands use of yunique software. Textile World, 158(4), 56-56. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=33767368&site=ehost-live'

# title before year
df.loc[47, "Reference"] = '(2009). Service experience and service design: Concepts and application in tourism SMEs. Managing Service Quality, 19(3), 332-349. doi:10.1108/09604520910955339 Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=39983391&site=ehost-live'

# no period to finish title
df.loc[124, "Reference"] = 'Comerio, M., Batini, C., Castelli, M., Grega, S., Rossetti, M., & Viscusi, G. (2015). Service portfolio management: A repository-based framework. doi:10.1016/j.jss.2015.01.055'

# title before year
df.loc[148, "Reference"] = "(2016). How to create a 'lights out' customer experience. Ivey Business Journal, , 5-7. Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=113229191&site=ehost-live"

# no period to finish title
df.loc[155, "Reference"] = 'Peng, K., & Lin, P. M. C. (2016). Social entrepreneurs: Innovating rural tourism through the activism of service science. doi:10.1108/IJCHM-12-2014-0611'

# no period to finish title
df.loc[205, "Reference"] = 'Woo, J. (2017). How Chinese commercial banks innovate: process and practice. Journal of Innovation Management, 5(2), 81-110.'

# title before year
df.loc[211, "Reference"] = '(2018). Achieving competitive advantage. Strategic Direction, 34(10), 25-27. doi:10.1108/SD-06-2018-0145 Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=132294917&site=ehost-live'

# 4.0 cuts title short
df.loc[228, "Reference"] = 'MOHELSKA, H., & SOKOLOVA, M. (2018). Management approaches for industry 4-0 - the organizational culture perspective. Technological & Economic Development of Economy, 24(6), 2225-2240. doi:10.3846/tede.2018.6397 Retrieved from http://ezproxy.insead.edu:80/login?url=https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=133233122&site=ehost-live'

# df.to_csv("./data/text_allyears.csv")

### Preprocess Abstracts

In [None]:
from nltk import word_tokenize, PorterStemmer

abbreviations = set(["pss", "iot", "abs", "business", "exs", "ict", "npd", "bmi", "iot", "cem", "sst", "ic", "qos", "oi", "om", "psf", "ai", "bm", "bo", "mc", "mosp", "msd", "pssldm", "fcbpss", "ffe", "fmea", "fo", "iis", "sc", "sdl", "si", "sp", "vsm", "xe", "cad", "cdf", "clscs", "cmm", "cx", "ks", "odf", "sspss", "bma", "bpm", "bsc", "exs", "fof", "kibs", "lbd", "lo", "moss", "plm", "pnss", "prs", "qfd", "sem", "som", "sp", "acm", "adkar", "catwoe", "cc", "ces", "cis", "cit", "clv", "cxm", "dfsi", "dsic", "fepss", "fsqca", "ilp", "moa", "mosc", "ri", "rpn", "rrs", "rsp", "scm", "slr", "sna", "spesa", "spss", "ssm", "sta", "tr", "vo", "wips"])
def lemmatize(s):
    return " ".join([en.lemma(w) if w not in abbreviations else w for w in s.split()])

# Curly double/single quotes, em dash, en dash and the literal question mark.
# Written as a raw character class: the previous non-raw literal relied on the
# invalid string escape "\?" and listed the en dash twice.
cp1252_pattern = re.compile(r"[“”’‘—–?]")


def strip_cp1252_punctuation(s):
    """Replace every character matched by ``cp1252_pattern`` in ``s`` with a space."""
    return cp1252_pattern.sub(" ", s)

# Two alternative lists of corpus-wide terms to drop from the abstracts.
first_exclusion_common_terms = [
    "service", "innovation", "design", "customer",
    "services", "research", "study", "paper",
]
second_exclusion_common_terms = [
    "service", "services", "research", "study", "paper",
    "result", "based", "literature", "article", "focus",
]


def remove_common_terms(s, exclusion_terms):
    """Return ``s`` without the whitespace-separated words found in ``exclusion_terms``."""
    kept = filter(lambda word: word not in exclusion_terms, s.split())
    return " ".join(kept)

# Maps a spelling/term variant to the single form used throughout the corpus.
synonyms = {"servitization": "servitisation", "consumer": "customer"}


def standardize_synonyms(s):
    """Replace each word of ``s`` that is a key of ``synonyms`` by its mapped form."""
    words = s.split()
    for position, word in enumerate(words):
        if word in synonyms:
            words[position] = synonyms[word]
    return " ".join(words)

# Shared Porter stemmer instance.
stemmer = PorterStemmer()


def tokenizer(text):
    """Tokenize ``text`` with ``word_tokenize`` and stem every non-abbreviation token."""
    tokens = []
    for token in word_tokenize(text):
        tokens.append(token if token in abbreviations else stemmer.stem(token))
    return tokens

def build_stem_to_token(iterator):
    """Map each stem to the most frequent surface token that produces it.

    The previous implementation recursed once per document, so a corpus with
    roughly a thousand documents would hit Python's recursion limit. This
    version consumes the documents with a plain loop.

    Args:
        iterator: iterable of ``(key, text)`` pairs; only ``text`` is used.

    Returns:
        dict mapping ``stemmer.stem(token)`` to a token. Tokens listed in
        ``abbreviations`` are counted but never stemmed or stored.
    """
    frequency_map = {}
    stem_to_token = {}

    for _, text in iterator:
        for token in word_tokenize(text):
            frequency_map[token] = frequency_map.get(token, 0) + 1

            if token in abbreviations:
                continue

            stem = stemmer.stem(token)
            current = stem_to_token.get(stem)

            # keep the token if the stem is new, or if this token has now been
            # seen more often than the token currently stored for the stem
            if current is None or frequency_map[token] > frequency_map[current]:
                stem_to_token[stem] = token

    return stem_to_token
    

In [None]:
def abstract_preprocessing(df, dfname="Abstract", include_title=False, exclusion_terms=second_exclusion_common_terms):
    """Run the cleaning pipeline over the ``Abstract`` column of ``df``.

    Args:
        df: frame with an ``Abstract`` column (and a ``Reference`` column when
            ``include_title`` is true).
        dfname: name given to the returned Series.
        include_title: prepend the title parsed from ``Reference`` to each abstract.
        exclusion_terms: words removed after lemmatization.

    Returns:
        Series (index reset, named ``dfname``) with the output of gensim's
        ``preprocess_string`` for every row.
    """
    abstracts = df["Abstract"]
    raw_data = reference_preprocessing(df) + " " + abstracts if include_title else abstracts

    def drop_common_terms(text):
        return remove_common_terms(text, exclusion_terms)

    # The filters are applied in this exact order.
    filters = [
        str.lower,
        strip_tags,
        strip_cp1252_punctuation,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
        remove_stopwords,
        strip_short,
        lemmatize,
        drop_common_terms,
        standardize_synonyms,
    ]

    processed = raw_data.apply(str).apply(preprocess_string, filters=filters)
    return processed.reset_index(drop=True).rename(dfname)

In [None]:
# Preprocess title + abstract of every paper with the default (second) exclusion list.
preprocessed_abstracts = abstract_preprocessing(df, include_title=True)

## Clustering Methods

In [304]:
from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as shc

In [None]:
def agglomerative_cluster(X, n_clusters, with_model=False):
    """Ward-linkage hierarchical clustering cut into at most ``n_clusters`` groups.

    Args:
        X: input accepted by ``scipy.cluster.hierarchy.linkage``.
        n_clusters: maximum number of flat clusters (``criterion='maxclust'``).
        with_model: when true, return ``(labels, None)`` so the result has the
            same shape as the other clustering helpers.

    Returns:
        Zero-based cluster labels, optionally paired with ``None``.
    """
    linkage_matrix = shc.linkage(X, method='ward')

    # fcluster numbers clusters from 1; shift them to start at 0
    labels = shc.fcluster(linkage_matrix, n_clusters, criterion='maxclust') - 1

    return (labels, None) if with_model else labels

def k_means_cluster(X, n_clusters, random_state=42, with_model=False, **kwargs):
    """Fit K-Means on ``X`` and return the labels.

    The estimator is created with ``n_init=50`` and ``n_jobs=-1``; extra keyword
    arguments are forwarded to ``KMeans``.

    Returns:
        The fitted labels, or ``(labels, fitted_model)`` when ``with_model`` is true.
    """
    km_model = KMeans(n_clusters=n_clusters, random_state=random_state, n_jobs=-1, n_init=50, **kwargs).fit(X)
    labels = km_model.labels_

    return (labels, km_model) if with_model else labels

def nmf_cluster(X, n_clusters, frobenius=True, random_state=42, with_model=False, **kwargs):
    """Cluster rows of ``X`` by their dominant NMF component.

    Args:
        X: matrix passed to ``NMF.fit_transform``.
        n_clusters: number of NMF components.
        frobenius: use the estimator's default loss when true; otherwise the
            Kullback-Leibler loss with the multiplicative-update solver and
            ``max_iter=1000``.
        random_state: seed forwarded to ``NMF``.
        with_model: also return the fitted ``NMF`` instance.
        **kwargs: forwarded to ``NMF``.

    Returns:
        Index of the largest component for every row, optionally paired with
        the fitted model.
    """
    options = dict(n_components=n_clusters, random_state=random_state, shuffle=True, alpha=.1)
    if not frobenius:
        options.update(beta_loss='kullback-leibler', solver='mu', max_iter=1000)

    nmf = NMF(**options, **kwargs)

    # one row per sample, one column per component
    weights = nmf.fit_transform(X)
    cluster_labels = weights.argmax(axis=1)

    return (cluster_labels, nmf) if with_model else cluster_labels
    

# Short identifier for each clustering helper, keyed by the function object itself.
cluster_name_map = {
    agglomerative_cluster: "agglomerative",
    k_means_cluster: "k_means",
    nmf_cluster: "nmf"
}

## Cluster Themes

### Tf-idf Transformer Scores

In [None]:
def sum_tfidf_scores(dct, corpus, n=None):
    """Accumulate per-keyword scores over all documents of ``corpus``.

    Args:
        dct: mapping from keyword id to keyword.
        corpus: iterable of documents, each an iterable of ``(keyword_id, score)``.
        n: if given, only the ``n`` highest-scoring entries of each document
            contribute; ``None`` uses every entry.

    Returns:
        List of ``(keyword, total_score)`` sorted by descending score, ties
        broken by descending keyword.
    """
    def by_score_then_key(pair):
        return (pair[1], pair[0])

    totals = {}
    for document in corpus:
        ranked = sorted(document, key=by_score_then_key, reverse=True)
        for kwd_id, tfidf_score in ranked[:n]:
            kwd = dct[kwd_id]
            totals[kwd] = totals.get(kwd, 0) + tfidf_score

    return sorted(totals.items(), key=by_score_then_key, reverse=True)

def tfidf_transformer_keyword_scores(clusters, word_count=6, debug=False, exclude_words=[], abstracts=preprocessed_abstracts, **kwds):
    """Rank the keywords of every cluster by their summed TF-IDF score.

    The TF-IDF model is fitted on all ``abstracts``; each cluster's documents
    are then transformed and their scores summed per keyword.

    Args:
        clusters: cluster label per abstract, used as the group-by key.
        word_count: stop after this many keywords have been kept for a cluster.
        debug: accepted but not used.
        exclude_words: keywords that are skipped.
        abstracts: Series of token lists.
        **kwds: accepted and ignored.

    Returns:
        DataFrame with columns ``cluster``, ``keyword`` and ``score``.
    """
    dictionary = corpora.Dictionary(abstracts)
    tfidf = TfIdfTransformer(dictionary=dictionary)

    # fit on the bag-of-words of every document
    bag_of_words = abstracts.apply(dictionary.doc2bow).tolist()
    tfidf.fit(bag_of_words)

    # one bag-of-words corpus per cluster label
    cluster_corpus = abstracts.groupby(clusters).apply(
        lambda group: [dictionary.doc2bow(abstract) for abstract in group]
    )

    rows = []
    for cluster_id, corpus in cluster_corpus.items():
        ranked = sum_tfidf_scores(dictionary, tfidf.transform(corpus))

        kept = 0
        for keyword, score in ranked:
            if keyword not in exclude_words:
                rows.append((cluster_id, keyword, score))
                kept += 1

                if kept == word_count:
                    break

    return pd.DataFrame(rows, columns=["cluster", "keyword", "score"])

def plot_keyword_scores(X_scores, n_clusters, name, keyword_algo=tfidf_transformer_keyword_scores, config_name="default", keyword_algo_name=None, savefig=True):
    """Draw one keyword/score bar chart per cluster, stacked vertically.

    Args:
        X_scores: DataFrame with columns ``cluster``, ``keyword`` and ``score``.
        n_clusters: number of clusters; clusters ``0 .. n_clusters - 1`` are drawn.
        name: first path component under ``plots/`` and part of the title.
        keyword_algo: scoring function, used only to look up its display name
            in ``keyword_name_map`` when ``keyword_algo_name`` is not given.
        config_name: second path component and part of the title.
        keyword_algo_name: explicit display name for the scoring method.
        savefig: write the figure to
            ``plots/{name}/{config_name}/{n_clusters}/keywords_{keyword_algo_name}.pdf``.

    Raises:
        KeyError: if ``keyword_algo_name`` is None and ``keyword_algo`` is not
            a key of ``keyword_name_map``.
    """
    # squeeze=False always yields a 2-D array of axes; without it a single
    # cluster returns a bare Axes object that cannot be iterated.
    fig, axes = plt.subplots(nrows=n_clusters, sharey=True, squeeze=False)

    if keyword_algo_name is None:
        keyword_algo_name = keyword_name_map[keyword_algo]

    fig.suptitle(f"Cluster Themes (k={n_clusters}) - {name} - {keyword_algo_name} - {config_name}", fontsize=35)
    fig.set_figheight(n_clusters * 5)

    for cluster_num, ax in enumerate(axes.ravel()):
        sns.barplot(
            x='keyword',
            y='score',
            data=X_scores[X_scores['cluster'] == cluster_num],
            ax=ax
        )

        ax.set_title(f"cluster={cluster_num}")
        ax.set_xlabel(None)
        ax.tick_params(axis='x', labelsize=40)

    if savefig:
        fig.savefig(f'plots/{name}/{config_name}/{n_clusters}/keywords_{keyword_algo_name}.pdf', format='pdf')

    plt.show()

In [None]:
# Display name of each keyword-scoring function; plot_keyword_scores uses it for titles and file names.
keyword_name_map = {
    tfidf_transformer_keyword_scores: "tfidf_transformer",
}

## Cluster Evaluator 

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabasz_score, davies_bouldin_score, pairwise_distances_argmin_min
from gensim import corpora
from gensim.sklearn_api import TfIdfTransformer
from gensim.summarization import keywords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from PyPDF2 import PdfFileMerger

class ClusterEvaluator:
    """Cluster one data matrix for several values of k and plot diagnostics.

    Labels (and fitted models, where the algorithm returns one) are computed
    lazily by ``cluster_algo`` and cached per ``k``. Plots are written below
    ``plots/{name}/{config_name}/``; those directories must already exist.
    """

    def __init__(self, X, name, model=None, cluster_algo=k_means_cluster, config_name="default", abstracts=preprocessed_abstracts):
        """Store the inputs and build the stem -> surface-token lookup.

        Args:
            X: data handed to ``cluster_algo``. With ``agglomerative_cluster``
                it is converted with ``squareform`` before scoring; otherwise
                ``toarray()`` is called on it for the univariate scores.
            name: first path component under ``plots/``.
            model: fitted vectorizer providing the feature names (needed by
                ``k_means_centers_keywords_scores`` and ``nmf_topics``).
            cluster_algo: callable ``(X, k, with_model=True) -> (labels, model)``.
            config_name: second path component under ``plots/``.
            abstracts: Series of token lists the clusters refer to.
        """
        self.X = X
        self.model = model
        self.name = name
        self.config_name = config_name

        # per-k caches, filled by get_cluster_labels
        self.cluster_labels = {}
        self.cluster_models = {}

        self.cluster_algo = cluster_algo
        self.abstracts = abstracts

        # Series.items() is available on every pandas version; iteritems() was removed in pandas 2.0
        self.stem_to_token = build_stem_to_token(abstracts.apply(" ".join).items())

    def get_cluster_labels(self, k):
        """Return the labels for ``k`` clusters, running ``cluster_algo`` on first use."""
        if k not in self.cluster_labels:
            self.cluster_labels[k], self.cluster_models[k] = self.cluster_algo(self.X, k, with_model=True)

        return self.cluster_labels[k]

    def _uses_distance_matrix(self):
        """True when ``self.X`` is clustered as a (condensed) distance matrix."""
        return self.cluster_algo == agglomerative_cluster

    def _feature_names(self):
        """Return the vectorizer's feature names.

        Prefers ``get_feature_names_out`` and falls back to ``get_feature_names``
        when the model does not provide it.
        """
        getter = getattr(self.model, "get_feature_names_out", None)
        if getter is None:
            getter = self.model.get_feature_names
        return getter()

    def calinski_harabasz_evaluation(self, n_clusters_limit, **kwargs):
        """Plot the Calinski-Harabasz score over the k range."""
        self.univariate_method_evaluation(n_clusters_limit, calinski_harabasz_score, "calinski_harabasz", **kwargs)

    def davies_bouldin_evaluation(self, n_clusters_limit, **kwargs):
        """Plot the Davies-Bouldin score over the k range."""
        self.univariate_method_evaluation(n_clusters_limit, davies_bouldin_score, "davies_bouldin", **kwargs)

    def average_silhouette_evaluation(self, n_clusters_limit, **kwargs):
        """Plot the average silhouette score over the k range."""
        score_metric = silhouette_score
        if self._uses_distance_matrix():
            # the matrix passed to the metric holds pairwise distances, not features
            score_metric = lambda X, labels: silhouette_score(X, labels, metric="precomputed")

        self.univariate_method_evaluation(n_clusters_limit, score_metric, "silhouette", **kwargs)

    def univariate_method_evaluation(self, n_clusters_limit, score_metric, score_metric_name, savefig=True, **kwargs):
        """Plot ``score_metric(X, labels)`` for every k in ``range(2, n_clusters_limit)``.

        Args:
            n_clusters_limit: exclusive upper bound of the k range; must be >= 2.
            score_metric: callable ``(X, labels) -> float``.
            score_metric_name: used in the title and the output file name.
            savefig: save to ``plots/{name}/{config_name}/{score_metric_name}_scores.pdf``
                and show the figure. When false the figure is neither saved nor
                shown (presumably so several evaluators can be overlaid).
            **kwargs: forwarded to ``plt.plot``.
        """
        assert n_clusters_limit >= 2

        k_range = range(2, n_clusters_limit)

        plt.title(f'{score_metric_name} Scores (max_k={n_clusters_limit})')

        # NOTE(review): Calinski-Harabasz and Davies-Bouldin receive the square
        # distance matrix as if it were a feature matrix in the agglomerative case.
        if self._uses_distance_matrix():
            X_n = spatial.distance.squareform(self.X)
        else:
            X_n = self.X.toarray()

        scores = [score_metric(X_n, self.get_cluster_labels(k)) for k in k_range]

        plt.plot(k_range, scores, label=self.config_name, **kwargs)

        plt.legend()

        if savefig:
            plt.savefig(f'plots/{self.name}/{self.config_name}/{score_metric_name}_scores.pdf')

            plt.show()

    def silhouette(self, k):
        """Draw the per-sample silhouette plot for ``k`` clusters and save it as PDF."""
        fig, ax = plt.subplots()

        if self._uses_distance_matrix():
            X = spatial.distance.squareform(self.X)
            # X now holds pairwise distances, so scikit-learn must not treat it as features
            metric_kwargs = {"metric": "precomputed"}
        else:
            X = self.X
            metric_kwargs = {}

        cluster_labels = self.get_cluster_labels(k)

        silhouette_avg = silhouette_score(X, cluster_labels, sample_size=None, **metric_kwargs)
        samples = silhouette_samples(X, cluster_labels, **metric_kwargs)

        max_silhouette_score = np.max(samples)

        y_lower = 10
        for i in range(0, k):
            cluster_silhouette_scores = samples[cluster_labels == i]
            cluster_silhouette_scores.sort()
            cluster_size = cluster_silhouette_scores.shape[0]

            y_upper = y_lower + cluster_size

            ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_scores)
            ax.text(-0.1 * max_silhouette_score, y_lower + 0.5 * cluster_size, str(i))

            y_lower = y_upper + 10  # 10 for the 0 samples

        plt.title(f'Silhouette Graph (k={k}) - {self.name} - {self.config_name}')

        ax.set_yticks([])
        ax.axvline(x=silhouette_avg, color="red", linestyle="--")
        plt.text(silhouette_avg + 0.01,20,f'silhouette_avg={np.round(silhouette_avg, 4)}')
        plt.savefig(f'plots/{self.name}/{self.config_name}/{k}/silhouette.pdf')
        plt.show()

    def tsne(self, k, random_state=0, **kwargs):
        """Plot a 2-D t-SNE embedding coloured by the ``k``-cluster labels.

        ``X`` is first reduced to ``k`` components with ``TruncatedSVD``;
        ``**kwargs`` are forwarded to ``TSNE``.
        """
        X = self.X
        cluster_labels = self.get_cluster_labels(k)

        X_truncated = TruncatedSVD(n_components=k, random_state=random_state).fit_transform(X)
        X_repr = TSNE(n_components=2, random_state=random_state, **kwargs).fit_transform(X_truncated)

        plt.subplot()
        # x / y passed by keyword: recent seaborn releases reject positional data vectors
        sns.scatterplot(x=X_repr[:, 0], y=X_repr[:, 1], s=1000, hue=cluster_labels, palette="Set3", legend="full")
        plt.title(f'TSNE 2-d Representation (k={k}) - {self.name} - {self.config_name}')
        plt.savefig(f'plots/{self.name}/{self.config_name}/{k}/tsne.pdf')
        plt.show()

    def format_keyword(self, keyword):
        """Replace every stem in ``keyword`` by its surface token, where one is known."""
        return " ".join([self.stem_to_token.get(w, w) for w in keyword.split()])

    def k_means_centers_nearest_papers(self, k, df):
        """Save the row of ``df`` closest to each K-Means centre as CSV."""
        assert self.cluster_algo == k_means_cluster

        # called for its side effect: makes sure the model for k has been fitted
        self.get_cluster_labels(k)
        kmeans = self.cluster_models[k]

        if kmeans is None:
            raise Exception(f"Cluster model for {k} does not exist.")

        nearest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, self.X)

        nearest_papers_df = df.iloc[nearest].reset_index(drop=True)
        nearest_papers_df.to_csv(f"./plots/{self.name}/{self.config_name}/{k}/most_representative_papers.csv")

    def k_means_centers_keywords_scores(self, k, word_count=6):
        """Plot the ``word_count`` heaviest non-zero features of each K-Means centre."""
        assert self.cluster_algo == k_means_cluster

        self.get_cluster_labels(k)
        kmeans = self.cluster_models[k]

        if kmeans is None:
            raise Exception(f"Cluster model for {k} does not exist.")

        # rows are features, columns are cluster ids 0 .. k-1
        centers = pd.DataFrame(kmeans.cluster_centers_.transpose(), index=self._feature_names())
        non_zero_centers = centers[~centers.eq(0)]

        result = []

        # one pass per cluster (the per-document label array would repeat every
        # cluster once for each of its documents)
        for label in range(k):
            cluster_df = non_zero_centers[label].dropna()

            for keyword, score in cluster_df.sort_values(ascending=False).iloc[:word_count].items():
                result.append((label, self.format_keyword(keyword), score))

        X_scores = pd.DataFrame(result, columns=["cluster", "keyword", "score"])

        plot_keyword_scores(X_scores, k, self.name, config_name=self.config_name, keyword_algo_name="k_means_centers")

    def keywords(self, k, keyword_algo=tfidf_transformer_keyword_scores, savefig=True, **kwds):
        """Score the keywords of each of the ``k`` clusters with ``keyword_algo`` and plot them."""
        cluster_labels = self.get_cluster_labels(k)

        X_scores = keyword_algo(cluster_labels, abstracts=self.abstracts, stem_to_token=self.stem_to_token, **kwds)
        plot_keyword_scores(X_scores, k, self.name, keyword_algo, self.config_name, savefig=savefig)

    def nmf_decomp(self, frobenius=True, n_topics=10, word_count=10, random_state=42, **kwargs):
        """Factorise ``self.X`` with NMF.

        Args:
            frobenius: use the default loss; otherwise Kullback-Leibler with the
                multiplicative-update solver.
            n_topics: number of components.
            word_count: accepted for signature compatibility; not used here.
            random_state: seed forwarded to ``NMF``.
            **kwargs: accepted and ignored.

        Returns:
            ``(W, H)`` where ``W`` is ``nmf.components_`` and ``H`` is the
            output of ``nmf.fit_transform(self.X)``.
        """
        if frobenius:
            # NMF - Frobenius
            nmf = NMF(n_components=n_topics, random_state=random_state, shuffle=True, alpha=.1)
        else:
            # NMF - Kullback-Leibler
            nmf = NMF(n_components=n_topics, random_state=random_state, shuffle=True, beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5)

        H = nmf.fit_transform(self.X)
        W = nmf.components_

        return W, H

    def nmf_topics(self, frobenius=True, n_topics=10, word_count=10, random_state=42, **kwargs):
        """Describe every NMF topic by its ``word_count`` heaviest features.

        Returns:
            List with one comma-separated string of formatted keywords per topic.
        """
        W, _ = self.nmf_decomp(frobenius=frobenius, n_topics=n_topics, random_state=random_state, **kwargs)

        feature_names = self._feature_names()

        result = []

        # each row of components_ holds one topic's weight per feature
        for topic in W:
            topic_top_words = topic.argsort()[::-1][:word_count]

            result.append(', '.join([self.format_keyword(feature_names[i]) for i in topic_top_words]))

        return result

    def print_nmf_topics(self, **kwargs):
        """Print the strings produced by ``nmf_topics(**kwargs)``, one topic per line."""
        for topic_id, topic in enumerate(self.nmf_topics(**kwargs)):
            print(f"Topic {topic_id}: {topic}")

    def merge(self, pdfnames):
        """Concatenate the per-k PDFs of every plot type into ``all_{pdfname}.pdf``.

        Values of k for which the PDF does not exist are skipped.
        """
        for pdfname in pdfnames:
            merger = PdfFileMerger()

            try:
                for k in sorted(self.cluster_labels.keys()):
                    try:
                        pdf_path = f"./plots/{self.name}/{self.config_name}/{k}/{pdfname}.pdf"
                        merger.append(pdf_path)
                    except FileNotFoundError:
                        continue

                merger.write(f"./plots/{self.name}/{self.config_name}/all_{pdfname}.pdf")
            finally:
                # release the merger's file handles even if appending or writing fails
                merger.close()
            

# Cluster Methods

## Tf-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the output directory (and ./plots itself) if it is not there yet.
os.makedirs("./plots/tfidf", exist_ok=True)

### Unigram

In [None]:
# TF-IDF over single (stemmed) tokens only.
unigram_model = TfidfVectorizer(
    ngram_range=(1, 1), 
    tokenizer=tokenizer, 
    min_df=2, # must appear in more than one document
    max_df=0.99 # must appear in fewer than 99% of documents 
)

config_name = "unigram"

# The vectorizer expects strings, so the token lists are joined back with spaces first.
unigram_scores = unigram_model.fit_transform(preprocessed_abstracts.apply(" ".join).values)
unigram_evaluator = ClusterEvaluator(unigram_scores, "tfidf", model=unigram_model, config_name=config_name)

# Output directory for this configuration.
if config_name not in os.listdir("./plots/tfidf"):
    os.mkdir(f"./plots/tfidf/{config_name}")

In [None]:
# 20-component NMF of the unigram matrix; uW is components_, uH the fit_transform output.
uW, uH = unigram_evaluator.nmf_decomp(n_topics=20)

In [None]:
# Label every row of uH with the index of its largest component.
nmf_cluster_labels = uH.argmax(axis=1)

In [None]:
# Titles of the papers whose NMF label is 10.
reference_preprocessing(df[nmf_cluster_labels == 10]).tolist()

In [None]:
# Plot the three cluster-quality scores for k in range(2, 30).
unigram_evaluator.average_silhouette_evaluation(30)
unigram_evaluator.calinski_harabasz_evaluation(30)
unigram_evaluator.davies_bouldin_evaluation(30)

In [None]:
# Produce every per-k diagnostic for the unigram configuration.
for k in [3, 6, 9, 12, 18]:
    # per-k output directory; created together with any missing parents
    os.makedirs(f"./plots/tfidf/unigram/{k}", exist_ok=True)

    unigram_evaluator.silhouette(k)
    unigram_evaluator.tsne(k)
    unigram_evaluator.keywords(k, keyword_algo=tfidf_vectorizer_keyword_scores)
    unigram_evaluator.keywords(k, keyword_algo=tfidf_transformer_keyword_scores)
    unigram_evaluator.k_means_centers_keywords_scores(k)
    unigram_evaluator.k_means_centers_nearest_papers(k, df)

In [None]:
# Concatenate the per-k PDFs of each listed plot type into one all_<name>.pdf.
unigram_evaluator.merge(["silhouette", "tsne", "keywords_tfidf_transformer", "keywords_k_means_centers", "keywords_tfidf_vectorizer"])

### Bigrams

In [None]:
# TF-IDF over unigrams and bigrams, with stricter document-frequency bounds than the unigram model.
bigram_model = TfidfVectorizer(
    ngram_range=(1, 2), # search for unigrams and bigrams
    tokenizer=tokenizer,
    min_df=3,
    max_df=0.95
)

config_name = "bigram"

# The vectorizer expects strings, so the token lists are joined back with spaces first.
bigram_scores = bigram_model.fit_transform(preprocessed_abstracts.apply(" ".join).values)
bigram_evaluator = ClusterEvaluator(bigram_scores, "tfidf", model=bigram_model, config_name=config_name)

# Output directory for this configuration.
if config_name not in os.listdir("./plots/tfidf"):
    os.mkdir("./plots/tfidf/" + config_name)

In [None]:
# Plot the three cluster-quality scores for k in range(2, 30).
bigram_evaluator.average_silhouette_evaluation(30)
bigram_evaluator.calinski_harabasz_evaluation(30)
bigram_evaluator.davies_bouldin_evaluation(30)

In [None]:
# Produce every per-k diagnostic for the bigram configuration.
for k in [6, 8, 9, 12, 15, 19]:
    # per-k output directory; created together with any missing parents
    os.makedirs(f"./plots/tfidf/{bigram_evaluator.config_name}/{k}", exist_ok=True)

    bigram_evaluator.silhouette(k)
    bigram_evaluator.tsne(k)
    bigram_evaluator.keywords(k, keyword_algo=tfidf_transformer_keyword_scores)
    bigram_evaluator.keywords(k, keyword_algo=tfidf_vectorizer_keyword_scores)
    bigram_evaluator.k_means_centers_keywords_scores(k)
    bigram_evaluator.k_means_centers_nearest_papers(k, df)

In [None]:
# Presumably prints 6 terms for each of 20 NMF topics (inferred from the call's name and arguments).
bigram_evaluator.print_nmf_topics(n_topics=20, word_count=6)

In [None]:
# Concatenate the per-k PDFs of each listed plot type into one all_<name>.pdf.
bigram_evaluator.merge(["silhouette", "tsne", "keywords_tfidf_transformer", "keywords_k_means_centers", "keywords_tfidf_vectorizer"])

### Multigrams

In [None]:
# TF-IDF over n-grams of length 1 to 5.
multigram_model = TfidfVectorizer(
    ngram_range=(1, 5),
    tokenizer=tokenizer,
    min_df=3,
    max_df=0.95
)

# The vectorizer expects strings, so the token lists are joined back with spaces first.
multigram_scores = multigram_model.fit_transform(preprocessed_abstracts.apply(" ".join).values)
multigram_evaluator = ClusterEvaluator(multigram_scores, "tfidf", model=multigram_model, config_name=f"multigram")

In [None]:
# Plot the three cluster-quality scores for k in range(2, 30).
multigram_evaluator.average_silhouette_evaluation(30)
multigram_evaluator.calinski_harabasz_evaluation(30)
multigram_evaluator.davies_bouldin_evaluation(30)

In [None]:
# Output directory for the multigram configuration, then one sub-directory per k.
os.makedirs("./plots/tfidf/multigram", exist_ok=True)
for k in [8, 11, 12, 15]:
    os.makedirs(f"./plots/tfidf/multigram/{k}", exist_ok=True)

    # Only the nearest-paper export is active; the plots below are disabled.
#     multigram_evaluator.silhouette(k)
#     multigram_evaluator.tsne(k)
#     multigram_evaluator.keywords(k, keyword_algo=tfidf_transformer_keyword_scores)
#     multigram_evaluator.keywords(k, keyword_algo=tfidf_vectorizer_keyword_scores)
#     multigram_evaluator.k_means_centers_keywords_scores(k)
    multigram_evaluator.k_means_centers_nearest_papers(k, df)

In [None]:
# Presumably prints the NMF topics of the multigram model with default settings (inferred from the call's name).
multigram_evaluator.print_nmf_topics()

In [None]:
# Concatenate the per-k PDFs of each listed plot type into one all_<name>.pdf.
multigram_evaluator.merge(["silhouette", "tsne", "keywords_tfidf_transformer", "keywords_k_means_centers", "keywords_tfidf_vectorizer"])

### NMF

In [None]:
# Table with one row per topic index and one column per n-gram configuration.
n_topics = 20
nmf_df = pd.DataFrame(index=range(n_topics))
# Each nmf_topics call is expected to return n_topics entries (they are assigned to a frame indexed 0..n_topics-1).
nmf_df["unigram"] = unigram_evaluator.nmf_topics(n_topics=n_topics, word_count=6)
nmf_df["bigram"] = bigram_evaluator.nmf_topics(n_topics=n_topics, word_count=6)
nmf_df["multigram"] = multigram_evaluator.nmf_topics(n_topics=n_topics, word_count=6)

nmf_df.to_csv(f"results/nmf_topics_{n_topics}.csv")

### Save Clusters

In [None]:
# Copy of the paper table with one label column per selected (configuration, k) pair.
tfidf_df = df.copy()

# unigram clusterings
tfidf_df[f"unigram_6"] = unigram_evaluator.get_cluster_labels(6)
tfidf_df[f"unigram_9"] = unigram_evaluator.get_cluster_labels(9)
tfidf_df[f"unigram_12"] = unigram_evaluator.get_cluster_labels(12)

# bigram clusterings
tfidf_df[f"bigram_8"] = bigram_evaluator.get_cluster_labels(8)
tfidf_df[f"bigram_9"] = bigram_evaluator.get_cluster_labels(9)
tfidf_df[f"bigram_12"] = bigram_evaluator.get_cluster_labels(12)
tfidf_df[f"bigram_15"] = bigram_evaluator.get_cluster_labels(15)
tfidf_df[f"bigram_19"] = bigram_evaluator.get_cluster_labels(19)

In [None]:
# Persist the labelled table; the index is not written.
tfidf_df.to_csv('results/tfidf_clusters_ngrams.csv', index=False)

## Word Mover's Distance on GloVe Embeddings

In [None]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile

from scipy import spatial

In [None]:
def convert_glove_2_w2v():
    """Convert the 50-d GloVe vectors to word2vec text format and load them.

    The converted file is written to a gensim temporary path.

    Returns:
        The ``KeyedVectors`` loaded from the converted file.
    """
    word2vec_path = get_tmpfile("test_word2vec.txt")
    glove2word2vec("./data/glove.6B/glove.6B.50d.txt", word2vec_path)

    vectors = KeyedVectors.load_word2vec_format(word2vec_path)
    return vectors
    
# model = convert_glove_2_w2v()

In [None]:
def calculate_distance_matrix():
    """Compute the symmetric matrix of Word Mover's Distances between all abstracts.

    Uses the module-level ``model`` and ``preprocessed_abstracts``. Each pair is
    evaluated once (including the diagonal) and mirrored; progress is printed.

    Returns:
        ``(s, s)`` ndarray where ``s`` is the number of abstracts.
    """
    s = preprocessed_abstracts.size
    result = np.ndarray((s,s))

    for i, a1 in enumerate(preprocessed_abstracts):
        print(f"({i}/{s}) Calculating distance for abstract...")

        # only the abstracts from position i onwards: earlier pairs are already filled in
        remaining = preprocessed_abstracts.iloc[i:]
        for j, a2 in enumerate(remaining):
            col = i + j
            distance = model.wmdistance(a1, a2)
            result[i][col] = distance
            result[col][i] = distance
            print(f"D_({i}, {i + j})={distance}")

    return result

# abstract_distance_matrix = calculate_distance_matrix()

In [None]:
# store distance matrix (disabled; run once after calculate_distance_matrix)
# pd.DataFrame(abstract_distance_matrix).to_csv('data/last_distance_matrix.csv')
# load the previously stored distance matrix from disk
abstract_distance_matrix = pd.read_csv('data/last_distance_matrix.csv', index_col=0).values
# convert the 2-D matrix with squareform; the result is what is handed to the linkage-based clustering below
abstract_df = spatial.distance.squareform(abstract_distance_matrix)

In [None]:
# Evaluator over the WMD distances, clustered hierarchically instead of with K-Means.
wmd_evaluator = ClusterEvaluator(abstract_df, "wmd_glove", config_name=f"last", cluster_algo=agglomerative_cluster)

# Output directory for this configuration (./plots/wmd_glove must already exist).
if "last" not in os.listdir("./plots/wmd_glove"):
    os.mkdir(f"./plots/wmd_glove/last")

### Dendrogram

In [None]:
# Ward-linkage dendrogram of the WMD distances, saved as PDF.
plt.title("All Years - Second Exlusion - Dendrogram")
dend = shc.dendrogram(shc.linkage(abstract_df, method='ward'))
plt.savefig('plots/wmd_glove/dendrogram.pdf', format='pdf')

### Evaluation

In [None]:
# Silhouette and keyword plots for k = 2 .. 11 on the WMD clustering.
for i in range(2, 12):
    # per-k output directory; created together with any missing parents
    os.makedirs(f"./plots/wmd_glove/last/{i}", exist_ok=True)

    wmd_evaluator.silhouette(i)
    wmd_evaluator.keywords(i, keyword_algo=tfidf_transformer_keyword_scores)
    wmd_evaluator.keywords(i, keyword_algo=tfidf_vectorizer_keyword_scores)

In [None]:
# Plot the three cluster-quality scores for k in range(2, 20).
wmd_evaluator.average_silhouette_evaluation(20)
wmd_evaluator.calinski_harabasz_evaluation(20)
wmd_evaluator.davies_bouldin_evaluation(20)

In [None]:
# Concatenate the per-k PDFs of each listed plot type into one all_<name>.pdf.
wmd_evaluator.merge(["silhouette", "keywords_tfidf_transformer", "keywords_tfidf_vectorizer"])

### Save Clusters

In [None]:
# Copy of the paper table with the WMD cluster labels for k = 2 .. 5.
wmd_df = df.copy()

for k in range(2, 6):
    wmd_df[f"cluster_{k}"] = wmd_evaluator.get_cluster_labels(k)
    
# Persist the labelled table; the index is not written.
wmd_df.to_csv("results/wmd_glove_clusters.csv", index=False)

## Alternatives

### Topic Modeling

It seems that there is not enough data available in our dataset (only ~300 paragraphs) to provide interesting results for topic modeling algorithms.

### People-Centric - TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [None]:
# Load the people-centric abstracts (first CSV column becomes the index).
people_centric_df = pd.read_csv("./archive/people_centric/data/abstracts.csv", index_col=0)

In [None]:
# Same cleaning pipeline as above, abstracts only (include_title keeps its default of False).
people_centric_abstracts = abstract_preprocessing(people_centric_df, dfname="people_centric")

In [None]:
# Unigram TF-IDF over the people-centric abstracts (no document-frequency bounds).
people_centric_model = TfidfVectorizer(
    ngram_range=(1, 1),
    tokenizer=tokenizer
)
# The vectorizer expects strings, so the token lists are joined back with spaces first.
people_centric_scores = people_centric_model.fit_transform(people_centric_abstracts.apply(" ".join).values)
# The vectorizer is passed as `model` because k_means_centers_keywords_scores
# reads the feature names from it.
people_centric_evaluator = ClusterEvaluator(
    people_centric_scores,
    "people_centric",
    model=people_centric_model,
    config_name="unigram",
    abstracts=people_centric_abstracts,
)

In [None]:
# Per-k diagnostics for the people-centric TF-IDF clustering, k = 2 .. 11.
for i in range(2, 12):
    # per-k output directory; created together with any missing parents
    os.makedirs(f"./plots/people_centric/unigram/{i}", exist_ok=True)

    people_centric_evaluator.silhouette(i)
    people_centric_evaluator.tsne(i)
    people_centric_evaluator.keywords(i, keyword_algo=tfidf_transformer_keyword_scores)
    people_centric_evaluator.keywords(i, keyword_algo=tfidf_vectorizer_keyword_scores)
    # the method's signature is (k, word_count=6): the number of clusters is the only positional argument
    people_centric_evaluator.k_means_centers_keywords_scores(i)

In [None]:
# Plot the three cluster-quality scores for k in range(2, 20).
people_centric_evaluator.average_silhouette_evaluation(20)
people_centric_evaluator.calinski_harabasz_evaluation(20)
people_centric_evaluator.davies_bouldin_evaluation(20)

In [None]:
# Concatenate the per-k PDFs of each listed plot type into one all_<name>.pdf.
people_centric_evaluator.merge(["silhouette", "tsne", "keywords_tfidf_transformer", "keywords_tfidf_vectorizer", "keywords_k_means_centers"])

In [None]:
# Copy of the people-centric table with the TF-IDF cluster labels for k = 5 .. 8.
people_centric_tfidf_df = people_centric_df.copy()

for k in range(5, 9):
    people_centric_tfidf_df[f"cluster_{k}"] = people_centric_evaluator.get_cluster_labels(k)
    
# Persist the labelled table; the index is not written.
people_centric_tfidf_df.to_csv("./results/people_centric_tfidf_clusters.csv", index=False)

### People-Centric - WMD

In [None]:
# Load the stored distance matrix for the people-centric abstracts (first CSV column becomes the index).
people_centric_abstract_df = pd.read_csv("./archive/people_centric/data/whole_abstract_distance_matrix.csv", index_col=0)

In [None]:
# scipy's linkage treats a 2-D input as an observation matrix; a distance
# matrix has to be passed in condensed form, exactly as done for the main
# corpus with `abstract_df`.
people_centric_wmd_evaluator = ClusterEvaluator(
    spatial.distance.squareform(people_centric_abstract_df.values),
    "people_centric",
    config_name="wmd_glove",
    cluster_algo=agglomerative_cluster,
)

# Output directory for this configuration, created together with any missing parents.
os.makedirs("./plots/people_centric/wmd_glove", exist_ok=True)

In [None]:
# Copy of the people-centric table with the WMD cluster labels for k = 5 .. 8.
people_centric_wmd_df = people_centric_df.copy()

for k in range(5, 9):
    people_centric_wmd_df[f"cluster_{k}"] = people_centric_wmd_evaluator.get_cluster_labels(k) 
    

# Persist the labelled table; the index is not written.
people_centric_wmd_df.to_csv("./results/people_centric_wmd_clusters.csv", index=False)

# Archive

The code below has been moved into the Evaluator class or has now become obsolete.

### Silhouette

In [None]:
def silhouette_evaluation(X, n_clusters, name, cluster_algo=k_means_cluster, config_name="default", squareform=False):
    """Draw, save and show the silhouette plot of one clustering of ``X``.

    The samples are clustered with ``cluster_algo(X, n_clusters)``. Each
    cluster is then drawn as a horizontal band of its sorted per-sample
    silhouette values, and a dashed red line marks the average score.

    Parameters
    ----------
    X : data passed to ``cluster_algo`` and to the silhouette functions.
    n_clusters : number of clusters requested; the bands are drawn for the
        labels ``0 .. n_clusters - 1``.
    name, config_name : used in the title and in the output path
        ``plots/{name}/{n_clusters}/silhouette_{config_name}.pdf``. The folder
        must already exist.
    cluster_algo : callable ``(X, n_clusters) -> labels``.
    squareform : when true, ``X`` is converted with
        ``spatial.distance.squareform`` after clustering and before scoring.
    """
    _, ax = plt.subplots()

    labels = cluster_algo(X, n_clusters)

    # The clustering sees X as given; only the scoring uses the converted form.
    if squareform:
        X = spatial.distance.squareform(X)

    per_sample = silhouette_samples(X, labels)
    mean_score = silhouette_score(X, labels, sample_size=None)

    # Used to place the cluster labels just left of the zero line.
    widest = np.max(per_sample)

    gap = 10  # vertical blank space between two consecutive cluster bands
    band_start = gap
    for cluster_id in range(n_clusters):
        band = np.sort(per_sample[labels == cluster_id])
        band_height = band.shape[0]
        band_end = band_start + band_height

        ax.fill_betweenx(np.arange(band_start, band_end), 0, band)
        ax.text(-0.1 * widest, band_start + 0.5 * band_height, str(cluster_id))

        band_start = band_end + gap

    ax.axvline(x=mean_score, color="red", linestyle="--")
    plt.text(mean_score + 0.01, 20, f'silhouette_avg={np.round(mean_score, 4)}')
    ax.set_yticks([])

    plt.title(f'Silhouette Graph (k={n_clusters}) - {name} - {config_name}')
    plt.savefig(f'plots/{name}/{n_clusters}/silhouette_{config_name}.pdf')
    plt.show()

### TSNE

In [None]:
def tsne_evaluation(X, n_clusters, name, random_state=0, config_name="default", **kwargs):
    """Draw, save and show a 2-d t-SNE scatter plot of ``X`` coloured by cluster.

    ``X`` is clustered with ``k_means_cluster``, reduced to ``n_clusters``
    components with ``TruncatedSVD`` and then embedded in two dimensions with
    ``TSNE``.

    Parameters
    ----------
    X : feature matrix passed to ``k_means_cluster`` and ``TruncatedSVD``.
    n_clusters : number of clusters, also used as the number of SVD components.
    name, config_name : used in the title and in the output path
        ``plots/{name}/{n_clusters}/tsne_{config_name}.pdf``. The folder must
        already exist.
    random_state : seed given to both ``TruncatedSVD`` and ``TSNE``.
    **kwargs : forwarded to the ``TSNE`` constructor.
    """
    labels = k_means_cluster(X, n_clusters)
    X_truncated = TruncatedSVD(n_components=n_clusters, random_state=random_state).fit_transform(X)
    X_repr = TSNE(n_components=2, random_state=random_state, **kwargs).fit_transform(X_truncated)

    ax = plt.subplot()
    # x and y must be keywords: seaborn >= 0.12 accepts only `data` positionally,
    # so the positional form raises a TypeError there. The keyword form works on
    # older versions too.
    sns.scatterplot(
        x=X_repr[:, 0],
        y=X_repr[:, 1],
        s=1000,
        hue=labels,
        palette="Set3",
        legend="full",
        ax=ax,
    )
    plt.title(f'TSNE 2-d Representation (k={n_clusters}) - {name} - {config_name}')

    plt.savefig(f'plots/{name}/{n_clusters}/tsne_{config_name}.pdf')
    plt.show()

### Tf-idf Vectorizer Scores

In [None]:
# Fitted vectorizers, keyed by the name of the abstracts Series they were fitted
# on, so repeated calls on the same corpus do not refit the model.
tfidf_vectorizer_keyword_scores_cache = {}


def tfidf_vectorizer_keyword_scores(clusters, word_count=6, abstracts=preprocessed_abstracts, stem_to_token=None, **kwargs):
    """Return the highest scoring TF-IDF keywords of every cluster.

    A uni-/bigram ``TfidfVectorizer`` (at most 5000 features) is fitted on all
    abstracts. For each cluster, the TF-IDF scores of its abstracts are summed
    per keyword and multiplied by the keyword's corpus-wide idf. The
    ``word_count`` best keywords are kept.

    Parameters
    ----------
    clusters : grouping key passed to ``abstracts.groupby``, i.e. the cluster
        of each abstract.
    word_count : number of keywords reported per cluster.
    abstracts : Series whose values are sequences of tokens (they are joined
        with spaces). Its ``name`` is the cache key of the fitted vectorizer.
    stem_to_token : optional mapping used to translate every word of a keyword
        back to a readable token. Words missing from it are kept unchanged.
    **kwargs : forwarded to ``TfidfVectorizer`` when a model is fitted.
        NOTE: they are ignored when a model for ``abstracts.name`` is already
        cached, because the cache key does not include them.

    Returns
    -------
    DataFrame with the columns ``cluster``, ``keyword`` and ``score``.
    """
    result = []

    # join abstract strings
    abstract_strings = abstracts.apply(" ".join)

    # get or store vectorizer model in cache
    model = tfidf_vectorizer_keyword_scores_cache.get(abstracts.name)
    if model is None:
        model = TfidfVectorizer(tokenizer=tokenizer, ngram_range=(1, 2), max_features=5000, **kwargs)
        model.fit(abstract_strings.values)

        tfidf_vectorizer_keyword_scores_cache[abstracts.name] = model

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on versions that predate get_feature_names_out(). Computed once instead
    # of once per cluster.
    if hasattr(model, "get_feature_names_out"):
        feature_names = model.get_feature_names_out()
    else:
        feature_names = model.get_feature_names()

    general_idf_ = pd.Series(model.idf_, index=feature_names)

    # for each cluster
    for cluster_id, cluster_idx in abstracts.groupby(clusters).groups.items():
        # get the associated abstracts (named so the parameter is not shadowed)
        cluster_strings = abstract_strings.loc[cluster_idx]

        cluster_scores = model.transform(cluster_strings.values)

        # put them in a dataframe where each word vector is a row
        scores_df = pd.DataFrame.sparse.from_spmatrix(cluster_scores.transpose(), index=feature_names)

        # remove all keywords that don't appear in the cluster for performance
        keyword_in_cluster_mask = ~scores_df.sum(axis=1).eq(0)
        scores_df = scores_df[keyword_in_cluster_mask]

        # accumulate scores based on the global idf
        weighted_scores = scores_df.sum(axis=1).mul(general_idf_[keyword_in_cluster_mask], axis=0).dropna()

        # sort the keywords by the cumulative idf scores
        sorted_keyword_scores = weighted_scores.sort_values(ascending=False)

        # Series.items() replaces iteritems(), which pandas 2.0 removed
        for keyword, score in sorted_keyword_scores.iloc[:word_count].items():
            if stem_to_token is not None:
                keyword = " ".join([stem_to_token.get(w, w) for w in keyword.split()])

            result.append((cluster_id, keyword, score))

    return pd.DataFrame(result, columns=["cluster", "keyword", "score"])

### TextRank

In [None]:
def stemmed_keyword_mean_score(keywords):
    """Average the scores of keywords that share the same stem.

    ``keywords`` is an iterable of ``(keyword, score)`` pairs. Keywords are
    grouped by their twice-stemmed form (``stem_text`` applied two times). Each
    group is reported under the first keyword seen for that stem, with the mean
    of the group's scores.

    Returns a list of ``(keyword, mean_score)`` tuples sorted by descending
    score, with ties broken by descending keyword.
    """
    representative = {}
    scores_by_stem = {}

    for kwd, score in keywords:
        stem = stem_text(stem_text(kwd))

        # setdefault keeps the first keyword encountered for each stem
        representative.setdefault(stem, kwd)
        scores_by_stem.setdefault(stem, []).append(score)

    averaged = [
        (representative[stem], np.mean(values))
        for stem, values in scores_by_stem.items()
    ]
    averaged.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)

    return averaged

def text_rank_keyword_scores(clusters, word_count=6, debug=False, abstracts=preprocessed_abstracts, **kwds):
    """Return up to ``word_count`` scored keywords for every cluster.

    The abstracts of a cluster are joined into one text (tokens separated by
    spaces, abstracts by ``". "``, en dashes replaced by spaces) and passed to
    ``keywords(text, scores=True, **kwds)``. The returned keywords are taken in
    the order given, skipping any whose twice-stemmed form was already seen in
    that cluster.

    Parameters
    ----------
    clusters : grouping key passed to ``Series.groupby``, i.e. the cluster of
        each abstract.
    word_count : maximum number of distinct (by stem) keywords per cluster.
    debug : when true, print every candidate keyword with its stem, score and
        the stems seen so far.
    abstracts : Series whose values are sequences of tokens.
    **kwds : forwarded to ``keywords``.
        NOTE(review): ``keywords`` is presumably gensim's TextRank keyword
        extractor — confirm against the notebook's imports.

    Returns
    -------
    DataFrame with the columns ``cluster``, ``keyword`` and ``score``.
    """
    result = []

    scored_keywords = abstracts.apply(" ".join)\
                               .apply(lambda s: s.replace("–", " "))\
                               .groupby(clusters)\
                               .apply(". ".join)\
                               .apply(lambda x: keywords(x, scores=True, **kwds))

    # Series.items() replaces iteritems(), which pandas 2.0 removed
    for group, kw_list in scored_keywords.items():
        stemmed_keywords_seen = set()

        for keyword, score in kw_list:
            # stop once enough distinct stems were collected for this cluster
            if len(stemmed_keywords_seen) == word_count:
                break

            stemmed_keyword = stem_text(stem_text(keyword))

            if debug:
                print(", ".join([keyword, stemmed_keyword, str(score), str(stemmed_keywords_seen)]))

            if stemmed_keyword not in stemmed_keywords_seen:
                stemmed_keywords_seen.add(stemmed_keyword)
                result.append((group, keyword, score))

    return pd.DataFrame(result, columns=["cluster", "keyword", "score"])

### IDF Scores

In [None]:
# Split the idf weights of bigram_model by feature length: features made of one
# whitespace-separated word go to the unigram dict, those made of two words to
# the bigram dict. Both map feature -> idf.
unigram_idf_scores = {
    term: idf
    for term, idf in zip(bigram_model.get_feature_names(), bigram_model.idf_)
    if len(term.split()) == 1
}
bigram_idf_scores = {
    term: idf
    for term, idf in zip(bigram_model.get_feature_names(), bigram_model.idf_)
    if len(term.split()) == 2
}

In [None]:
# Overlay the histograms (100 bins each) of the unigram and the bigram idf
# scores on the same axes so the two distributions can be compared.
ax = plt.gca()
pd.Series(unigram_idf_scores).plot.hist(bins=100, ax=ax)
pd.Series(bigram_idf_scores).plot.hist(bins=100, ax=ax)