In [3]:
import os
import pandas as pd
import collections
import pickle
#import cPickle
import _pickle as cPickle

import scipy
from scipy import sparse
import numpy as np

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import numpy as np
import random

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns

import nltk
import nltk.data
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize,sent_tokenize

import re
import string
import time
import codecs
from Bio import Medline

import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec, Word2Vec, CoherenceModel
from gensim.summarization import summarize

from collections import defaultdict

import pyLDAvis.gensim

import GapStatistics
import time

import bokeh

# Configurations and Variable Definitions

In [51]:
# Configuration: Data file name (it should be placed in the same ditrectory as the notebook file)
DATA_FILE = DATA_FILE = '/data/pubmedtext.txt'
#test variables
# test_pmid = '27179337'
test_pmid = '28324318'

In [52]:
# constant variables
CLEANED_ABSTRACT_COL = 'cleaned_abstract'
SUMMARY_ABSTRACT_COL = 'summary_abstract'
CLEANED_MINIMAL_ABSTRACT_COL = 'cleaned_minimal_abstract'
TOKENIZED_CLEAN_ABS_COL = 'tokenized_c_abstract'
TOKENIZED_RAW_ABS_COL = 'tokenized_r_abstract'
TOKEN_COUNT_RAW_ABS_COL = 'token_count_r_abstract'
TOKEN_COUNT_CLEAN_ABS_COL = 'token_count_c_abstract'
TOKEN_COUNT_Unq_CLEAN_ABS_COL = 'token_count_unq_c_abstract'

# Input File Processing

In [59]:
doc2author = dict()
doc_id_dict = dict()
topics=[]

In [60]:
#Dictionary of medline terms: https://www.nlm.nih.gov/bsd/mms/medlineelements.html
#Note that this differs from above by removing grants and affiliations in an effort to minimize exceptions
def read_medline_data_raw(filename):
    recs = Medline.parse(open(filename, 'r'))
    text = pd.DataFrame(columns = ["pmid", "articletitle", "journaltitle", "abstract"])
    count = 0
    for rec in recs:
        try:
            pmid = rec['PMID']
            atitle = rec["TI"]
            jtitle = rec["JT"]
            topics.append(jtitle)
            pubdate = rec["DP"]
            abstr = rec["AB"]
            a_name=rec["AU"]
            doc2author[count]=a_name
            doc_id_dict[count]=count
            count=count+1
            text = text.append(pd.DataFrame([[pmid,atitle, jtitle,pubdate, abstr]],
                columns=["pmid", "articletitle", "journaltitle", "pubdate", "abstract"]),ignore_index=True, verify_integrity=True)            
        except:
            pass
    text.set_index(text.pmid)
    return text

In [61]:
# get the Data file path

print("Cell started at: " + time.strftime("%c"))
try:
    approot = os.path.dirname(os.path.realpath('__file__'))
except NameError:  # if it is the main script, not a module
    import sys
    approot = os.path.dirname(os.path.abspath(sys.argv[0]))
DATA_PATH = approot + DATA_FILE

# Read in MEDLINE formatted text
%time papers = read_medline_data_raw(DATA_PATH)
papers.to_pickle('data/papers_pain_EngFilter_6_5_2017_raw_oct17update')
papers.to_csv('data/papers_pain_EngFilter_6_5_2017_raw_oct17update.csv')
print("Cell completed at: " + time.strftime("%c"))

Cell started at: Fri May 18 09:22:16 2018
CPU times: user 116 ms, sys: 0 ns, total: 116 ms
Wall time: 110 ms
Cell completed at: Fri May 18 09:22:16 2018


In [63]:
topics=list(set(topics))
topics2=topics[1:11]
topics2

['The international journal of neuropsychopharmacology',
 'Expert opinion on pharmacotherapy',
 'Health psychology : official journal of the Division of Health Psychology, American Psychological Association',
 'Foot & ankle international',
 'Expert review of clinical pharmacology',
 'Haematologica',
 'Head & neck',
 'European journal of radiology',
 'Experimental brain research',
 'Human & experimental toxicology']

In [64]:
papers = pd.read_pickle('data/papers_pain_EngFilter_6_5_2017_raw_oct17update')

In [65]:
papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 5 columns):
abstract        61 non-null object
articletitle    61 non-null object
journaltitle    61 non-null object
pmid            61 non-null object
pubdate         61 non-null object
dtypes: object(5)
memory usage: 2.5+ KB


In [66]:
pubdatevalues = papers.pubdate.unique()
#pubdatevalues.tofile("pubdatevalues.csv", sep=',') Examine range of pubdates manually in csv

In [67]:
papers['pubdate_dtformat'] = pd.to_datetime(papers.pubdate, format='%Y', exact=False)
papers['pubyear'] = pd.DatetimeIndex(papers['pubdate_dtformat']).year
papers=papers.drop('pubdate_dtformat', axis=1)

In [68]:
papers.tail(10)

Unnamed: 0,abstract,articletitle,journaltitle,pmid,pubdate,pubyear
51,Non-steroidal anti-inflammatory drugs (NSAIDs)...,Anti-inflammatory and antinociceptive activiti...,International immunopharmacology,21855654,2011 Nov,2011
52,We investigated the changes in characteristics...,Expression of inflammatory and apoptosis facto...,International immunopharmacology,21821152,2011 Nov,2011
53,"INTRODUCTION: Flupirtine, a nonopioid analgesi...",Efficacy and tolerability of flupirtine in sub...,International journal of clinical pharmacology...,22011688,2011 Nov,2011
54,Post-herpetic neuralgia means pain which occur...,Modified Jaipur block for the treatment of pos...,International journal of dermatology,22004501,2011 Nov,2011
55,Antipsychotic drugs are the clinical standard ...,Dynamic regulation of dopamine and serotonin r...,The international journal of neuropsychopharma...,21281560,2011 Nov,2011
56,"BACKGROUND: Sex, race, and age disparities in ...",Patient demographic characteristics and facial...,International journal of nursing studies,21596378,2011 Nov,2011
57,BACKGROUND: Even though the use of a 25 gauge ...,Comparison of post-dural puncture headache and...,International journal of nursing studies,21561619,2011 Nov,2011
58,PURPOSE: We analyzed variables associated with...,Predictors of long-term toxicity using three-d...,"International journal of radiation oncology, b...",20933342,2011 Nov 01,2011
59,PURPOSE: To correlate computed tomography (CT)...,Correlation of computed tomography imaging fea...,"International journal of radiation oncology, b...",20889265,2011 Nov 01,2011
60,Rheumatoid arthritis (RA) is one of the inflam...,The antinociceptive efficacy of HWTX-I epidura...,International journal of sports medicine,22052031,2011 Nov,2011


In [70]:
papers.journaltitle.nunique()

39

In [71]:
papers.to_pickle('data/papers_pain_EngFilter_6_5_2017_raw_oct17update_Mar18year')
papers = pd.read_pickle('data/papers_pain_EngFilter_6_5_2017_raw_oct17update_Mar18year')

# Text Cleanup & Tokenization

## Remove "all-caps:" section headers, punctuation marks, numbers, and option for words in all capital letters

In [121]:
# This function will use regular expressions to remove all:
#  (1) words/phrases in all caps followed by :, (if filter_all_caps = True, eveything all-caps will be removed)     
#  (2) numbers
#  (3) one letter words (such as n or p or other mathematical symbols)
#  (4) punctuation marks
# return value is the list of all filtered words (except for numbers), and the clean abstracts

# A utility function for flattening the lists
def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str):
            for sub in flatten(el):
                yield sub
        else:
            yield el

# Filter text as described above            
def filterText(abstracts, filter_all_caps = False):    
    #define the regex pattern
    regex = re.compile('[%s]' % re.escape(re.sub('-', '', string.punctuation)))
    if filter_all_caps:
        pattern = regex.pattern + r"|\b[A-Z]{2,}|[0-9]+|\b\w\b|\d-\d|\W-\W|\s-\s" 
    else:
        pattern = regex.pattern + r"|(((\s|^)[A-Z]{2,}(,)*)*)(\s|^)(\b[A-Z]{2,}):|[0-9]+|\b\w\b|\d-\d|\W-\W|\s-\s" 
    
    #The list of all filtered words (except for numbers)
    # convert to a falttened set for faster results
    all_filtered_series = abstracts.apply(lambda d: re.findall(pattern, d))
    all_filtered = set(flatten(all_filtered_series.tolist()))
    non_digit_filtered = [y for y in all_filtered if not y.isdigit()]
    
    #remove whatever needs to be filtered
    abstracts = abstracts.apply(lambda d: re.sub(pattern, '', d))
    abstracts = abstracts.apply(lambda d: re.sub('s-s', '', d))
    
    return abstracts, non_digit_filtered

## Remove stop words

In [76]:
# This function will replace all the stop words, it also gets rid of the whitespaces
def removeWord(text, stop_list):
    new_text = []
    
    #first convert to a list
    text_list = str.split(text)
    
    #check for stop words
    for x in text_list:
        if x not in stop_list:
            new_text.append(x)
    
    #convert back to string
    new_text_str = ' '.join(new_text)
    
    return new_text_str

# This function will remove stop words using NLTK stop word list
# It will return processed abstracts and stop word list
def removeStopWords(abstracts):         
    # use NLTK's default stop word list
    stop_set = set(stopwords.words('english'))
    
    # find all stop words and remove them
    abstracts = abstracts.apply(lambda d: removeWord(d, stop_set))
    
    return abstracts, stop_set

## Convert to lower case, tokenize

In [78]:
# This fuction will convert everything to lower-case
def changeToLowerCase(abstracts):  
    abstracts = abstracts.apply(lambda d: d.lower())
    return abstracts

In [79]:
# This function will tokenize all sentences
def tokenizeAbstracts(abstracts):  
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = abstracts.apply(lambda d: tokenizer.tokenize(d))
    return tokens

## Combine Preprocess Functions

In [81]:
def preprocessAbstracts(abstracts):    
    # Remove all-caps words, numbers, 1-letter words, punctuation marks 
    abstracts, filtered = filterText(abstracts, filter_all_caps= False)
    
    # conevrt to lower-case
    abstracts = changeToLowerCase(abstracts)
    
    # Remove stop words
    abstracts, stop_list = removeStopWords(abstracts)
    filtered = filtered + list(stop_list)
    
    # tokenize
    tokens = tokenizeAbstracts(abstracts)
    
    return abstracts, tokens, filtered

In [122]:
# This can be helpful for word2vec models that need minimal preprocessing
def minimallyPreprocessAbstracts(abstracts):
    #define the regex pattern
    regex = re.compile('[%s]' % re.escape(re.sub('-', '', string.punctuation)))
    pattern = regex.pattern + r"|[0-9.]+" 
    
    #remove whatever needs to be filtered
    abstracts = abstracts.apply(lambda d: re.sub(pattern, '#NUM', d))
    abstracts = abstracts.apply(lambda d: re.sub('s-s', '', d))
    return abstracts

## Apply preprocessing

In [84]:
# preprocess the abstracts
papers[CLEANED_ABSTRACT_COL], papers[TOKENIZED_CLEAN_ABS_COL], filtered = preprocessAbstracts(papers.abstract)
papers[CLEANED_MINIMAL_ABSTRACT_COL] = minimallyPreprocessAbstracts(papers.abstract)
%time papers[TOKENIZED_RAW_ABS_COL] = tokenizeAbstracts(papers.abstract)
# pre-compute the token counts
papers[TOKEN_COUNT_CLEAN_ABS_COL] =papers[TOKENIZED_CLEAN_ABS_COL].map(lambda text: len(text))
papers[TOKEN_COUNT_RAW_ABS_COL] =papers[TOKENIZED_RAW_ABS_COL].map(lambda text: len(text))
papers[TOKEN_COUNT_Unq_CLEAN_ABS_COL] =papers[TOKENIZED_CLEAN_ABS_COL].map(lambda text: len(set(text)))

CPU times: user 3.88 ms, sys: 0 ns, total: 3.88 ms
Wall time: 3.89 ms


In [85]:
#Show list of filtered terms
print(filtered)

['', ' CASE', ' ', 'DESIGN', ' RESULTS AND', 'LIMITATIONS', 'OPINION', 'OBJECTIVES', ' DESIGN AND', 'CONCLUSIONS', ' AND', 'UNLABELLED', 'PURPOSE', ' EXPERT', ' PATIENTS AND', 'PATIENTS', 'METHOD', 'BACKGROUND AND', 'BACKGROUND', 'INTRODUCTION', 'AIMS', 'COVERED', 'AIM', 'OBJECTIVE', ' METHODS AND', ' MATERIALS AND', 'PARTICIPANTS', 'MATERIALS', 'REPORT', 'RESULTS', ' AREAS', 'SETTINGS', 'CONCLUSION', 'METHODS', 'against', 'shouldn', 'down', 'haven', 'each', 'o', 'be', 'can', 'such', 'isn', 've', 'couldn', 'who', 'when', 'what', "didn't", "hasn't", 'me', 'm', 'nor', "don't", 'why', "should've", 'ain', 'if', 'further', 'by', 'the', 'won', "it's", 'whom', 's', 'll', 'it', 'did', 'mightn', 'he', 'more', 'wasn', 'any', 'wouldn', 'as', "aren't", "you've", 'over', 'that', 'were', 'themselves', 'or', 'is', 'weren', 'being', "shan't", 'then', "she's", 'himself', 'hasn', 'don', 'y', 'before', 'mustn', 'up', 'until', 'its', 'same', 'she', 'most', 'out', 'her', 't', 'does', "wouldn't", "you'd", "

In [86]:
papers.to_pickle('data/papapers_pain_EngFilter_6_5_2017_raw_preprocessed_oct17update_Mar18years')
%time papers = pd.read_pickle('data/papapers_pain_EngFilter_6_5_2017_raw_preprocessed_oct17update_Mar18years')

CPU times: user 5.55 ms, sys: 167 µs, total: 5.71 ms
Wall time: 5.74 ms


In [5]:
#papers.head()

In [88]:
#Test
print('Original Abstract: \n', papers.abstract[papers.pmid == test_pmid])
print('\nCleaned Abstract: \n', papers.loc[papers.pmid == test_pmid, CLEANED_ABSTRACT_COL])
print('\nTokenized Abstract: \n', papers.loc[papers.pmid == test_pmid, TOKENIZED_CLEAN_ABS_COL])
print('\nMinimally Processed Abstract: \n', papers.loc[papers.pmid == test_pmid, CLEANED_MINIMAL_ABSTRACT_COL])
papers[(papers.pmid == test_pmid)]

Original Abstract: 
 Series([], Name: abstract, dtype: object)

Cleaned Abstract: 
 Series([], Name: cleaned_abstract, dtype: object)

Tokenized Abstract: 
 Series([], Name: tokenized_c_abstract, dtype: object)

Minimally Processed Abstract: 
 Series([], Name: cleaned_minimal_abstract, dtype: object)


Unnamed: 0,abstract,articletitle,journaltitle,pmid,pubdate,pubyear,cleaned_abstract,tokenized_c_abstract,cleaned_minimal_abstract,tokenized_r_abstract,token_count_c_abstract,token_count_r_abstract,token_count_unq_c_abstract


# Topic Modeling using ATM

## Create Corpus, Dictionary, and Related Text Constructs

In [90]:
# create a dictionary mapping words to ids
abstract_list = papers[TOKENIZED_CLEAN_ABS_COL].values
ca_gs_dictionary = corpora.Dictionary(abstract_list)

#remove extremes (similar to tf-idf)
ca_gs_dictionary.filter_extremes(no_below=1, no_above=0.8)
ca_gs_dictionary.compactify()

In [95]:
#convert the dictionary to a bag of words
ca_gs_corpus = [ca_gs_dictionary.doc2bow(words) for words in abstract_list]
# corpora.mmcorpus.MmCorpus.serialize('data/ca_gensim.mm', corpus)
_ = ca_gs_dictionary[0]  # This sort of "initializes" dictionary.id2token.

In [97]:
from gensim.models import AuthorTopicModel
%time model = AuthorTopicModel(corpus=ca_gs_corpus, num_topics=10, id2word=ca_gs_dictionary.id2token, \
                doc2author=doc2author, chunksize=2000, passes=1, eval_every=0, \
                iterations=1, random_state=1)

CPU times: user 62.5 ms, sys: 0 ns, total: 62.5 ms
Wall time: 46.2 ms


In [99]:
%%time
model_list = []
for i in range(5):
    model = AuthorTopicModel(corpus=ca_gs_corpus, num_topics=10, id2word=ca_gs_dictionary.id2token, \
                    doc2author=doc2author, chunksize=2000, passes=100, gamma_threshold=1e-10, \
                    eval_every=0, iterations=1, random_state=i)
    top_topics = model.top_topics(ca_gs_corpus)
    tc = sum([t[1] for t in top_topics])
    model_list.append((model, tc))

CPU times: user 18.8 s, sys: 113 ms, total: 18.9 s
Wall time: 9.48 s


In [100]:
model, tc = max(model_list, key=lambda x: x[1])
print('Topic coherence: %.3e' %tc)

Topic coherence: -8.402e+01


In [101]:
# Save model.
model.save('/tmp/model.atmodel')

In [102]:
# Load model.
model = AuthorTopicModel.load('/tmp/model.atmodel')

In [103]:
model.show_topic(0)

[('patients', 0.012712839735249984),
 ('health', 0.011901774296122885),
 ('naproxen', 0.0081182757969928375),
 ('care', 0.0073345639440587423),
 ('conditions', 0.0071971177019728976),
 ('patient', 0.0071937039119465392),
 ('physical', 0.0071550196952673605),
 ('present', 0.006842090166583231),
 ('disease', 0.0067002584982109618),
 ('year', 0.0063025249701358149)]

In [104]:
topic_labels=topics2

In [105]:
for topic in model.show_topics(num_topics=10):
    print('Label: ' + topic_labels[topic[0]])
    words = ''
    for word, prob in model.show_topic(topic[0]):
        words += word + ' '
    print('Words: ' + words)
    print()

Label: The international journal of neuropsychopharmacology
Words: patients health naproxen care conditions patient physical present disease year 

Label: Expert opinion on pharmacotherapy
Words: pain induced acute inflammatory anti visual discomfort clinical facet activities 

Label: Health psychology : official journal of the Division of Health Psychology, American Psychological Association
Words: patients ami clinical ct coronary cdb levels patient diagnosis increased 

Label: Foot & ankle international
Words: pain treatment cortex dopamine patients fear headache children pituitary values 

Label: Expert review of clinical pharmacology
Words: pain patients post chronic patient study activated treatment disease age 

Label: Haematologica
Words: pain patients treatment haemophilia surgery quality life underwent review surgical 

Label: Head & neck
Words: patients pain ssc treatment flupirtine efficacy data active skin score 

Label: European journal of radiology
Words: using patients 

In [106]:
model['Stelzeneder D']

[(0, 0.54436973607307071), (1, 0.39202442195244247)]

In [127]:
from pprint import pprint

def show_author(name):
    print('\n%s' % name)
    #print('Docs:', model.author2doc[name])
    print('Journals:')
    pprint([(topic_labels[topic[0]], topic[1]) for topic in model[name]])

In [128]:
show_author('Stelzeneder D')


Stelzeneder D
Journals:
[('The international journal of neuropsychopharmacology', 0.54436973607307071),
 ('Expert opinion on pharmacotherapy', 0.39202442195244247)]


In [129]:
show_author('Dogan M')


Dogan M
Journals:
[('The international journal of neuropsychopharmacology', 0.83385582859807827),
 ('Health psychology : official journal of the Division of Health Psychology, '
  'American Psychological Association',
  0.028496938839331708),
 ('Expert review of clinical pharmacology', 0.086408456898801583)]


In [130]:
show_author('Quan GM')


Quan GM
Journals:
[('The international journal of neuropsychopharmacology', 0.021895163154751939),
 ('Haematologica', 0.6110058839438216),
 ('European journal of radiology', 0.33874664992282055)]


In [131]:
show_author('Uesugi K')


Uesugi K
Journals:
[('Health psychology : official journal of the Division of Health Psychology, '
  'American Psychological Association',
  0.16914440782994661),
 ('Human & experimental toxicology', 0.81428597386071266)]


In [132]:
from gensim.models import atmodel
author2doc = atmodel.construct_author2doc(model.doc2author)

In [133]:
# Compute the per-word bound.
# Number of words in corpus.
corpus_words = sum(cnt for document in model.corpus for _, cnt in document)

# Compute bound and divide by number of words.
perwordbound = model.bound(model.corpus, author2doc=model.author2doc, \
                           doc2author=model.doc2author) / corpus_words
print(perwordbound)

-7.92282602571


In [134]:
%time top_topics = model.top_topics(model.corpus)

CPU times: user 27.1 ms, sys: 0 ns, total: 27.1 ms
Wall time: 27.2 ms


In [135]:
%%time
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
smallest_author = 0  # Ignore authors with documents less than this.
authors = [model.author2id[a] for a in model.author2id.keys() if len(model.author2doc[a]) >= smallest_author]
_ = tsne.fit_transform(model.state.gamma[authors, :])  # Result stored in tsne.embedding_

CPU times: user 4.72 s, sys: 438 ms, total: 5.16 s
Wall time: 5.16 s


In [136]:
# Tell Bokeh to display plots inside the notebook.
from bokeh.io import output_notebook
output_notebook()

In [137]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

x = tsne.embedding_[:, 0]
y = tsne.embedding_[:, 1]
author_names = [model.id2author[a] for a in authors]

# Radius of each point corresponds to the number of documents attributed to that author.
scale = 0.1
author_sizes = [len(model.author2doc[a]) for a in author_names]
radii = [size * scale for size in author_sizes]

source = ColumnDataSource(
        data=dict(
            x=x,
            y=y,
            author_names=author_names,
            author_sizes=author_sizes,
            radii=radii,
        )
    )

# Add author names and sizes to mouse-over info.
hover = HoverTool(
        tooltips=[
        ("author", "@author_names"),
        ("size", "@author_sizes"),
        ]
    )

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

In [138]:
from gensim.similarities import MatrixSimilarity

# Generate a similarity object for the transformed corpus.
index = MatrixSimilarity(model[list(model.id2author.values())])

# Get similarities to some author.
author_name = 'Stelzeneder D'
sims = index[model[author_name]]

In [139]:
# Make a function that returns similarities based on the Hellinger distance.

from gensim import matutils
import pandas as pd

# Make a list of all the author-topic distributions.
author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]

def similarity(vec1, vec2):
    '''Get similarity between two vectors'''
    dist = matutils.hellinger(matutils.sparse2full(vec1, model.num_topics), \
                              matutils.sparse2full(vec2, model.num_topics))
    sim = 1.0 / (1.0 + dist)
    return sim

def get_sims(vec):
    '''Get similarity of vector to all authors.'''
    sims = [similarity(vec, vec2) for vec2 in author_vecs]
    return sims

def get_table(name, top_n=10, smallest_author=1):
    '''
    Get table with similarities, author names, and author sizes.
    Return `top_n` authors as a dataframe.
    
    '''
    
    # Get similarities.
    sims = get_sims(model.get_author_topics(name))

    # Arrange author names, similarities, and author sizes in a list of tuples.
    table = []
    for elem in enumerate(sims):
        author_name = model.id2author[elem[0]]
        sim = elem[1]
        author_size = len(model.author2doc[author_name])
        if author_size >= smallest_author:
            table.append((author_name, sim, author_size))
            
    # Make dataframe and retrieve top authors.
    df = pd.DataFrame(table, columns=['Author', 'Score', 'Size'])
    df = df.sort_values('Score', ascending=False)[:top_n]
    
    return df

In [140]:
get_table('Stelzeneder D')

Unnamed: 0,Author,Score,Size
276,Stelzeneder D,1.0,1
263,Scheurecker G,0.997696,1
305,Welsch GH,0.966961,1
300,Vlychou M,0.94121,1
92,Friedrich KM,0.757013,1
207,Mulder CJ,0.74061,1
4,Akyazi H,0.737258,1
148,Kara IH,0.7128,1
209,Mungan S,0.691935,1
243,Punta G,0.679087,1
