In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import pyLDAvis.gensim
import pyLDAvis
from pprint import pprint
import spacy
import logging
import warnings
import os
# Must be set before any tokenizer-backed model is loaded.
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism warning
# triggered by the transformer spaCy pipeline — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Fetch the NLTK stop-word corpus that stopwords.words(...) reads later in the notebook.
nltk.download('stopwords')
# Only ERROR-level (and above) log records are emitted, prefixed with a timestamp and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# --- Load the raw tables -------------------------------------------------------
# quoting=2 is csv.QUOTE_NONNUMERIC; text fields in this export are wrapped in single quotes.
publications_df = pd.read_csv('data/publications-converted.csv', quotechar="'", delimiter=',', quoting=2)
authors_df = pd.read_csv('data/authors_all.csv')
users_df = pd.read_csv('data/users_all.csv')
users_validated_df = pd.read_csv('data/users_validatAcceptat.csv')

# --- Keep only the columns and rows the analysis needs ---------------------------
publications_df = (
    publications_df
    .drop(columns=[
        'no_coauthors', 'publication_type', 'no_pages', 'd_oi',
        'category', 'file_link', 'external_link', 'publisher', 'w_os',
        'jhi_type', 'cross_ref_validation', 'publication_date',
        'file_link_shown', 'citations_number', 'metadata', 'internal_link',
        'keywords_valid', 'photo_link', 'mapped_to_id', 'local_keywords',
        'keywords_isi',
    ])
    .dropna(subset=['title', 'abstract_text', 'keywords'])
)
print(f"Starting languages: {publications_df['abstract_lang'].unique()}")

# Only one publication is tagged 'UNKNOWN' and it is written in English.
publications_df.loc[publications_df['abstract_lang'] == 'UNKNOWN', 'abstract_lang'] = 'en'
# Publications without an abstract language cannot be assigned to a language group.
publications_df = publications_df.dropna(subset=['abstract_lang'])
langs = publications_df['abstract_lang'].unique()
print(f'Used languages: {langs}')

# --- Split by language -----------------------------------------------------------
# From here on `publications_df` is a DataFrameGroupBy keyed by abstract language.
publications_df = publications_df.groupby('abstract_lang')

# The language column is constant inside each group, so it is dropped.
publications_en = publications_df.get_group('en').drop(columns=['abstract_lang'])
publications_ro = publications_df.get_group('ro').drop(columns=['abstract_lang'])

In [None]:
def get_publications_by_user_name(publications, first_name, last_name, users=None, validated=None):
    """Select one author's publications and hold one of them out at random.

    Args:
        publications: DataFrame of publications with at least an ``id`` column.
        first_name: Case-insensitive substring of the author's first name.
        last_name: Case-insensitive substring of the author's last name.
        users: Optional users table with ``first_name`` / ``last_name`` columns.
            Defaults to the notebook-level ``users_df``.
        validated: Optional table linking ``user_id`` to ``publication_id``.
            Defaults to the notebook-level ``users_validated_df``.

    Returns:
        A ``(user_publications, held_out_publication)`` tuple of DataFrames.
        The held-out frame has exactly one row that is not present in
        ``user_publications``. It is an empty ``pd.DataFrame()`` when a name is
        empty (the full ``publications`` set is returned unchanged), when no user
        matches, or when the user has no publications in ``publications``.
    """
    if len(first_name) == 0 or len(last_name) == 0:
        print('First name or last name is empty, returning full publications set')
        return publications, pd.DataFrame()

    # Fall back to the notebook-level tables when none are passed explicitly.
    if users is None:
        users = users_df
    if validated is None:
        validated = users_validated_df

    first_name = first_name.lower()
    last_name = last_name.lower()
    # regex=False: the typed name is matched literally (characters such as "(" or "."
    # are not regex syntax). na=False: users with a missing name never match.
    first_matches = users['first_name'].str.lower().str.contains(first_name, regex=False, na=False)
    last_matches = users['last_name'].str.lower().str.contains(last_name, regex=False, na=False)
    matching_users = users.loc[first_matches & last_matches]
    if len(matching_users) == 0:
        print(f'No user found matching: {first_name} {last_name}')
        return publications.iloc[0:0], pd.DataFrame()

    # When several users match, the first one is used.
    user = matching_users.values[0]
    # NOTE(review): user[0] assumes the first column of the users table is the user id — confirm.
    user_publications_ids = validated.loc[validated['user_id'] == user[0]]['publication_id'].values
    user_publications = publications.loc[publications['id'].isin(user_publications_ids)]
    if len(user_publications) == 0:
        print(f'No publications found in this set for user: {user}')
        return user_publications, pd.DataFrame()

    # Hold one random publication out of the set so it can be compared against later.
    random_single_publication = user_publications.sample()
    user_publications = user_publications.loc[user_publications['id'] != random_single_publication['id'].values[0]]
    return user_publications, random_single_publication

# Ask interactively for the author whose publications should be analysed.
user_first_name = input('Enter user first name: ')
user_last_name = input('Enter user last name: ')
# Narrow each language set to that author's publications and hold one random paper out per language.
# NOTE(review): publications_en / publications_ro are overwritten here, so re-running this cell
# filters the already-filtered frames again and holds out a further random paper each time.
publications_en, random_single_publication_en = get_publications_by_user_name(publications_en, user_first_name, user_last_name)
publications_ro, random_single_publication_ro = get_publications_by_user_name(publications_ro, user_first_name, user_last_name)

In [None]:
def cleanup_text(texts):
    """Normalise raw abstract strings before tokenisation.

    For every element: line breaks and tabs become a single space, the
    punctuation characters ``, . ! ? ` ' • „ "`` and all ASCII digits are
    removed, and the result is lower-cased.

    Args:
        texts: pandas Series of strings.

    Returns:
        A new Series with the cleaned strings; an empty input is returned as is.
    """
    if len(texts) == 0:
        return texts

    def _clean(text):
        # Line breaks and tabs separate words; deleting them outright would glue the
        # last word of one line to the first word of the next.
        text = re.sub('[\r\n\t]+', ' ', text)
        # Punctuation and digits carry no topical meaning here and are dropped.
        text = re.sub('[,\\.!?`\'•„"0-9]', '', text)
        return text.lower()

    return texts.map(_clean)

# Add the cleaned abstract as a new column. assign() returns a new frame instead of writing
# into a frame that was produced by filtering another one, which avoids pandas'
# SettingWithCopyWarning.
publications_en = publications_en.assign(abstract_text_processed=cleanup_text(publications_en['abstract_text']))
print(publications_en['abstract_text_processed'].head())
publications_ro = publications_ro.assign(abstract_text_processed=cleanup_text(publications_ro['abstract_text']))
print(publications_ro['abstract_text_processed'].head())

In [None]:
def split_words(sentences, deacc=True):
    """Lazily tokenise each text with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of texts; every item is converted with ``str()`` first.
        deacc: Forwarded to ``simple_preprocess``. When True, accent marks are
            stripped from the tokens (e.g. Romanian diacritics), so any stop-word
            list applied afterwards has to be accent-free as well. Pass False to
            keep diacritics.

    Yields:
        One list of tokens per input text.
    """
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=deacc)

def build_bigram_trigram_models(data_words):
    """Train phrase detectors on tokenised documents.

    Args:
        data_words: List of token lists, one per document.

    Returns:
        ``(bigram_mod, trigram_mod)``: ``Phraser`` wrappers around the trained
        bigram and trigram ``Phrases`` models.
    """
    # A higher threshold yields fewer detected phrases.
    bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    # The trigram detector is trained on documents already passed through the bigram model.
    trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

    # Phraser is the faster way to apply a trained Phrases model to a document.
    make_phraser = gensim.models.phrases.Phraser
    return make_phraser(bigram_phrases), make_phraser(trigram_phrases)

def remove_stopwords(texts, stop_words):
    """Drop stop words from every document.

    Args:
        texts: Iterable of documents. Each document is converted with ``str()`` and
            re-tokenised by ``simple_preprocess`` before filtering.
        stop_words: Iterable of words to remove.

    Returns:
        List of token lists with the stop words removed.
    """
    # Build the lookup once: membership in a set is constant-time, whereas the NLTK
    # stop-word lists would otherwise be scanned linearly for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts, bigram_mod):
    """Apply a trained bigram model to every document.

    Args:
        texts: Iterable of documents.
        bigram_mod: Object indexable by a document (``bigram_mod[doc]``), e.g. the
            bigram model returned by ``build_bigram_trigram_models``.

    Returns:
        List with ``bigram_mod[doc]`` for each document, in input order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

# def make_trigrams(texts, bigram_mod, trigram_mod):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, nlp, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Args:
        texts: Iterable of token lists, one per document.
        nlp: Loaded spaCy pipeline (anything exposing ``pipe(texts)`` that yields
            iterables of tokens with ``lemma_`` and ``pos_`` attributes).
        allowed_postags: Coarse POS tags to keep. The default is an immutable
            tuple so it cannot be modified between calls; any container
            supporting ``in`` may be passed.

    Returns:
        List of lemma lists, one per input document, in input order.
    """
    # Tokens are re-joined because the pipeline expects raw text.
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the texts as a stream in batches rather than one call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

def preprocess_text(data, lemmatization_model, stop_words, use_bigrams=False):
    """Turn cleaned texts into lemmatised, stop-word-free token lists.

    Steps: tokenise, lemmatise with the given spaCy model, remove stop words and,
    optionally, merge detected bigrams into single tokens.

    Args:
        data: List of cleaned text strings.
        lemmatization_model: Name of the spaCy model to load.
        stop_words: Collection of stop words to drop.
        use_bigrams: When True, train a bigram model on the documents and apply it.
            Defaults to False, matching the previous output; the phrase models are
            then not trained at all.

    Returns:
        List of token lists, one per input text. An empty list is returned when
        any of ``data``, ``lemmatization_model`` or ``stop_words`` is empty.
    """
    if len(data) == 0 or len(lemmatization_model) == 0 or len(stop_words) == 0:
        return []

    # tokenize
    data_words = list(split_words(data))

    # Load the spaCy pipeline without the parser and NER components, which are not
    # needed for lemmatisation.
    nlp = spacy.load(lemmatization_model, disable=['parser', 'ner'])

    # lemmatization
    data_words = lemmatization(data_words, nlp)

    # remove stop words
    data_words = remove_stopwords(data_words, stop_words)

    if use_bigrams:
        # Phrase models are only trained when their output is actually used.
        bigram_mod, _ = build_bigram_trigram_models(data_words)
        data_words = make_bigrams(data_words, bigram_mod)

    return data_words

# python3 -m spacy download en_core_web_trf
lemmatization_model_en = 'en_core_web_trf'
# python3 -m spacy download ro_core_news_lg
lemmatization_model_ro = 'ro_core_news_lg'

# NLTK stop words, extended with corpus-specific filler terms.
stop_words_en = stopwords.words('english')
stop_words_en.extend(['from', 'subject', 're', 'edu', 'use', 'result', 'datum'])
stop_words_ro = stopwords.words('romanian')
stop_words_ro.extend(['tip', 'of', 'the', 'for'])
# split_words() tokenises with deacc=True, so tokens reach the stop-word filter without
# diacritics. Add the accent-free form of every Romanian stop word so those tokens
# still match; the original accented forms are kept as well.
stop_words_ro.extend(sorted({gensim.utils.deaccent(word) for word in stop_words_ro} - set(stop_words_ro)))

data_en = publications_en['abstract_text_processed'].values.tolist()
data_ro = publications_ro['abstract_text_processed'].values.tolist()

data_words_en = preprocess_text(data_en, lemmatization_model_en, stop_words_en)
data_words_ro = preprocess_text(data_ro, lemmatization_model_ro, stop_words_ro)

# Preview the first processed document of each language.
print(data_words_en[:1])
print(data_words_ro[:1])

In [None]:
def display_wordcloud(data_words):
    """Build a word cloud from tokenised documents.

    Args:
        data_words: List of token lists. When empty, a single placeholder token
            ``'none'`` is used instead.

    Returns:
        The generated ``WordCloud`` object.
    """
    documents = data_words if len(data_words) != 0 else [['none']]

    # Flatten all documents into one comma-separated string.
    all_tokens = []
    for document in documents:
        all_tokens.extend(document)
    cloud_text = ','.join(all_tokens)

    cloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue', width=800, height=600)
    cloud.generate(cloud_text)
    return cloud

In [None]:
# Word cloud of the English tokens; the image is the last expression, so it is the cell output.
display_wordcloud(data_words_en).to_image()

In [None]:
# Word cloud of the Romanian tokens; the image is the last expression, so it is the cell output.
display_wordcloud(data_words_ro).to_image()

In [None]:
def create_dictionary_and_corpus(data_words):
    """Build the gensim dictionary and bag-of-words corpus for tokenised documents.

    Args:
        data_words: List of token lists, one per document.

    Returns:
        ``(id2word, corpus)``: the ``corpora.Dictionary`` built from ``data_words``
        and a list holding ``id2word.doc2bow(doc)`` for every document.
    """
    dictionary = corpora.Dictionary(data_words)
    # Term-document frequency: one bag-of-words per document.
    bow_corpus = list(map(dictionary.doc2bow, data_words))
    return dictionary, bow_corpus

id2word_en, corpus_en = create_dictionary_and_corpus(data_words_en)
id2word_ro, corpus_ro = create_dictionary_and_corpus(data_words_ro)

# First document of each corpus as raw (token id, frequency) pairs
print(corpus_en[:1])
print(corpus_ro[:1])

# Same documents with the token ids resolved to words (term-frequency)
print([[(id2word_en[token_id], count) for token_id, count in bow] for bow in corpus_en[:1]])
print([[(id2word_ro[token_id], count) for token_id, count in bow] for bow in corpus_ro[:1]])

In [None]:
# Set LDA parameters
# All values except num_words are forwarded to gensim.models.LdaMulticore by build_lda_model.
num_topics = 10  # number of topics to fit
num_words = 20  # words shown per topic by print_topics and compared per topic by diff()
chunksize = 2000  # LdaMulticore chunksize
passes = 5  # LdaMulticore passes
alpha = 'asymmetric'  # LdaMulticore alpha prior setting
eta = 'auto'  # LdaMulticore eta prior setting
iterations = 500  # LdaMulticore iterations
# NOTE(review): eval_every=1 presumably requests a perplexity estimate after every update,
# which slows training, while logging is limited to ERROR level — confirm it is needed.
eval_every = 1
workers = 4  # LdaMulticore workers

In [None]:
def build_lda_model(corpus, id2word, num_topics_param, chunksize_param, passes_param, alpha_param, eta_param, eval_every_param, iterations_param, workers_param, random_state_param=None):
    """Train a gensim ``LdaMulticore`` topic model.

    Args:
        corpus: Bag-of-words corpus (list of documents).
        id2word: Dictionary mapping token ids to words.
        num_topics_param: Forwarded as ``num_topics``.
        chunksize_param: Forwarded as ``chunksize``.
        passes_param: Forwarded as ``passes``.
        alpha_param: Forwarded as ``alpha``.
        eta_param: Forwarded as ``eta``.
        eval_every_param: Forwarded as ``eval_every``.
        iterations_param: Forwarded as ``iterations``.
        workers_param: Forwarded as ``workers``.
        random_state_param: Optional seed forwarded as ``random_state``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int when a
            reproducible run is required.

    Returns:
        The trained model, or ``None`` when ``corpus`` or ``id2word`` is empty.
    """
    if len(corpus) == 0 or len(id2word) == 0:
        return None

    # Build LDA model
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics_param,
        chunksize=chunksize_param,
        passes=passes_param,
        alpha=alpha_param,
        eta=eta_param,
        eval_every=eval_every_param,
        iterations=iterations_param,
        workers=workers_param,
        random_state=random_state_param,
    )

    return lda_model


# Train one model per language; build_lda_model returns None when there is nothing to train on.
lda_model_en = build_lda_model(corpus_en, id2word_en, num_topics, chunksize, passes, alpha, eta, eval_every, iterations, workers)
lda_model_ro = build_lda_model(corpus_ro, id2word_ro, num_topics, chunksize, passes, alpha, eta, eval_every, iterations, workers)

# Print the top `num_words` keywords of each topic and apply the model to its corpus.
# doc_lda_en / doc_lda_ro are only defined when the corresponding model exists.
if lda_model_en is not None:
    pprint(lda_model_en.print_topics(num_topics=num_topics, num_words=num_words))
    doc_lda_en = lda_model_en[corpus_en]
if lda_model_ro is not None:
    pprint(lda_model_ro.print_topics(num_topics=num_topics, num_words=num_words))
    doc_lda_ro = lda_model_ro[corpus_ro]

In [None]:
def compute_metrics(lda_model, model_description, corpus, id2word, data_words):
    """Print the likelihood bound, perplexity and c_v coherence of an LDA model.

    Args:
        lda_model: Trained gensim LDA model, or ``None``.
        model_description: Label used in the printed messages (e.g. ``'EN'``).
        corpus: Bag-of-words corpus to evaluate on.
        id2word: Dictionary used to build ``corpus``.
        data_words: Tokenised documents, needed for the coherence score.

    Returns:
        None. When the model is ``None`` or any input is empty, a
        "No metrics" message is printed instead of the metrics.
    """
    if lda_model is None or len(corpus) == 0 or len(id2word) == 0 or len(data_words) == 0:
        print(f'No metrics for {model_description}\n')
        return

    # log_perplexity() returns the per-word likelihood bound (higher, i.e. closer to
    # zero, is better) — not the perplexity itself. Perplexity is 2 ** (-bound), and for
    # that value lower is better. np.exp2 avoids an OverflowError on extreme bounds.
    per_word_bound = lda_model.log_perplexity(corpus)
    print(f'Per-word likelihood bound for {model_description}: {per_word_bound}\n')
    print(f'Perplexity for {model_description}: {np.exp2(-per_word_bound)}\n')

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f'Coherence Score for {model_description}: {coherence_lda}\n')

# Report the evaluation metrics of both language models (a "No metrics" line is printed
# for a language without a trained model).
compute_metrics(lda_model_en, 'EN', corpus_en, id2word_en, data_words_en)
compute_metrics(lda_model_ro, 'RO', corpus_ro, id2word_ro, data_words_ro)

In [None]:
# Model shown in the interactive view; point these at the *_ro objects to inspect Romanian.
lda_model = lda_model_en
corpus = corpus_en
id2word = id2word_en

# Visualize the topics
pyLDAvis.enable_notebook()
if lda_model is None:
    # build_lda_model returns None for an empty corpus; prepare() cannot handle that.
    print('No LDA model available, nothing to visualize')
    vis = None
else:
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
def compare_single_publication(random_single_publication, model_to_compare, lemmatization_model, stop_words):
    """Compare the topics of one publication with an existing LDA model.

    The publication's abstract is cleaned and preprocessed, a fresh LDA model is
    trained on it with the notebook-level LDA parameters, and that model is
    compared with ``model_to_compare`` using the Jaccard distance over the top
    ``num_words`` words of each topic.

    Args:
        random_single_publication: DataFrame holding the publication, with an
            ``abstract_text`` column. It is not modified.
        model_to_compare: Trained LDA model to compare against, or ``None``.
        lemmatization_model: Name of the spaCy model used for lemmatisation.
        stop_words: Stop words to remove during preprocessing.

    Returns:
        ``(mdiff, annotation)`` as returned by the model's ``diff`` method, or
        ``(None, None)`` when there is no publication, no reference model, or no
        model could be trained on the publication.
    """
    if len(random_single_publication) == 0 or 'abstract_text' not in random_single_publication:
        print('No publication to compare')
        return None, None
    if model_to_compare is None:
        print('No model to compare against')
        return None, None

    # Work on a separate Series so the caller's frame is left untouched.
    processed_abstracts = cleanup_text(random_single_publication['abstract_text'])

    data = processed_abstracts.values.tolist()
    data_words = preprocess_text(data, lemmatization_model, stop_words)

    id2word, corpus = create_dictionary_and_corpus(data_words)

    lda_model = build_lda_model(corpus, id2word, num_topics, chunksize, passes, alpha, eta, eval_every, iterations, workers)
    if lda_model is None:
        print('No model')
        return None, None

    mdiff, annotation = lda_model.diff(model_to_compare, distance='jaccard', num_words=num_words, n_ann_terms=num_words)
    return mdiff, annotation

def plot_difference(mdiff, title="", vmin=0.75, vmax=1):
    """Draw a topic-difference matrix as a heat map.

    Args:
        mdiff: 2-D matrix of topic distances, or ``None``.
        title: Figure title.
        vmin: Lower bound of the colour scale (defaults to the previously
            hard-coded 0.75).
        vmax: Upper bound of the colour scale.

    Returns:
        None. Nothing is drawn when ``mdiff`` is ``None``.
    """
    if mdiff is None:
        # compare_single_publication returns None when no comparison was possible.
        print('No topic difference matrix to plot')
        return

    fig, ax = plt.subplots(figsize=(10, 10))
    heatmap = ax.imshow(mdiff, cmap='RdBu_r', origin='lower', vmin=vmin, vmax=vmax)
    ax.set_title(title)
    fig.colorbar(heatmap, ax=ax)


if lda_model_en is None:
    # Without a trained English model there is nothing to compare against.
    print('No English LDA model available, skipping topic comparison')
else:
    # 1) Held-out paper by the selected author vs. the author's topic model.
    if len(random_single_publication_en) == 0:
        # get_publications_by_user_name returns an empty frame when nothing could be held out.
        print('No held-out publication for the selected author, skipping same-author comparison')
    else:
        mdiff, annotation = compare_single_publication(random_single_publication_en, lda_model_en, lemmatization_model_en, stop_words_en)
        print(f"Picked random paper from same author:\n{random_single_publication_en['title'].values[0]}\nfrom authors:\n{random_single_publication_en['authors'].values[0]}")
        if mdiff is not None:
            plot_difference(mdiff, title="Topic difference (random paper from same author) [jaccard distance]")

    # 2) Random paper from the whole English set vs. the same model.
    # publications_df is the groupby object built during loading, so this is the
    # unfiltered English group (it may also contain the selected author's papers).
    publications_all_en = publications_df.get_group('en')
    other_random_single_publication_en = publications_all_en.sample()
    print(f"Picked random paper from full dataset:\n{other_random_single_publication_en['title'].values[0]}\nfrom authors:\n{other_random_single_publication_en['authors'].values[0]}")
    mdiff, annotation = compare_single_publication(other_random_single_publication_en, lda_model_en, lemmatization_model_en, stop_words_en)
    if mdiff is not None:
        plot_difference(mdiff, title="Topic difference (random paper from dataset) [jaccard distance]")
