# Anomaly Detection and Keyword Extraction in Academic Papers

## Imports

In [None]:
import re
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import pyLDAvis.gensim
import pyLDAvis
from pprint import pprint
import spacy
import logging
import warnings
import os
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from keybert import KeyBERT
import pickle
os.environ["TOKENIZERS_PARALLELISM"] = "false"
nltk.download('stopwords')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
warnings.filterwarnings("ignore",category=DeprecationWarning)
DEBUG = False

## Dataset loading and preparation

In [None]:
# Load the raw CSV exports. The publications file uses single quotes as the
# quote character; quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC.
publications_df = pd.read_csv('data/publications-converted.csv', quotechar="'", delimiter=',', quoting=2)
authors_df = pd.read_csv('data/authors_all.csv')
users_df = pd.read_csv('data/users_all.csv')
# NOTE(review): users_validated_df is not referenced again in the visible notebook.
users_validated_df = pd.read_csv('data/users_validatAcceptat.csv')

# Remove exact duplicate rows before filtering.
publications_df = publications_df.drop_duplicates()
authors_df = authors_df.drop_duplicates()
# Drop the columns that the rest of the notebook never reads.
publications_df = publications_df.drop(columns=['no_coauthors', 'publication_type', 'no_pages', 'd_oi', 
                                                'category', 'file_link', 'external_link', 'publisher', 'w_os',
                                                'jhi_type', 'cross_ref_validation', 'publication_date',
                                                'file_link_shown', 'citations_number', 'metadata', 'internal_link',
                                                'keywords_valid', 'photo_link', 'mapped_to_id', 'local_keywords',
                                                'keywords_isi'])
# Keep only rows that have a title, abstract, keywords and authors.
publications_df = publications_df.dropna(subset=['title', 'abstract_text', 'keywords', 'authors'])
langs = publications_df['abstract_lang'].unique()
print(f'Starting languages: {langs}')
# Only one publication in 'UNKNOWN' language and it is in english
publications_df.loc[publications_df['abstract_lang'] == 'UNKNOWN', 'abstract_lang'] = 'en'
# Drop publications with no abstract language
publications_df = publications_df.dropna(subset=['abstract_lang'])
langs = publications_df['abstract_lang'].unique()
print(f'Used languages: {langs}')

# Group publications by language
# NOTE: from here on publications_df is a GroupBy object, not a DataFrame.
publications_df = publications_df.groupby('abstract_lang')

# get_group raises KeyError if the requested language is absent from the data.
publications_all_en = publications_df.get_group('en')
publications_all_ro = publications_df.get_group('ro')

# The language column is redundant once the frames are split per language.
publications_all_en = publications_all_en.drop(columns=['abstract_lang'])
publications_all_ro = publications_all_ro.drop(columns=['abstract_lang'])

print(f'Number of publications in english: {len(publications_all_en)}')
print(f'Number of publications in romanian: {len(publications_all_ro)}')

## Various methods of fetching and filtering publications

In [None]:
def get_publications_by_user_name(publications, authors, users, first_name, last_name):
    """Return the publications authored by the first user matching a name.

    Matching is a case-insensitive *literal* substring test on both the
    ``first_name`` and ``last_name`` columns of ``users``; rows with a missing
    name never match. When several users match, the first one is used.

    Args:
        publications: DataFrame with an ``id`` column.
        authors: DataFrame with ``user_id`` and ``publication_id`` columns.
        users: DataFrame whose first column is the user id and which has
            ``first_name`` and ``last_name`` columns.
        first_name: First name (or a fragment of it) to look for.
        last_name: Last name (or a fragment of it) to look for.

    Returns:
        The rows of ``publications`` written by the matched user. An empty
        DataFrame is returned when a name is empty, no user matches, or the
        user has no publications in ``publications``.
    """
    if len(first_name) == 0 or len(last_name) == 0:
        print('First name or last name is empty, returning empty publications set')
        return pd.DataFrame()
    first_name = first_name.lower()
    last_name = last_name.lower()
    # regex=False keeps characters such as '.' or '(' literal; na=False makes
    # users with a missing name evaluate to "no match" instead of NaN.
    first_name_matches = users['first_name'].str.lower().str.contains(first_name, regex=False, na=False)
    last_name_matches = users['last_name'].str.lower().str.contains(last_name, regex=False, na=False)
    matching_users = users.loc[first_name_matches & last_name_matches].values
    if len(matching_users) == 0:
        print(f'No user found with first name: {first_name} and last name: {last_name}')
        return pd.DataFrame()
    user = matching_users[0]
    # The user id is read from the first column of the users frame.
    user_publications_ids = authors.loc[authors['user_id'] == user[0], 'publication_id'].values
    if len(user_publications_ids) == 0:
        print(f'No publications found in this set for user: {user}')
        return pd.DataFrame()
    user_publications = publications.loc[publications['id'].isin(user_publications_ids)]
    if len(user_publications) == 0:
        print(f'No publications found in this set for user: {user}')
        return user_publications
    print(f'{len(user_publications)} publications found in this set for user: {user}')
    return user_publications

def get_publications_by_user_name_and_publication_name(publications, authors, users, first_name, last_name, pub_title):
    """Split a user's publications into a corpus and one held-out publication.

    The held-out publication is the first one whose title contains
    ``pub_title`` (case-insensitive, literal substring). When ``pub_title`` is
    ``None`` or empty, a random publication of the user is held out instead.

    Args:
        publications: DataFrame with ``id`` and ``title`` columns.
        authors: DataFrame with ``user_id`` and ``publication_id`` columns.
        users: DataFrame with the user id in its first column and
            ``first_name`` / ``last_name`` columns.
        first_name: First name (or fragment) of the author.
        last_name: Last name (or fragment) of the author.
        pub_title: Title fragment of the publication to hold out, or a falsy
            value to pick one at random.

    Returns:
        A ``(remaining_publications, held_out_publication)`` tuple; the second
        element is a one-row DataFrame.

    Raises:
        ValueError: If the user has no publications, or no title matches
            ``pub_title``.
    """
    user_publications = get_publications_by_user_name(publications, authors, users, first_name, last_name)
    if len(user_publications) == 0:
        # The lookup may return a column-less frame; fail with a clear message
        # instead of a KeyError on the missing 'title' column.
        raise ValueError(f'No publications available for user: {first_name} {last_name}')
    if pub_title:
        title_matches = user_publications['title'].str.lower().str.contains(pub_title.lower(), regex=False, na=False)
        if not title_matches.any():
            raise ValueError(f'No publication with title containing: {pub_title}')
        # Keep exactly one row so the held-out set and the remaining corpus
        # never overlap when several titles match.
        random_single_publication = user_publications.loc[title_matches].iloc[[0]]
    else:
        print('No publication title provided, returning random publication')
        random_single_publication = user_publications.sample(n=1)
    held_out_id = random_single_publication['id'].values[0]
    user_publications = user_publications.loc[user_publications['id'] != held_out_id]
    print(f'Random single publication: {random_single_publication.values[0]}')
    return user_publications, random_single_publication

def get_user_id_by_user_name(users, first_name, last_name):
    """Return the id of the first user whose name matches, or ``None``.

    Matching is a case-insensitive literal substring test on the
    ``first_name`` and ``last_name`` columns; rows with a missing name never
    match. The id is read from the first column of ``users``.

    Args:
        users: DataFrame with the user id in its first column and
            ``first_name`` / ``last_name`` columns.
        first_name: First name (or fragment) to look for.
        last_name: Last name (or fragment) to look for.

    Returns:
        The matched user's id, or ``None`` when a name is empty or no user
        matches. ``None`` is the "no user" value that
        ``get_publications_by_user_id`` already checks for.
    """
    if len(first_name) == 0 or len(last_name) == 0:
        print('First name or last name is empty, no user id returned')
        return None
    first_name = first_name.lower()
    last_name = last_name.lower()
    first_name_matches = users['first_name'].str.lower().str.contains(first_name, regex=False, na=False)
    last_name_matches = users['last_name'].str.lower().str.contains(last_name, regex=False, na=False)
    matching_users = users.loc[first_name_matches & last_name_matches].values
    if len(matching_users) == 0:
        print(f'No user found with first name: {first_name} and last name: {last_name}')
        return None
    return matching_users[0][0]

def get_publications_by_user_id(publications, authors, user_id):
    """Return the rows of ``publications`` authored by ``user_id``.

    Args:
        publications: DataFrame with an ``id`` column.
        authors: DataFrame with ``user_id`` and ``publication_id`` columns.
        user_id: Id of the author; ``None`` and ``0`` mean "no user".

    Returns:
        The matching publication rows. A column-less empty DataFrame is
        returned when there is no user id or the user has no entries in
        ``authors``; an empty slice of ``publications`` is returned when none
        of the user's publication ids are present in ``publications``.
    """
    if user_id is None or user_id == 0:
        print('No user id')
        return pd.DataFrame()

    authored_rows = authors.loc[authors['user_id'] == user_id]
    publication_ids = authored_rows['publication_id'].values
    if len(publication_ids) == 0:
        print(f'No publications found in this set for user: {user_id}')
        return pd.DataFrame()

    is_authored = publications['id'].isin(publication_ids)
    matched = publications.loc[is_authored]
    if len(matched) > 0:
        print(f'{len(matched)} publications found in this set for user: {user_id}')
    else:
        print(f'No publications found in this set for user: {user_id}')
    return matched

## Hyperparameters for text processing and LDA

In [None]:
# Names of the spaCy pipelines handed to spacy.load() during preprocessing.
# python3 -m spacy download en_core_web_trf
lemmatization_model_en = 'en_core_web_trf'
# python3 -m spacy download ro_core_news_lg
lemmatization_model_ro = 'ro_core_news_lg'

# NLTK stop word lists, extended with extra tokens that are filtered out after
# lemmatization.
stop_words_en = stopwords.words('english')
stop_words_en.extend(['from', 'subject', 're', 'edu', 'use', 'result', 'datum'])
stop_words_ro = stopwords.words('romanian')
stop_words_ro.extend(['tip', 'of', 'the', 'for'])

# Set LDA parameters
# num_topics and num_words are passed explicitly to the LDA helpers; the
# remaining values are read as module-level globals inside build_lda_model and
# forwarded to gensim's LdaMulticore under the same keyword names.
num_topics = 10
num_words = 20
chunksize = 2000
passes = 5
alpha = 'asymmetric'
eta = 'auto'
iterations = 500
eval_every = 1
workers = 4
random_state = 42

## Fetching publications for a specific user

In [None]:
# Ask interactively for the author whose publications will be analysed.
user_first_name = input('Enter author first name: ') 
user_last_name = input('Enter author last name: ')
# Testing specific case in which author published something very different from his usual publications
# First call: hold out the publication whose title contains 'Indoor Positioning'.
publications_en, random_author_publication_en = get_publications_by_user_name_and_publication_name(publications_all_en, authors_df, users_df, user_first_name, user_last_name, 'Indoor Positioning')
# Second call: hold out one more, randomly chosen, publication from what is left.
publications_en, other_random_author_publication_en = get_publications_by_user_name_and_publication_name(publications_en, authors_df, users_df, user_first_name, user_last_name, None)
# Romanian publications of the same author (may be an empty DataFrame).
publications_ro = get_publications_by_user_name(publications_all_ro, authors_df, users_df, user_first_name, user_last_name)

## Text processing functions

In [None]:
def cleanup_text(texts):
    """Normalise raw abstract texts before tokenisation.

    For every string in ``texts``: newlines and tabs are turned into a single
    space, the punctuation characters ``, . ! ? ` ' • „ " ( )`` and all digits
    are removed, and the result is lower-cased.

    Args:
        texts: pandas Series of strings.

    Returns:
        A Series of cleaned strings; an empty input is returned unchanged.
    """
    if len(texts) == 0:
        return texts

    def _clean(text):
        # Newlines and tabs separate words, so they are replaced by a space;
        # deleting them would glue the neighbouring words into one token.
        text = re.sub(r'[\n\t]', ' ', text)
        # Remove punctuation
        text = re.sub(r'''[,.!?`'•„"()]''', '', text)
        # Remove numbers
        text = re.sub(r'[0-9]', '', text)
        return text.lower()

    return texts.map(_clean)

def split_words(sentences):
    """Lazily tokenize each sentence into a list of lowercase tokens.

    Every item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``. ``deacc=True`` asks gensim to strip accent marks
    from the tokens (punctuation is discarded by the tokenizer itself).
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

def remove_stopwords(texts, stop_words):
    """Re-tokenize every document and drop the tokens listed in ``stop_words``.

    Args:
        texts: Iterable of documents; each document is converted with
            ``str()`` and re-tokenized by gensim's ``simple_preprocess``.
        stop_words: Iterable of words to remove.

    Returns:
        A list with one list of kept tokens per document.
    """
    # Build the lookup once: set membership is O(1), whereas the stop word
    # lists in this notebook would be scanned linearly for every token.
    stop_word_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_word_set] for doc in texts]

def make_bigrams(texts, bigram_mod):
    """Apply ``bigram_mod`` to every document and collect the results.

    ``bigram_mod`` only needs to support indexing with a document
    (``bigram_mod[doc]``); the transformed documents are returned in order.
    """
    transformed = []
    for document in texts:
        transformed.append(bigram_mod[document])
    return transformed

def lemmatization(texts, nlp, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: Iterable of token lists.
        nlp: Callable that takes a string and returns an iterable of tokens
            exposing ``lemma_`` and ``pos_`` (e.g. a loaded spaCy pipeline).
        allowed_postags: Part-of-speech tags whose tokens are kept. The
            default is an immutable tuple so it cannot be shared and mutated
            across calls.

    Returns:
        A list with one list of lemmas per input document.
    """
    allowed = frozenset(allowed_postags)
    texts_out = []
    for sent in texts:
        # The pipeline works on raw text, so the tokens are re-joined first.
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed])
    return texts_out

# Loaded spaCy pipelines keyed by model name. Loading a pipeline is expensive
# and preprocess_text is called once per LDA model, so each one is loaded once.
_SPACY_PIPELINE_CACHE = {}


def preprocess_text(data, lemmatization_model, stop_words):
    """Tokenize, lemmatize and stop-word-filter a list of texts.

    Args:
        data: List of (already cleaned) text strings.
        lemmatization_model: Name of the spaCy pipeline to load.
        stop_words: Words to remove after lemmatization.

    Returns:
        A list with one list of processed tokens per text, or ``[]`` when any
        of the three arguments is empty.
    """
    if len(data) == 0 or len(lemmatization_model) == 0 or len(stop_words) == 0:
        return []

    # tokenize
    data_words = list(split_words(data))

    # Load the spaCy pipeline without its parser and NER components, reusing
    # a previously loaded instance for the same model name.
    nlp = _SPACY_PIPELINE_CACHE.get(lemmatization_model)
    if nlp is None:
        nlp = spacy.load(lemmatization_model, disable=['parser', 'ner'])
        _SPACY_PIPELINE_CACHE[lemmatization_model] = nlp

    # lemmatization
    data_words = lemmatization(data_words, nlp)

    # remove stop words
    data_words = remove_stopwords(data_words, stop_words)

    return data_words

## LDA functions

In [None]:
def create_dictionary_and_corpus(data_words):
    """Build a gensim dictionary and the matching bag-of-words corpus.

    Args:
        data_words: List of token lists, one per document.

    Returns:
        ``(id2word, corpus)`` where ``id2word`` is a ``corpora.Dictionary``
        built from ``data_words`` and ``corpus`` holds the ``doc2bow`` result
        of every document, in order.
    """
    dictionary = corpora.Dictionary(data_words)
    bow_corpus = list(map(dictionary.doc2bow, data_words))
    return dictionary, bow_corpus

def build_lda_model(corpus, id2word, num_topics_param, num_words_param):
    """Train an ``LdaMulticore`` model and apply it to its own corpus.

    Besides the arguments, the training reads the module-level settings
    ``chunksize``, ``passes``, ``alpha``, ``eta``, ``eval_every``,
    ``iterations``, ``workers`` and ``random_state``.

    Args:
        corpus: Bag-of-words corpus.
        id2word: Dictionary mapping token ids to tokens.
        num_topics_param: Number of topics to train.
        num_words_param: Number of words per topic shown in DEBUG output.

    Returns:
        ``(lda_model, doc_lda)`` where ``doc_lda`` is ``lda_model[corpus]``,
        or ``(None, None)`` when the corpus or the dictionary is empty.
    """
    if len(corpus) == 0 or len(id2word) == 0:
        return None, None

    # Build LDA model
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics_param,
        chunksize=chunksize,
        passes=passes,
        alpha=alpha,
        eta=eta,
        eval_every=eval_every,
        iterations=iterations,
        workers=workers,
        random_state=random_state,
    )

    # The constructor either returns a model or raises, so no None check is
    # needed before using it.
    if DEBUG:
        pprint(lda_model.print_topics(num_topics=num_topics_param, num_words=num_words_param))

    return lda_model, lda_model[corpus]

def create_lda_for_publications(publications, lemmatization_model, stop_words, num_topics_param, num_words_param):
    """Run the full text pipeline on publication abstracts and train an LDA model.

    The ``abstract_text`` column is cleaned, tokenized, lemmatized and
    stop-word filtered, then turned into a dictionary and corpus on which the
    model is trained. The ``publications`` frame is not modified.

    Args:
        publications: DataFrame with an ``abstract_text`` column. An empty
            frame (including one without any columns) is accepted.
        lemmatization_model: Name of the spaCy pipeline used for lemmatization.
        stop_words: Words removed after lemmatization.
        num_topics_param: Number of LDA topics.
        num_words_param: Number of words per topic shown in DEBUG output.

    Returns:
        ``(lda_model, doc_lda, id2word, corpus, data_words)``. For empty input
        the model and ``doc_lda`` are ``None`` and the remaining values are
        empty.
    """
    if len(publications) == 0 or 'abstract_text' not in publications:
        # The fetch helpers return a column-less frame when nothing is found;
        # let the empty case flow through the rest of the pipeline.
        data = []
    else:
        # Work on a local Series: assigning a new column would mutate the
        # caller's frame, which is usually a slice of a larger frame.
        abstracts_processed = cleanup_text(publications['abstract_text'])
        if DEBUG:
            print(abstracts_processed.head())
        data = abstracts_processed.values.tolist()

    data_words = preprocess_text(data, lemmatization_model, stop_words)
    if DEBUG:
        print(data_words[:1])

    id2word, corpus = create_dictionary_and_corpus(data_words)

    if DEBUG:
        print(corpus[:1])
        print([[(id2word[token_id], freq) for token_id, freq in cp] for cp in corpus[:1]])

    lda_model, doc_lda = build_lda_model(corpus, id2word, num_topics_param, num_words_param)

    return lda_model, doc_lda, id2word, corpus, data_words

## Creating LDA models

In [None]:
# Train one LDA model per language on the chosen author's publications, using
# the hyperparameters defined earlier in the notebook.
lda_model_en, doc_lda_en, id2word_en, corpus_en, data_words_en = create_lda_for_publications(publications_en, lemmatization_model_en, stop_words_en, num_topics, num_words)
lda_model_ro, doc_lda_ro, id2word_ro, corpus_ro, data_words_ro = create_lda_for_publications(publications_ro, lemmatization_model_ro, stop_words_ro, num_topics, num_words)

## Word clouds

In [None]:
def display_wordcloud(data_words):
    """Build a word cloud from tokenized documents.

    Args:
        data_words: List of token lists. An empty list is replaced by the
            single placeholder token ``'none'``.

    Returns:
        The generated ``WordCloud`` object.
    """
    documents = data_words if len(data_words) != 0 else [['none']]
    # Flatten all documents into one token list.
    tokens = []
    for document in documents:
        tokens.extend(document)
    cloud = WordCloud(
        background_color="white",
        max_words=1000,
        contour_width=3,
        contour_color='steelblue',
        width=800,
        height=600,
    )
    # The cloud is generated from a single comma-separated string.
    cloud.generate(','.join(tokens))
    return cloud

In [None]:
# Render the word cloud of the English tokens as an image (cell output).
display_wordcloud(data_words_en).to_image()

In [None]:
# Render the word cloud of the Romanian tokens as an image (cell output).
display_wordcloud(data_words_ro).to_image()

## Basic metrics and topic visualization

In [None]:
def compute_metrics(lda_model, model_description, corpus, id2word, data_words):
    """Print the log-perplexity and the c_v coherence of an LDA model.

    Nothing is computed (a notice is printed instead) when the model is
    ``None`` or any of ``corpus``, ``id2word`` or ``data_words`` is empty.

    Args:
        lda_model: Trained LDA model or ``None``.
        model_description: Label used in the printed messages.
        corpus: Bag-of-words corpus evaluated by ``log_perplexity``.
        id2word: Dictionary used by the coherence model.
        data_words: Tokenized texts used by the coherence model.
    """
    inputs_missing = (
        lda_model is None
        or len(corpus) == 0
        or len(id2word) == 0
        or len(data_words) == 0
    )
    if inputs_missing:
        print(f'No metrics for {model_description}\n')
        return

    # NOTE(review): gensim documents log_perplexity() as returning a per-word
    # likelihood bound rather than the perplexity itself - confirm how the
    # printed value should be interpreted.
    log_perplexity = lda_model.log_perplexity(corpus)
    print(f'Perplexity for {model_description}: {log_perplexity}\n')

    coherence_scorer = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
    coherence_score = coherence_scorer.get_coherence()
    print(f'Coherence Score for {model_description}: {coherence_score}\n')

# Report the metrics for both language models; a missing model only prints a notice.
compute_metrics(lda_model_en, 'EN', corpus_en, id2word_en, data_words_en)
compute_metrics(lda_model_ro, 'RO', corpus_ro, id2word_ro, data_words_ro)

In [None]:
# Select the English model for the interactive visualisation.
# NOTE: these assignments create module-level names (lda_model, corpus,
# id2word) that stay defined for every later cell.
lda_model = lda_model_en
corpus = corpus_en
id2word = id2word_en

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare expression so the notebook displays the prepared visualisation.
vis

## Finding optimal number of topics for LDA

In [None]:
def compute_coherence_values(dictionary, corpus, texts, num_topics_range):
    """Train one LDA model per topic count and score each with c_v coherence.

    Args:
        dictionary: Dictionary used both for training and for scoring.
        corpus: Bag-of-words corpus the models are trained on.
        texts: Tokenized texts used by the coherence model.
        num_topics_range: Iterable of topic counts to try; ``0`` is treated
            as ``1``.

    Returns:
        ``(model_data_list, coherence_values)``: the ``(lda_model, doc_lda)``
        pair of every trained model, and the matching coherence scores
        rounded to two decimals.
    """
    coherence_values = []
    model_data_list = []
    for num_topics_param in num_topics_range:
        if num_topics_param == 0:
            num_topics_param = 1
        # Train with the dictionary passed in, not with whatever module-level
        # `id2word` happens to be defined by an earlier cell.
        model_data = build_lda_model(corpus, dictionary, num_topics_param, num_words)
        model_data_list.append(model_data)
        coherencemodel = CoherenceModel(model=model_data[0], texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(round(coherencemodel.get_coherence(), 2))

    return model_data_list, coherence_values

In [None]:
# Candidate topic counts: range(start, limit, step) with the leading 0
# replaced by 1, i.e. 1, 5, 10, 15, 20, 25.
start = 0
limit = 30
step = 5
num_topics_range = [1 if i == 0 else i for i in range(start, limit, step)]
# Train and score one English model per candidate topic count.
model_list_en, coherence_values_en = compute_coherence_values(id2word_en, corpus_en, data_words_en, num_topics_range)

In [None]:
# Show graph
plt.plot(num_topics_range, coherence_values_en)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# List the coherence score obtained for every candidate topic count.
# The scores were already rounded to two decimals when they were computed.
for m, cv in zip(num_topics_range, coherence_values_en):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Pick the topic count with the highest coherence score.
# NOTE: this overwrites the module-level num_topics that later cells read.
num_topics = max(list(zip(num_topics_range, coherence_values_en)), key=lambda x: x[1])[0]
print(f'Optimal number of topics = {num_topics}')
# Replace the English model with the one trained for that topic count.
lda_model_en, doc_lda_en = model_list_en[coherence_values_en.index(max(coherence_values_en))]

## Finding out the dominant topic for each doc in the corpus

In [None]:
def get_dominant_topics(lda_model, doc_lda, texts):
    """Tabulate the dominant topic of every document.

    Args:
        lda_model: Model exposing ``show_topic(topic_id)``, which returns
            ``(word, weight)`` pairs.
        doc_lda: Iterable with one list of ``(topic_id, contribution)`` pairs
            per document.
        texts: Sequence of the documents' token lists, indexable by the
            document position.

    Returns:
        A DataFrame with one row per document and the columns
        ``Document_No``, ``Dominant_Topic``, ``Topic_Perc_Contrib``,
        ``Keywords`` (comma-joined topic words) and ``Text``.
    """
    columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

    # Collect the rows first and build the frame once; growing a DataFrame one
    # row at a time copies it on every insertion.
    rows = []
    for i, row in enumerate(doc_lda):
        dominant_topic_id, dominant_topic_contrib_perc = max(row, key=lambda x: x[1])[:2]
        topic_keywords = ",".join([word for word, _ in lda_model.show_topic(dominant_topic_id)])
        rows.append([i, dominant_topic_id, dominant_topic_contrib_perc, topic_keywords, texts[i]])

    return pd.DataFrame(rows, columns=columns)


# Dominant topic of every English publication of the chosen author.
df_dominant_topic = get_dominant_topics(lda_model_en, doc_lda_en, data_words_en)

# Show
df_dominant_topic

## Comparing a new paper to the author's corpus

In [None]:
def compare_single_publication(random_single_publication, model_to_compare, lemmatization_model, stop_words, distance):
    """Compare the topics of one publication with those of another LDA model.

    An LDA model is trained on ``random_single_publication`` (using the
    module-level ``num_topics`` and ``num_words``) and diffed against
    ``model_to_compare`` with the requested distance.

    Args:
        random_single_publication: DataFrame holding the publication(s) to compare.
        model_to_compare: LDA model the new model is diffed against.
        lemmatization_model: Name of the spaCy pipeline used for preprocessing.
        stop_words: Words removed during preprocessing.
        distance: Distance name passed to the model's ``diff`` method.

    Returns:
        ``(mdiff, annotation, mean_of_mdiff)``, or ``(None, None, None)`` when
        no model could be trained for the publication.
    """
    publication_model = create_lda_for_publications(
        random_single_publication, lemmatization_model, stop_words, num_topics, num_words
    )[0]

    if publication_model is None:
        print('No model')
        return None, None, None

    if DEBUG:
        pprint(publication_model.print_topics(num_topics=num_topics, num_words=num_words))
        pprint(model_to_compare.print_topics(num_topics=num_topics, num_words=num_words))

    topic_distances, annotation = publication_model.diff(
        model_to_compare, distance=distance, num_words=num_words, n_ann_terms=num_words
    )
    if DEBUG:
        print(topic_distances)
        print(annotation)
    return topic_distances, annotation, np.mean(topic_distances)

def plot_difference(mdiff, title=""):
    """Draw a topic-difference matrix as a 10x10 inch heat map.

    Args:
        mdiff: 2-D matrix of topic distances; colours span the fixed range
            0.75 to 1.
        title: Title shown above the plot.
    """
    figure_and_axes = plt.subplots(figsize=(10, 10))
    axes = figure_and_axes[1]
    heatmap = axes.imshow(mdiff, cmap='RdBu_r', origin='lower', vmin=0.75, vmax=1)
    plt.title(title)
    plt.colorbar(heatmap)

In [None]:
# 1) Held-out publication selected by title ('Indoor Positioning').
print(f"Picked random paper from same author:\nid={random_author_publication_en['id'].values[0]} {random_author_publication_en['title'].values[0]}\nfrom authors:\n{random_author_publication_en['authors'].values[0]}")
mdiff, annotation, mdiff_mean = compare_single_publication(random_author_publication_en, lda_model_en, lemmatization_model_en, stop_words_en, 'jaccard')
plot_difference(mdiff, title="Topic difference (random paper from same author) [jaccard distance]")
print(f"Mean difference: {mdiff_mean}")

# 2) Second, randomly held-out publication of the same author.
print(f"Picked other random paper from same author:\nid={other_random_author_publication_en['id'].values[0]} {other_random_author_publication_en['title'].values[0]}\nfrom authors:\n{other_random_author_publication_en['authors'].values[0]}")
mdiff, annotation, mdiff_mean = compare_single_publication(other_random_author_publication_en, lda_model_en, lemmatization_model_en, stop_words_en, 'jaccard')
plot_difference(mdiff, title="Topic difference (random paper from same author) [jaccard distance]")
print(f"Mean difference: {mdiff_mean}")

# 3) Random publication drawn from the whole English dataset.
# NOTE(review): the sample is not filtered by author, so it may belong to the
# chosen author as well.
random_single_publication_en = publications_all_en.sample(n=1)
print(f"Picked random paper from full dataset:\n{random_single_publication_en['title'].values[0]}\nfrom authors:\n{random_single_publication_en['authors'].values[0]}")
mdiff, annotation, mdiff_mean = compare_single_publication(random_single_publication_en, lda_model_en, lemmatization_model_en, stop_words_en, 'jaccard')
plot_difference(mdiff, title="Topic difference (random paper from dataset) [jaccard distance]")
print(f"Mean difference: {mdiff_mean}")


## SVM classification for LDA topic diffs for chosen author

In [None]:
def create_author_lda_dataset(publications, other_publications, sample_size):
    """Build a labelled dataset of topic-difference matrices for one author.

    Positive samples (label 1): every sampled author publication is compared
    with an LDA model trained on the remaining sampled publications.
    Negative samples (label 0): the same number of publications that are not
    by the author are compared with an LDA model trained on all sampled
    author publications. Samples for which no model could be built are
    skipped.

    The function reads the module-level ``lemmatization_model_en``,
    ``stop_words_en``, ``num_topics`` and ``num_words``.

    Args:
        publications: DataFrame with the author's publications (``id`` column).
        other_publications: DataFrame to draw the negative samples from.
        sample_size: Number of author publications to sample.

    Returns:
        ``(x, y)``: a list of flattened difference matrices and the matching
        list of labels.
    """
    author_publications = publications.sample(n=sample_size).reset_index(drop=True)
    # Exclude ALL of the author's publications from the negative pool, not
    # only the sampled ones; otherwise an unsampled publication of the same
    # author could be drawn and labelled as "not the author".
    non_author_publications = other_publications[~other_publications['id'].isin(publications['id'])]
    author_lda_model, _, _, _, _ = create_lda_for_publications(author_publications, lemmatization_model_en, stop_words_en, num_topics, num_words)
    x = []
    y = []

    # Positive samples: leave one publication out and compare it with a model
    # trained on the others.
    for i in range(len(author_publications)):
        author_publication = author_publications.iloc[[i]]
        other_author_publications = author_publications.drop(author_publication.index)
        other_author_lda_model, _, _, _, _ = create_lda_for_publications(other_author_publications, lemmatization_model_en, stop_words_en, num_topics, num_words)
        mdiff, _, _ = compare_single_publication(author_publication, other_author_lda_model, lemmatization_model_en, stop_words_en, 'jaccard')
        if mdiff is not None:
            x.append(mdiff.flatten())
            y.append(1)

    # Negative samples: one distinct foreign publication per author publication.
    for _ in range(len(author_publications)):
        if len(non_author_publications) == 0:
            # Nothing left to draw from; return what has been collected.
            break
        random_non_author_publication_en = non_author_publications.sample(n=1)
        non_author_publications = non_author_publications.drop(random_non_author_publication_en.index)
        mdiff, _, _ = compare_single_publication(random_non_author_publication_en, author_lda_model, lemmatization_model_en, stop_words_en, 'jaccard')
        if mdiff is not None:
            x.append(mdiff.flatten())
            y.append(0)

    return x, y

In [None]:
# When True, the next cell pickles the generated LDA-diff dataset under data/.
SAVE = True

In [None]:
# Build the topic-diff dataset from all of the chosen author's English
# publications (sample_size equals the number of publications).
x, y = create_author_lda_dataset(publications_en, publications_all_en, len(publications_en))

# Persist features and labels so the training cell can reload them later.
if SAVE:
    with open('data/lda_diff_chosen_author_x.pickle', 'wb') as f:
        pickle.dump(x, f)
    with open('data/lda_diff_chosen_author_y.pickle', 'wb') as f:
        pickle.dump(y, f)

In [None]:
# Reload the dataset written by the previous cell.
# NOTE: pickle files must only be loaded from trusted sources; these are the
# files this notebook wrote itself.
try:
    with open('data/lda_diff_chosen_author_x.pickle', 'rb') as f:
        x = pickle.load(f)
    with open('data/lda_diff_chosen_author_y.pickle', 'rb') as f:
        y = pickle.load(f)
except FileNotFoundError:
    # Execution continues after this message with whatever x and y are
    # currently defined in the notebook session.
    print('Run previous cell first')

# Hold out 20% of the samples for evaluation. No random_state is passed, so
# the split differs between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit the scaler on the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

# SVM classifier with scikit-learn's default settings.
clf = svm.SVC()
clf.fit(x_train, y_train)

# Apply the training-set scaling to the test split before predicting.
x_test = scaler.transform(x_test)
y_pred = clf.predict(x_test)

f1_score_result = round(f1_score(y_test, y_pred), 2)
accuracy_score_result = round(accuracy_score(y_test, y_pred), 2)
print(f"F1 score: {f1_score_result}")
print(f"Accuracy score: {accuracy_score_result}")

In [None]:
# Classify the three reference publications with the SVM trained above.
# NOTE(review): compare_single_publication returns None for mdiff when no
# model could be built, in which case mdiff.flatten() raises - confirm that
# cannot happen for these inputs.
print(f"Picked random paper from same author:\nid={random_author_publication_en['id'].values[0]} {random_author_publication_en['title'].values[0]}\nfrom authors:\n{random_author_publication_en['authors'].values[0]}")
mdiff, _, _ = compare_single_publication(random_author_publication_en, lda_model_en, lemmatization_model_en, stop_words_en, 'jaccard')
# The classifier expects one flattened, scaled diff matrix per row.
pred = clf.predict(scaler.transform(mdiff.flatten().reshape(1, -1)))
print(f"Prediction: {'' if pred[0] == 1 else 'not '}same author")

print(f"Picked other random paper from same author:\nid={other_random_author_publication_en['id'].values[0]} {other_random_author_publication_en['title'].values[0]}\nfrom authors:\n{other_random_author_publication_en['authors'].values[0]}")
mdiff, _, _ = compare_single_publication(other_random_author_publication_en, lda_model_en, lemmatization_model_en, stop_words_en, 'jaccard')
pred = clf.predict(scaler.transform(mdiff.flatten().reshape(1, -1)))
print(f"Prediction: {'' if pred[0] == 1 else 'not '}same author")

print(f"Picked random paper from full dataset:\n{random_single_publication_en['title'].values[0]}\nfrom authors:\n{random_single_publication_en['authors'].values[0]}")
mdiff, _, _ = compare_single_publication(random_single_publication_en, lda_model_en, lemmatization_model_en, stop_words_en, 'jaccard')
pred = clf.predict(scaler.transform(mdiff.flatten().reshape(1, -1)))
print(f"Prediction: {'' if pred[0] == 1 else 'not '}same author")

## SVM classification for LDA topic diffs using one classifier per author

Because this step takes a long time to run, the number of selected authors and the number of publications per author have been reduced.
This section should also be run only once and the results should be loaded from the pickle file for subsequent runs.

In [None]:
# Collect (user_id, publications) pairs for every user that has at least one
# English publication. get_publications_by_user_id prints one line per user.
users_publications = []
for user_id in users_df['id'].values:
    user_publications = get_publications_by_user_id(publications_all_en, authors_df, user_id)
    if len(user_publications) > 0:
        users_publications.append((user_id, user_publications))

print(f'Found {len(users_publications)} users with publications')

In [None]:
# Keep only the users with more than 20 English publications.
filtered_users_publications = list(filter(lambda x: len(x[1]) > 20, users_publications))
# Cell output: number of users that passed the filter.
len(filtered_users_publications)

In [None]:
# When True, the next cell writes the per-author results CSV and the combined
# dataset pickles under data/.
SAVE = True

In [None]:
# One row of scores per author.
per_author_classifier_results = pd.DataFrame(columns=['author_id', 'f1_score', 'accuracy_score'])

# Samples of all authors, accumulated for the single all-authors classifier.
all_author_x = []
all_author_y = []

for user_data in tqdm(filtered_users_publications):
    # user_data is a (user_id, publications) pair; 10 publications are sampled
    # per author.
    x, y = create_author_lda_dataset(user_data[1], publications_all_en, 10)

    # NOTE(review): the split is neither seeded nor stratified, so the class
    # balance of the small train and test sets varies between runs.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Fit the scaler on the training split only.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)

    clf = svm.SVC()
    clf.fit(x_train, y_train)

    x_test = scaler.transform(x_test)
    y_pred = clf.predict(x_test)

    f1_score_result = round(f1_score(y_test, y_pred), 2)
    accuracy_score_result = round(accuracy_score(y_test, y_pred), 2)
    # Append the scores as a new row.
    per_author_classifier_results.loc[len(per_author_classifier_results)] = [user_data[0], f1_score_result, accuracy_score_result]
    all_author_x.extend(x)
    all_author_y.extend(y)

# Persist the per-author scores and the combined dataset.
if SAVE:
    per_author_classifier_results.to_csv('data/per_author_lda_classifier_results.csv', index=False)
    with open('data/lda_diff_all_author_x.pickle', 'wb') as f:
        pickle.dump(all_author_x, f)
    with open('data/lda_diff_all_author_y.pickle', 'wb') as f:
        pickle.dump(all_author_y, f)

## SVM classification for LDA topic diffs using one classifier for all authors

In [None]:
# Reload the combined dataset written by the per-author cell.
# NOTE: pickle files must only be loaded from trusted sources; these are the
# files this notebook wrote itself.
try:
    with open('data/lda_diff_all_author_x.pickle', 'rb') as f:
        all_author_x = pickle.load(f)
    with open('data/lda_diff_all_author_y.pickle', 'rb') as f:
        all_author_y = pickle.load(f)
except FileNotFoundError:
    # Execution continues after this message with whatever all_author_x and
    # all_author_y are currently defined in the notebook session.
    print('Run previous cell first')

# Hold out 20% of the samples for evaluation (unseeded split).
x_train, x_test, y_train, y_test = train_test_split(all_author_x, all_author_y, test_size=0.2)

# Fit the scaler on the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

clf = svm.SVC()
clf.fit(x_train, y_train)

x_test = scaler.transform(x_test)
y_pred = clf.predict(x_test)

f1_score_result = round(f1_score(y_test, y_pred), 2)
accuracy_score_result = round(accuracy_score(y_test, y_pred), 2)
print(f'Classifier results for all authors: f1_score = {f1_score_result} accuracy_score = {accuracy_score_result}')
# Store the two scores as a one-row CSV (written regardless of SAVE).
all_author_classifier_results = pd.DataFrame(columns=['f1_score', 'accuracy_score'])
all_author_classifier_results.loc[len(all_author_classifier_results)] = [f1_score_result, accuracy_score_result]
all_author_classifier_results.to_csv('data/all_author_classifier_results.csv', index=False)

## Keyword extraction using KeyBERT

In [None]:
# KeyBERT keyword extractor created with its default settings.
kw_model_en = KeyBERT()
# (1, 1) restricts the keyphrases to single words.
keyphrase_ngram_range = (1, 1)
# Number of keywords requested per document (passed as top_n).
num_keywords = 20
# Keywords for every English abstract of the chosen author.
author_keywords = kw_model_en.extract_keywords(publications_en['abstract_text'].values, keyphrase_ngram_range=keyphrase_ngram_range, stop_words="english", top_n=num_keywords)
print(author_keywords)

In [None]:
# Keywords of the three reference publications: the two held-out publications
# of the author and the random publication from the full dataset.
random_author_publication_keywords = kw_model_en.extract_keywords(random_author_publication_en['abstract_text'].values, keyphrase_ngram_range=keyphrase_ngram_range, stop_words="english", top_n=num_keywords)
print(random_author_publication_keywords)
other_random_author_publication_keywords = kw_model_en.extract_keywords(other_random_author_publication_en['abstract_text'].values, keyphrase_ngram_range=keyphrase_ngram_range, stop_words="english", top_n=num_keywords)
print(other_random_author_publication_keywords)
random_single_publication_keywords = kw_model_en.extract_keywords(random_single_publication_en['abstract_text'].values, keyphrase_ngram_range=keyphrase_ngram_range, stop_words="english", top_n=num_keywords)
print(random_single_publication_keywords)

## Calculating the similarity score based on keywords and weights

We will use the formula proposed by Courtney Corley and Rada Mihalcea in "Measuring the Semantic Similarity of Texts" and modify it to use the contextualized word embeddings from BERT.
First we will compute keyword overlap between two documents and then relatedness between overlapping keywords in those documents.
Relatedness will be computed using cosine similarity of word embeddings for overlapping keywords. Further, we will use the average relatedness of all pairs of overlapping keywords.
Finally the similarity score will be calculated by multiplying the word overlap by the average relatedness.
When comparing a new publication with the author's corpus, we will compute the similarity score to each publication in the corpus and compute the average of those scores.

In [None]:
def compute_word_overlap(keywords1, keywords2):
    """Find the keywords shared by two keyword lists.

    Args:
        keywords1: First list of keywords.
        keywords2: Second list of keywords.

    Returns:
        ``(count, positions1, positions2)``: the number of distinct shared
        keywords and, for each shared keyword, its first position in
        ``keywords1`` and in ``keywords2``. Both position lists follow the
        same keyword order, so they can be zipped pairwise.
    """
    shared_keywords = set(keywords1) & set(keywords2)
    positions_in_first = []
    positions_in_second = []
    for keyword in shared_keywords:
        positions_in_first.append(keywords1.index(keyword))
        positions_in_second.append(keywords2.index(keyword))
    return len(shared_keywords), positions_in_first, positions_in_second

def compute_word_relatedness(embeddings1, embeddings2):
    """Return the cosine similarity of two embedding vectors.

    The dot product of the vectors is divided by the product of their norms;
    no special handling is applied to zero-length vectors.
    """
    norm_product = np.linalg.norm(embeddings1) * np.linalg.norm(embeddings2)
    return np.dot(embeddings1, embeddings2) / norm_product

def compute_doc_similarity(doc1, doc2, kw_model, keyphrase_ngram_range, stop_words):
    """Score the similarity of two documents from their shared keywords.

    The score is the number of keywords both documents share multiplied by
    the mean cosine similarity between the two embeddings of each shared
    keyword. The number of keywords extracted per document is the
    module-level ``num_keywords``.

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.
        kw_model: KeyBERT model providing ``extract_keywords`` and
            ``extract_embeddings``.
        keyphrase_ngram_range: N-gram range passed to the keyword model.
        stop_words: Stop word setting passed to the keyword model.

    Returns:
        The similarity score, or ``0`` when the documents share no keyword.
    """
    keywords1 = kw_model.extract_keywords(doc1, keyphrase_ngram_range=keyphrase_ngram_range, stop_words=stop_words, top_n=num_keywords)
    keywords2 = kw_model.extract_keywords(doc2, keyphrase_ngram_range=keyphrase_ngram_range, stop_words=stop_words, top_n=num_keywords)
    keywords1 = [k[0] for k in keywords1]
    keywords2 = [k[0] for k in keywords2]

    # Check the overlap first: without shared keywords the score is 0 and the
    # embedding passes below (two extra model calls per pair) are unnecessary.
    word_overlap, overlap_ids1, overlap_ids2 = compute_word_overlap(keywords1, keywords2)
    if word_overlap == 0:
        return 0

    # presumably the word embeddings come back in the order of `candidates`,
    # which the index lookups below rely on - TODO confirm against KeyBERT
    _, embeddings1 = kw_model.extract_embeddings(doc1, candidates=keywords1, keyphrase_ngram_range=keyphrase_ngram_range, stop_words=stop_words)
    _, embeddings2 = kw_model.extract_embeddings(doc2, candidates=keywords2, keyphrase_ngram_range=keyphrase_ngram_range, stop_words=stop_words)
    word_relatedness_scores = [
        compute_word_relatedness(embeddings1[i], embeddings2[j])
        for i, j in zip(overlap_ids1, overlap_ids2)
    ]
    word_relatedness = np.mean(word_relatedness_scores)
    return word_overlap * word_relatedness

def compute_publication_similarity_with_author(publication, author_publications, kw_model, keyphrase_ngram_range, stop_words):
    """Average the similarity of one publication to an author's publications.

    Args:
        publication: DataFrame whose first row holds the publication to score
            (``abstract_text`` column).
        author_publications: DataFrame with the author's publications
            (``abstract_text`` column).
        kw_model: KeyBERT model used for keyword extraction and embeddings.
        keyphrase_ngram_range: N-gram range passed to the keyword model.
        stop_words: Stop word setting passed to the keyword model.

    Returns:
        The mean of the pairwise document similarity scores.
    """
    publication_text = publication['abstract_text'].values[0]
    author_texts = author_publications['abstract_text'].values
    pairwise_scores = [
        compute_doc_similarity(publication_text, author_text, kw_model, keyphrase_ngram_range, stop_words)
        for author_text in author_texts
    ]
    return np.mean(pairwise_scores)

In [None]:
# Keyword-based similarity of each reference publication to the author's corpus.
similarity = compute_publication_similarity_with_author(random_author_publication_en, publications_en, kw_model_en, keyphrase_ngram_range, "english")
print(f'Similarity for publication {random_author_publication_en["id"].values[0]} with title {random_author_publication_en["title"].values[0]} = {similarity}')
similarity = compute_publication_similarity_with_author(other_random_author_publication_en, publications_en, kw_model_en, keyphrase_ngram_range, "english")
print(f'Similarity for publication {other_random_author_publication_en["id"].values[0]} with title {other_random_author_publication_en["title"].values[0]} = {similarity}')
similarity = compute_publication_similarity_with_author(random_single_publication_en, publications_en, kw_model_en, keyphrase_ngram_range, "english")
print(f'Similarity for publication {random_single_publication_en["id"].values[0]} with title {random_single_publication_en["title"].values[0]} = {similarity}')

## SVM classification of similarity scores computed for chosen author

In [None]:
def create_author_keyword_dataset(publications, other_publications, sample_size):
    """Build a labelled dataset of keyword-similarity scores for one author.

    Positive samples (label 1): the similarity of every sampled author
    publication to the remaining sampled publications. Negative samples
    (label 0): the similarity of the same number of publications that are not
    by the author to all sampled author publications.

    The function reads the module-level ``kw_model_en`` and
    ``keyphrase_ngram_range``.

    Args:
        publications: DataFrame with the author's publications (``id`` and
            ``abstract_text`` columns).
        other_publications: DataFrame to draw the negative samples from.
        sample_size: Number of author publications to sample.

    Returns:
        ``(x, y)``: a list of similarity scores and the matching list of labels.
    """
    author_publications = publications.sample(n=sample_size).reset_index(drop=True)
    # Exclude ALL of the author's publications from the negative pool, not
    # only the sampled ones; otherwise an unsampled publication of the same
    # author could be drawn and labelled as "not the author".
    non_author_publications = other_publications[~other_publications['id'].isin(publications['id'])]
    x = []
    y = []

    # Positive samples: leave one publication out and score it against the others.
    for i in range(len(author_publications)):
        author_publication = author_publications.iloc[[i]]
        other_author_publications = author_publications.drop(author_publication.index)
        similarity = compute_publication_similarity_with_author(author_publication, other_author_publications, kw_model_en, keyphrase_ngram_range, "english")
        x.append(similarity)
        y.append(1)

    # Negative samples: one distinct foreign publication per author publication.
    for _ in range(len(author_publications)):
        if len(non_author_publications) == 0:
            # Nothing left to draw from; return what has been collected.
            break
        random_non_author_publication_en = non_author_publications.sample(n=1)
        # Remove the drawn publication so it cannot be sampled twice, matching
        # create_author_lda_dataset.
        non_author_publications = non_author_publications.drop(random_non_author_publication_en.index)
        similarity = compute_publication_similarity_with_author(random_non_author_publication_en, author_publications, kw_model_en, keyphrase_ngram_range, "english")
        x.append(similarity)
        y.append(0)

    return x, y

In [None]:
# When True, the next cell pickles the keyword-similarity dataset under data/.
SAVE = True

In [None]:
# Build the keyword-similarity dataset from all of the chosen author's
# English publications.
x, y = create_author_keyword_dataset(publications_en, publications_all_en, len(publications_en))
# Each sample is a single score; reshape to the (n_samples, 1) layout that
# the scaler and classifier take.
x = np.array(x).reshape(-1, 1)

# Persist features and labels so the training cell can reload them later.
if SAVE:
    with open('data/keyword_similarity_chosen_author_x.pickle', 'wb') as f:
        pickle.dump(x, f)
    with open('data/keyword_similarity_chosen_author_y.pickle', 'wb') as f:
        pickle.dump(y, f)

In [None]:
# Reload the dataset written by the previous cell.
# NOTE: pickle files must only be loaded from trusted sources; these are the
# files this notebook wrote itself.
try:
    with open('data/keyword_similarity_chosen_author_x.pickle', 'rb') as f:
        x = pickle.load(f)
    with open('data/keyword_similarity_chosen_author_y.pickle', 'rb') as f:
        y = pickle.load(f)
except FileNotFoundError:
    # Execution continues after this message with whatever x and y are
    # currently defined in the notebook session.
    print('Run previous cell first')

# Hold out 20% of the samples for evaluation (unseeded split).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit the scaler on the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

clf = svm.SVC()
clf.fit(x_train, y_train)

x_test = scaler.transform(x_test)
y_pred = clf.predict(x_test)

f1_score_result = round(f1_score(y_test, y_pred), 2)
accuracy_score_result = round(accuracy_score(y_test, y_pred), 2)
print(f"F1 score: {f1_score_result}")
print(f"Accuracy score: {accuracy_score_result}")

In [None]:
# Classify the three reference publications with the keyword-similarity SVM.
print(f"Picked random paper from same author:\nid={random_author_publication_en['id'].values[0]} {random_author_publication_en['title'].values[0]}\nfrom authors:\n{random_author_publication_en['authors'].values[0]}")
similarity = compute_publication_similarity_with_author(random_author_publication_en, publications_en, kw_model_en, keyphrase_ngram_range, "english")
# A single score is reshaped to one row with one feature before scaling.
pred = clf.predict(scaler.transform(np.array(similarity).reshape(1, -1)))
print(f"Prediction: {'' if pred[0] == 1 else 'not '}same author")

print(f"Picked other random paper from same author:\nid={other_random_author_publication_en['id'].values[0]} {other_random_author_publication_en['title'].values[0]}\nfrom authors:\n{other_random_author_publication_en['authors'].values[0]}")
similarity = compute_publication_similarity_with_author(other_random_author_publication_en, publications_en, kw_model_en, keyphrase_ngram_range, "english")
pred = clf.predict(scaler.transform(np.array(similarity).reshape(1, -1)))
print(f"Prediction: {'' if pred[0] == 1 else 'not '}same author")

print(f"Picked random paper from full dataset:\n{random_single_publication_en['title'].values[0]}\nfrom authors:\n{random_single_publication_en['authors'].values[0]}")
similarity = compute_publication_similarity_with_author(random_single_publication_en, publications_en, kw_model_en, keyphrase_ngram_range, "english")
pred = clf.predict(scaler.transform(np.array(similarity).reshape(1, -1)))
print(f"Prediction: {'' if pred[0] == 1 else 'not '}same author")