# Abstract Encoder
This notebook is part of the Streamlit application contained in this directory; the results of this investigation are displayed in the application. The goal of this notebook is to investigate document embeddings using two pretrained models from Hugging Face and to compare the results. After computing the document embeddings, we apply Principal Component Analysis (PCA) to give the user an idea of the different topic clusters the person has worked in. No clustering will be performed.

In [81]:
#import streamlit as st
from pubmed_crawler import SinglePubMedSearcher
import os
import json
import pandas as pd
import plotly.express as px
from collections import Counter
import torch
from transformers import AutoTokenizer, AutoModel
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import numpy as np
from pprint import pprint

## IDA and EDA
In the next part we will import the scraped data and also additional information to evaluate the model and build the final plotly graph.

In [82]:
# Folder name of the researcher whose scraped results are loaded.
name = 'Menon_Anil'
# Machine-specific absolute directory that holds the processed paper files.
dir_name = f'/home/tom-ruge/Schreibtisch/Fachhochschule/Semester_2/Social Media Analytics/StalkYourProf/results/{name}/processed/'
# Base URL to which a PMID is appended to build the link to a paper.
pubmed_endpoint = 'https://pubmed.ncbi.nlm.nih.gov/'

def get_paper(dir_name):
    """Return the paths of all entries in ``dir_name``, sorted by name.

    ``os.listdir`` yields entries in arbitrary order, so the names are sorted
    to make the result (and any index-based lookup built on it) reproducible
    across runs and machines.

    Args:
        dir_name: Directory to list.

    Returns:
        list[str]: ``dir_name`` joined with each entry name, in sorted order.

    Raises:
        OSError: If ``dir_name`` cannot be listed (propagated from
            ``os.listdir``).
    """
    return [os.path.join(dir_name, file_name) for file_name in sorted(os.listdir(dir_name))]

def get_number_of_papers(dir_name):
    """Count the entries found directly inside ``dir_name``.

    Args:
        dir_name: Directory whose entries are counted.

    Returns:
        int: Number of names returned by ``os.listdir(dir_name)``.
    """
    entries = os.listdir(dir_name)
    return len(entries)

def get_paper_data(file_names):
    """Load every JSON file in ``file_names`` and return the parsed contents.

    Args:
        file_names: Iterable of paths to JSON files.

    Returns:
        list: One parsed JSON value per file, in the order of ``file_names``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    results = []
    for file_name in file_names:
        # Read as UTF-8 explicitly: the platform default encoding is
        # locale-dependent and can garble or reject non-ASCII characters.
        with open(file_name, 'r', encoding='utf-8') as file:
            results.append(json.load(file))
    return results

def get_abstracts_pmid(paper_data, pubmed_endpoint):
    """Extract one text and one PubMed URL per paper, as parallel lists.

    The text is read from the ``'TI'`` field and the identifier from the
    first element of the ``'PMID'`` field. A record is only used when both
    fields can be read, so the two returned lists always have the same length
    and the same order.

    Args:
        paper_data: Iterable of dict-like paper records.
        pubmed_endpoint: Base URL that the PMID is appended to.

    Returns:
        tuple[list[str], list[str]]: ``(abstracts, pmids)`` where
        ``pmids[i]`` is ``pubmed_endpoint + <pmid> + '/'`` for the paper whose
        text is ``abstracts[i]``.
    """
    abstracts = []
    pmids = []
    for data in paper_data:
        # Read both fields before touching either output list; appending the
        # text first would leave the lists misaligned when 'PMID' is missing.
        try:
            text_field = data['TI']
            pmid = list(data['PMID'])[0]
        except (KeyError, IndexError):
            # Record lacks a text or a PMID: skip it entirely.
            continue

        # Keep exactly one text per paper: a plain string is used as is, a
        # list of parts is joined (a one-element list yields that element).
        if isinstance(text_field, str):
            text = text_field
        else:
            text = ' '.join(text_field)

        abstracts.append(text)
        pmids.append(pubmed_endpoint + pmid + '/')
    return abstracts, pmids

# Read every processed paper file and extract the texts with their PubMed links.
file_names = get_paper(dir_name)
paper_data = get_paper_data(file_names)
abstracts, pmids = get_abstracts_pmid(paper_data, pubmed_endpoint)

# Compare the number of files on disk with the number of extracted texts.
print(f'Number of papers: {get_number_of_papers(dir_name)}')
print(f'Number of abstracts: {len(abstracts)}')

Number of papers: 25
Number of abstracts: 25


Not all papers on PubMed seem to contain abstracts. Most of them do, however, since the abstract is the most important piece of information when searching for papers that fit your needs. Next, we will look at some abstracts to get an idea of what the extracted abstracts look like:

In [83]:
# Show the entries at positions 0 and 10 together with their PubMed links,
# separated by a dashed line.
for position, sample_index in enumerate((0, 10)):
    if position > 0:
        print('-----------')
    print(pmids[sample_index], ': ')
    pprint(abstracts[sample_index])

https://pubmed.ncbi.nlm.nih.gov/23869315/ : 
"Thymic hyperplasia in Graves' disease."
-----------
https://pubmed.ncbi.nlm.nih.gov/37457519/ : 
('Coexistence of human immunodeficiency virus infection and autoimmune '
 'hepatitis: A double trouble.')


The papers are structured differently. The first displayed abstract does not contain the structure indicated by headings such as 'BACKGROUND & AIMS', etc., but the abstracts are approximately equally long. I do not have the domain knowledge to investigate the content further. In the next step we will investigate the number of words in each abstract.

In [84]:
def plot_word_counts(abstracts):
    """Display a histogram of the number of words in each text.

    A word is any whitespace-separated token, i.e. the count for a text is
    ``len(text.split())``.

    Args:
        abstracts: Iterable of strings to analyse.

    Returns:
        None. The figure is shown via ``fig.show()``.
    """
    counts_per_text = [len(text.split()) for text in abstracts]

    histogram = px.histogram(
        x=counts_per_text,
        title='Word Count Distribution of Abstracts',
        labels={'x': 'Word Count', 'y': 'Frequency'},
        color_discrete_sequence=['blue'],
    )

    # Force white backgrounds (avoids a black background) and draw light-gray
    # grid lines on both axes.
    grid_style = dict(showgrid=True, gridcolor='lightgray')
    histogram.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        title_font=dict(size=20),
        xaxis=dict(grid_style),
        yaxis=dict(grid_style),
    )

    histogram.show()

# Plot the word-count histogram for the texts extracted from the paper files
plot_word_counts(abstracts)


Most abstracts contain around 200 to 300 words, but some contain only very few words. In the next step we will load the BioBERT model:

In [85]:
# Load the pretrained BioBERT tokenizer and model. `tokenizer` and `model` are
# module-level names that `get_embeddings` reads directly.
model_name = "dmis-lab/biobert-v1.1"  # identifier of the pretrained BioBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)  # tokenizer for that model
model = AutoModel.from_pretrained(model_name)  # encoder loaded from the same identifier

In [86]:
# # Function to get embeddings from a list of documents
# def get_embeddings(documents):
#     inputs = tokenizer(documents, return_tensors='pt', padding=True, truncation=True)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state[:, 0, :].numpy()

# # Sample biomedical texts (abstracts)
# documents = abstracts

# # Get embeddings
# embeddings = get_embeddings(documents)


In [87]:
# Function to get embeddings from a list of documents
def get_embeddings(documents, batch_size=8):
    """Embed each document as the mean of its token vectors.

    Documents are tokenized and encoded batch by batch with the module-level
    ``tokenizer`` and ``model``. The token vectors of each document are
    averaged using the attention mask, so padding positions do not
    contribute. A plain mean over the sequence axis would include them and
    make a document's embedding depend on how much padding the longest
    document in its batch forces on it.

    Args:
        documents: Sequence of strings; must support ``len`` and slicing.
        batch_size: Number of documents encoded per forward pass.

    Returns:
        numpy.ndarray: One embedding row per document, in input order.
    """
    all_embeddings = []
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        # 1 for real tokens, 0 for padding; the trailing axis lets the mask
        # broadcast over the hidden dimension.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = ((hidden * mask).sum(dim=1) / token_counts).numpy()
        all_embeddings.extend(embeddings)
    return np.array(all_embeddings)

# Texts to embed: the list extracted from the paper files
documents = abstracts  # same list object, not a copy

# Compute one embedding vector per text
embeddings = get_embeddings(documents)

# Reduce the embeddings to two PCA components so they can be drawn in a
# 2-D scatter plot
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)


In [88]:
import plotly.express as px
import pandas as pd
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import webbrowser

def plot_embeddings_with_dash(embeddings, documents, urls):
    """Serve an interactive scatter plot of 2-D embeddings in a Dash app.

    Every point represents one row of ``embeddings``. Clicking a point opens
    the URL attached to it in a new browser tab (via ``webbrowser``, i.e. on
    the machine running the Dash server) and shows the URL below the plot.

    Args:
        embeddings: 2-D array indexed as ``embeddings[:, 0]`` and
            ``embeddings[:, 1]``; these two columns are the plot axes.
        documents: Not used by this function; kept so existing callers that
            pass it keep working.
        urls: One URL per row of ``embeddings``.

    Returns:
        None. The function starts the Dash server.
    """
    # One row per point: the two plot coordinates plus the link to open.
    df = pd.DataFrame({
        'PCA Component 1': embeddings[:, 0],
        'PCA Component 2': embeddings[:, 1],
        'URL': urls
    })

    figure = px.scatter(
        df,
        x='PCA Component 1',
        y='PCA Component 2',
        title='BioBERT Document Embeddings with Clusters',
        hover_data={'URL': True},
        labels={'PCA Component 1': 'PCA Component 1', 'PCA Component 2': 'PCA Component 2'},
        template='plotly_white'
    )
    figure.update_traces(marker=dict(size=8, color='blue'), mode='markers+text', textposition='top center')

    # Create the Dash app: the graph plus a text area reporting the last click.
    app = dash.Dash(__name__)
    app.layout = html.Div([
        dcc.Graph(
            id='scatter-plot',
            figure=figure,
            style={'width': '80%', 'height': '600px'}
        ),
        html.Div(id='output-div', style={'margin-top': '20px', 'font-size': '16px'})
    ])

    # Callback that reacts to clicks on points of the scatter plot.
    @app.callback(
        Output('output-div', 'children'),
        [Input('scatter-plot', 'clickData')]
    )
    def display_click_data(clickData):
        if clickData is None:
            return 'Click on a point to open the associated URL.'
        # 'URL' is the only hover_data column, so it is the first customdata
        # entry of the clicked point.
        url = clickData['points'][0]['customdata'][0]
        webbrowser.open_new_tab(url)
        return f'Link opened: {url}'

    # `run_server` is no longer available in Dash 3; use `run` where it
    # exists and fall back to `run_server` on older releases that lack it.
    run_app = getattr(app, 'run', None) or app.run_server
    run_app(debug=True)

# Launch the interactive plot with the 2-D PCA coordinates, the embedded
# texts, and the PubMed link of each paper
plot_embeddings_with_dash(reduced_embeddings, documents, pmids)

