## T-SNE plots for representing the embedding space
##### This notebook generates dense vector embeddings for academic papers using the SPECTER2 model. The embeddings capture semantic similarity between papers based on their titles and abstracts, enabling downstream tasks and analysis.

##### This notebook outlines the code for visualizing the dense vector embeddings for academic papers generated using the SPECTER2 model. The embeddings capture semantic similarity between papers based on their titles and abstracts, enabling downstream tasks and analysis. We include the t-sne plots for the top 5 most cited topics in 10% of each of the domains, i.e., AI, physics and psychology

**Model**: SPECTER2 - A transformer-based model from Allen AI, specifically trained on scientific documents for document-level representation learning.

**Input:**

- S2_papers_cleaned.db - Cleaned Semantic Scholar papers with titles and abstracts (from clean_and_merge_dbs.ipynb) or downloadable from Hugging Face Hub: lalit3c/S2_CS_PHY_PYSCH_papers
- all embedding files, denoted as embeddings_< number >.db - downloadable from Hugging Face Hub: lalit3c/S2_CS_PHY_PYSCH_papers/embeddings

**Output**:
plots for top 5 most cited topics in AI, physics and psychology

#### 1. Set up the environment and import the required packages

In [1]:
#download the packages

import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import colors
import openTSNE

#### 2. Connect to the S2_papers_cleaned.db database and the respective embedding files

In [2]:
# connect to the database

dbfile = 'S2_papers_cleaned.db'
# Create a SQL connection to our SQLite database

con = duckdb.connect(dbfile)
df_papers = con.execute("SELECT * FROM papers_with_abstracts").fetchdf()
print("Shape for the database:", df_papers.shape)

Shape for the database: (2356819, 10)


In [3]:
# Create a SQL connection to the embedding files 

dbfile = 'data/embeddings/embeddings_1.db'
db_file_2 = 'data/embeddings/embeddings_2.db'
db_file_3 = 'data/embeddings/embeddings_3.db'



con = duckdb.connect(dbfile)
df = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

con = duckdb.connect(db_file_2)
df_2 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()


con = duckdb.connect(db_file_3)
df_3 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()


In [4]:
db_file_4 = 'data/embeddings/embeddings_4.db'
con = duckdb.connect(db_file_4)
df_4 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

In [5]:
db_file_5 = 'data/embeddings/embeddings_5.db'
con = duckdb.connect(db_file_5)
df_5 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

# db_file_6 = 'data/embeddings/embeddings_6.db'
# con = duckdb.connect(db_file_6)
# df_6 = con.execute("SELECT * FROM embeddings").fetchdf()
# con.close()

db_file_7 = 'data/embeddings/embeddings_7.db'
con = duckdb.connect(db_file_7)
df_7 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_8 = 'data/embeddings/embeddings_8.db'
con = duckdb.connect(db_file_8)
df_8 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_9 = 'data/embeddings/embeddings_9.db'
con = duckdb.connect(db_file_9)
df_9 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_10 = 'data/embeddings/embeddings_10.db'
con = duckdb.connect(db_file_10)
df_10 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_11 = 'data/embeddings/embeddings_11.db'
con = duckdb.connect(db_file_11)
df_11 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_12 = 'data/embeddings/embeddings_12.db'
con = duckdb.connect(db_file_12)
df_12 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_13 = 'data/embeddings/embeddings_13.db'
con = duckdb.connect(db_file_13)
df_13 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_14 = 'data/embeddings/embeddings_14.db'
con = duckdb.connect(db_file_14)
df_14 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_15 = 'data/embeddings/embeddings_15.db'
con = duckdb.connect(db_file_15)
df_15 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_16 = 'data/embeddings/embeddings_16.db'
con = duckdb.connect(db_file_16)
df_16 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_17 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_17)
df_17 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_18 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_18)
df_18 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_19 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_19)
df_19 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_20 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_20)
df_20 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_21 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_21)
df_21 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_22 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_22)
df_22 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_23 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_23)
df_23 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_24 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_24)
df_24 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_25 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_25)
df_25 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_26 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_26)
df_26 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_27 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_27)
df_27 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_28 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_28)
df_28 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_29 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_29)
df_29 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

db_file_30 = 'data/embeddings/embeddings_17.db'
con = duckdb.connect(db_file_30)
df_30 = con.execute("SELECT * FROM embeddings").fetchdf()
con.close()

##### 3. Separate out papers from the database, and merge with embeddings

In [6]:
# seperate out papers from the database according to the field 

is_ai = df_papers[df_papers['is_ai'] == True]
is_psych = df_papers[df_papers['is_psych'] == True]
is_physics = df_papers[df_papers['is_physics'] == True]

In [7]:
# arrange in descending order of citation count

is_ai = is_ai.sort_values("citation_count", ascending = False).drop(columns = ['is_psych', "is_physics"])
is_psych = is_psych.sort_values("citation_count", ascending = False).drop(columns = ['is_ai', 'is_physics'])
is_physics = is_physics.sort_values("citation_count", ascending = False).drop(columns = ['is_ai', 'is_psych'])

In [9]:
# extract 10% of the top cited papers

is_ai = is_ai.head(int(len(is_ai)*0.1))
is_ai.shape

(5365, 8)

In [10]:
is_ai = is_ai.sort_values("citation_count", ascending = False)
is_ai.head()

Unnamed: 0,corpusid,title,abstract,publication_date,citation_count,influential_citation_count,primary_topic,is_ai
82332,206594692,Deep Residual Learning for Image Recognition,Deeper neural networks are more difficult to t...,2015-12-10,212958,31078,Advanced Neural Network Applications,True
1825790,14124313,Very Deep Convolutional Networks for Large-Sca...,In this work we investigate the effect of the ...,2014-09-04,107085,14296,Advanced Vision and Imaging,True
2346053,10328909,Faster R-CNN: Towards Real-Time Object Detecti...,State-of-the-art object detection networks dep...,2015-06-04,68158,9460,Advanced Neural Network Applications,True
1754328,225039882,An Image is Worth 16x16 Words: Transformers fo...,While the Transformer architecture has become ...,2020-10-22,52559,6208,Advanced Neural Network Applications,True
1651958,206592484,Going deeper with convolutions,We propose a deep convolutional neural network...,2014-09-16,45792,4358,Advanced Neural Network Applications,True


In [11]:
# group by the topics

top_ai = is_ai.groupby('primary_topic').sum()


In [46]:
top_ai = top_ai.sort_values("citation_count", ascending = False)
top_ai = top_ai.head()
top_ai.shape

(5, 7)

In [13]:
top_ai

Unnamed: 0_level_0,corpusid,title,abstract,publication_date,citation_count,influential_citation_count,is_ai
primary_topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AI in Service Interactions,6288987120487198722881122862882674204427106216...,Brave new world: service robots in the frontli...,\nPurpose\nThe service sector is at an inflect...,2018-09-262019-10-102020-11-042017-02-012019-0...,16514,876,30
AI in cancer detection,1411087527391290131593015487974237132240070756...,Applications of Machine Learning in Cancer Pre...,Machine learning is a branch of artificial int...,2006-01-012016-01-012017-09-042014-05-062016-0...,34291,2086,58
AI-based Problem Solving and Planning,6552475173051914873271100257537065964571404691...,The FF Planning System: Fast Plan Generation T...,We describe and evaluate the algorithmic techn...,2011-06-032006-05-012003-12-012000-08-012010-0...,11906,1374,16
Advanced Clustering Algorithms Research,2810845623137218597833734610919618954660672248...,Cluster AnalysisLearning Deconvolution Network...,ەی اكتره . Abstract This study was conducted w...,2002-07-052015-05-172020-04-202016-02-122019-0...,24899,1519,28
Advanced Computational Techniques and Applications,14239522,Hierarchical text classification and evaluation,Hierarchical classification refers to the assi...,2001-11-29,465,26,1
...,...,...,...,...,...,...,...
Video Analysis and Summarization,2585642642585883062272277481522824739077918192...,ImageBind One Embedding Space to Bind Them All...,"We present ImageBind, an approach to learn a j...",2023-05-092023-05-102020-11-302019-05-122004-0...,6847,986,13
Video Surveillance and Tracking Methods,5029465378407745062376216034699522558401495816...,The Cityscapes Dataset for Semantic Urban Scen...,Visual understanding of complex urban street s...,2016-04-062014-04-292017-03-212000-08-012016-0...,143003,20471,181
Virtual Reality Applications and Impacts,1873012314986536915531941532161082321196142372...,Virtual Reality: How Much Immersion Is Enough?...,Solid evidence of virtual reality's benefits h...,2007-07-012018-12-262010-05-122018-11-062021-0...,17176,984,33
Visual Attention and Saliency Detection,9118439117448525293879719499342441178626717725...,Res2Net: A New Multi-Scale Backbone Architectu...,Representing features at multiple scales is of...,2019-04-022010-06-132018-10-082016-06-012021-1...,66864,8299,92


In [14]:
top_topics = list(top_ai.index)

In [15]:
# filter out papers that are within these topics only from the top 10% of highly cited papers

top_papers = is_ai.loc[is_ai['primary_topic'].isin(top_topics)]
top_papers.shape

(5365, 8)

#### 4. Merge the embeddings with the abstracts

In [17]:
# concatenate the embedding dataframes row-wise (just to bring them together)

df_all = pd.concat([df, df_2, df_3, df_4, df_5, df_7, df_8, df_9, df_10, df_11, df_12, df_13, df_14,
                df_15, df_16, df_17, df_18, df_19, df_20, df_21, df_22, df_23,
               df_24, df_25, df_26, df_27, df_28, df_29, df_30], axis = 0)
df_all.shape

(2900000, 2)

In [19]:
# merge the embeddings and the paper details based on corpusID to get the titles and other details

merged_df = df_all.merge(top_papers, on='corpusid', how="inner")
merged_df.shape

(16290, 9)

In [20]:
merged_df.head()

Unnamed: 0,corpusid,embedding,title,abstract,publication_date,citation_count,influential_citation_count,primary_topic,is_ai
0,10004443,"[-0.060397398, 0.8711522, -0.71880436, -0.1621...",Learning Topic Models -- Going beyond SVD,Topic Modeling is an approach used for automat...,2012-04-09,445,73,Topic Modeling,True
1,10023884,"[-0.12287163, 0.4946351, -0.7267563, -0.237625...",A hierarchical method to automatically encode ...,BackgroundThe accumulation of medical document...,2016-03-03,45,2,Topic Modeling,True
2,10027965,"[0.1910211, 0.12633817, -0.21880807, -0.943105...",Dual-Glance Model for Deciphering Social Relat...,"Since the beginning of early civilizations, so...",2017-08-02,90,25,Multimodal Machine Learning Applications,True
3,10028211,"[0.098660916, 0.2287176, -0.3126847, -0.347239...",Jointly optimizing word representations for le...,"We introduce C-PHRASE, a distributional semant...",2015-07-01,48,8,Topic Modeling,True
4,1003611,"[-0.47621238, -0.19502091, -0.64749724, -0.526...",A Relational Model of Semantic Similarity betw...,Semantic similarity is a central concept that ...,2009-08-06,56,2,Topic Modeling,True


In [21]:
# turn the embeddings into an array of (n_smaples, n_features)

k = []
for i in range(len(merged_df['embedding'])):
  k.append(np.array(merged_df['embedding'][i]))
k = np.array(k)
k.shape

(16290, 768)

In [22]:
# fit the TSNE plot and display

tsne = openTSNE.TSNE(
    perplexity=30,metric="euclidean", n_jobs=-1, random_state=42, verbose = 1
)

emb = tsne.fit(k)

--------------------------------------------------------------------------------
TSNE(early_exaggeration=12, n_jobs=-1, random_state=42, verbose=1)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 7.54 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 1.44 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 0.37 seconds
===> Running optimization with exaggeration=12.00, lr=1357.50 for 250 iterations...
Iteration   50, KL divergence 4.6273, 50 iterations in 1.2804 sec
Iteration  100, KL divergence 4.6275, 50 iterations in 1.4187 sec
Iteration  150, KL divergence 4.6044, 50 iterations in 1.3904 sec
Iteration  200, KL divergence 4.5958, 50 iterations in 1.3864 sec
Iteration  250, KL divergence 4.5927, 50 iterations in 1.3921 sec
   --> Time elapsed: 6.87 seconds
===> Running optimization with exaggeration=1.0

In [23]:
labels = merged_df['primary_topic']
labels

0                                  Topic Modeling
1                                  Topic Modeling
2        Multimodal Machine Learning Applications
3                                  Topic Modeling
4                                  Topic Modeling
                           ...                   
16285                 Advanced Vision and Imaging
16286                              Topic Modeling
16287                              Topic Modeling
16288                              Topic Modeling
16289        Advanced Neural Network Applications
Name: primary_topic, Length: 16290, dtype: str

In [37]:
df = merged_df.assign(label=labels_clean)

# ensure datetime
df["date"] = pd.to_datetime(df["publication_date"], errors="coerce")

# numeric time (seconds since epoch)
time_num = df["date"].astype("int64") / 1e9

t_min, t_max = time_num.min(), time_num.max()

# normalize to [0, 1]
t_norm = (time_num - t_min) / (t_max - t_min)


In [38]:
# older → larger
marker_size = 4 + 10 * (1 - t_norm)

# older → fainter
marker_opacity = 0.9 - 0.6 * (1 - t_norm)


In [39]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

labels_clean = (
    pd.Series(labels)
    .astype(str)
    .str.strip()
    .str.lower()
)

df = merged_df.assign(label=labels_clean)

# parse date column
df["date"] = pd.to_datetime(df["publication_date"], errors="coerce")

# normalize time
time_num = df["date"].astype("int64") / 1e9
t_min, t_max = time_num.min(), time_num.max()
t_norm = (time_num - t_min) / (t_max - t_min)

# older → larger & fainter
sizes = 6 + 14 * (1 - t_norm)
opacities = 0.85 - 0.55 * (1 - t_norm)

fig_tsne = go.Figure()

for lab, group in df.groupby("label"):
    idx = group.index

    fig_tsne.add_trace(
        go.Scatter(
            x=emb[idx, 0],
            y=emb[idx, 1],
            mode="markers",
            name=lab,  # legend entry
            marker=dict(
                size=sizes[idx],
                opacity=opacities[idx],
            ),
            text=group["title"],
            customdata=group["date"],
            hovertemplate=(
                "<b>%{text}</b><br>"
                "t-SNE1: %{x:.2f}<br>"
                "t-SNE2: %{y:.2f}<br>"
                "year: %{customdata|%Y}<br>"
                f"topic: {lab}"
                "<extra></extra>"
            ),
        )
    )


In [43]:
fig_tsne.update_layout(
    autosize=False,
    width=1000,
    height=600,
    title="t-SNE: Top 5 most cited topics in AI in the embedding space.<br> <sup> (Marker size decreases with increasing year of publication) </sup>",
    xaxis_title="t-SNE1",
    yaxis_title="t-SNE2",
    legend_title="Topic",
)


: 

: 