In [0]:
import numpy as np
import pandas as pd
import plotly.express as px
from pyspark.sql.functions import min, max, count_distinct


# Visualization

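In this notebook, we visualize a sample of arXiv papers by plotting the first two PCA components of their embeddings, colored by cluster and animated by publication year.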


We created the embeddings and the clusters in another notebook and stored the results in the Delta table `main.default.ARXIV_ANALYSIS`.

In [0]:
# Load the table with the precomputed PCA features and cluster assignments.
df = spark.read.table("main.default.ARXIV_ANALYSIS")

# Compute all three summary statistics in a single aggregation, so the table is
# scanned by one Spark job instead of three. Note that `min` and `max` here are
# the pyspark.sql.functions versions imported at the top, not the Python builtins.
summary = df.agg(
    min('year').alias('minimum_year'),
    max('year').alias('maximum_year'),
    count_distinct('prediction').alias('cluster_count'),
).first()
year_min = summary.minimum_year
year_max = summary.maximum_year
cluster_count = summary.cluster_count

# A global min/max over zero rows yields NULL; fail here rather than in a later cell.
assert year_min is not None and year_max is not None, "No rows with a year in main.default.ARXIV_ANALYSIS"
print(f"Minimum year: {year_min}, maximum year: {year_max}. Number of clusters: {cluster_count}.")

Minimum year: 2007, maximum year: 2023. Number of clusters: 8.


In [0]:
# Dark2 is a qualitative palette; every cluster needs its own entry in it.
color_map = px.colors.qualitative.Dark2
assert cluster_count <= len(color_map), "Not enough colors!"

In [0]:
# To visualize, we draw a random sample of roughly `visualization_sample_limit`
# rows from the full table (Spark samples by fraction, so the size is approximate).
visualization_sample_limit = 2000
# Fixed seed so that re-running the notebook draws the same sample, as long as
# the table and how Spark partitions it are unchanged.
sampling_seed = 42

total_rows = df.count()
# Sampling without replacement needs a fraction in [0, 1], so use the whole table
# when it has no more rows than the limit (this also avoids dividing by zero).
# A conditional is used instead of min(): the builtin is shadowed by the
# pyspark.sql.functions import at the top of the notebook.
if total_rows > visualization_sample_limit:
    fraction = visualization_sample_limit / total_rows
else:
    fraction = 1.0

sample_df = df.sample(fraction=fraction, seed=sampling_seed)
sample_df.count()

1996

In [0]:
# Bring the sampled rows to the driver and reshape them into a pandas DataFrame
# with one row per paper: title, cluster label, the first two PCA components, year.
sample_rows = sample_df.collect()

# One PCA vector per row, stacked into a 2-D array (rows x components).
pca_features = np.asarray([row.pca_features for row in sample_rows])
# An empty sample, or vectors with fewer than two components, would otherwise
# fail below with an opaque indexing error.
assert pca_features.ndim == 2 and pca_features.shape[1] >= 2, \
    "Expected a non-empty sample with at least two PCA components per row"

visualization_df = pd.DataFrame({
    'title': [row.title for row in sample_rows],
    # Cluster ids are stored as strings so plotly treats them as discrete categories.
    'cluster': [str(row.prediction) for row in sample_rows],
    'pca_feature_1': pca_features[:, 0],
    'pca_feature_2': pca_features[:, 1],
    'year': [row.year for row in sample_rows],
})

In [0]:
# Build the animation data: the frame for `display_year` contains every sampled
# paper published in or before that year, so points accumulate as the animation
# advances. Filtering the whole DataFrame once per year replaces a row-by-row
# `iterrows` loop and keeps the original column dtypes.
cumulative_frames = [
    visualization_df[visualization_df['year'] <= display_year].assign(display_year=display_year)
    for display_year in range(year_min, year_max + 1)
]
visualization_df2 = pd.concat(cumulative_frames)

In [0]:
# Order the animation frames chronologically, then preview the result.
visualization_df2 = visualization_df2.sort_values('display_year')
display(visualization_df2)

title,cluster,pca_feature_1,pca_feature_2,year,display_year
Discovery of High-Frequency QPOs in Black Hole Candidate XTE J1859+226,5.0,0.5295065459666348,0.271851714510067,2007.0,2007.0
Sums of entire functions having only real zeros,3.0,-0.5883920323583343,0.3631534243561963,2007.0,2007.0
An optimization problem with volume constrain for a degenerate  quasilinear operator,3.0,-0.5558865855245858,0.36971213429788,2007.0,2007.0
"Differential equations driven by H\""{o}lder continuous functions of  order greater than 1/2",6.0,-0.3089391556573877,0.4154543001874802,2007.0,2007.0
Landstad's characterization for full crossed products,3.0,-0.5237507769897949,0.4721965102580533,2007.0,2007.0
Metric attractors for smooth unimodal maps,7.0,-0.5425608703173785,0.3872348067602241,2007.0,2007.0
Computing the Ehrhart quasi-polynomial of a rational simplex,3.0,-0.7012234131498642,0.2922009204715765,2007.0,2007.0
Existence of Gorenstein projective resolutions,3.0,-0.6340147498010467,0.2666462368201613,2007.0,2007.0
Linear Transports along Paths in Vector Bundles. I. General Theory,3.0,-0.458430016576051,0.5245712503436206,2007.0,2007.0
On harmonic and asymptotically harmonic homogeneous spaces,3.0,-0.5988565135983388,0.4604375117805237,2007.0,2007.0


In [0]:
# Pin every cluster label to the palette entry at its own cluster id. Without an
# explicit mapping plotly assigns colours in order of first appearance, so the
# colour of a given cluster would depend on which rows were sampled.
cluster_labels = sorted(visualization_df2['cluster'].unique(), key=float)
cluster_color_map = {
    label: color_map[int(float(label)) % len(color_map)]
    for label in cluster_labels
}

# Animated scatter plot of the first two PCA components; each frame shows all
# papers published up to and including `display_year`.
fig = px.scatter(
    visualization_df2,
    x='pca_feature_1',
    y='pca_feature_2',
    color='cluster',
    hover_name='title',
    animation_frame='display_year',
    animation_group='title',
    color_discrete_sequence=color_map,
    color_discrete_map=cluster_color_map,
    # Keep the legend in cluster-id order instead of first-appearance order.
    category_orders={'cluster': cluster_labels},
    # Fixed axis ranges keep the view stable across animation frames.
    range_x=[-1, 1],
    range_y=[-1, 1],
    # Show the publication year on hover, not the frame's display year.
    hover_data={'display_year': False, 'year': True},
    title='ArXiv papers in PCA space, colored by cluster (cumulative by year)',
    labels={'pca_feature_1': 'PCA feature 1', 'pca_feature_2': 'PCA feature 2'},
)
fig.update_traces(
    marker=dict(
        size=8,
        opacity=0.6,
        line=dict(width=1, color='DarkSlateGrey'),
    ),
    selector=dict(mode='markers'),
)
fig.show()


As we can see, the different subfields are quite nicely separated in the PCA space.