Read in the dataset of papers with their keywords and authors.
Compute the similarity of a given paper to all others via the following steps:

+ Jaccard similarity of keywords
+ Jaccard similarity of authors
+ Sum these two scores (each lies in [0, 1], so the total lies in [0, 2])

Return the top *n* papers most similar to the starting paper.

In [29]:
import ast

import numpy as np
import pandas as pd

# Load the cleaned paper table; the full frame is kept for later look-ups (e.g. titles).
original_dataset = pd.read_csv("cleaned_url_data.csv", dtype={"Authors": str, "KMeansTags": object})
# Take an explicit copy of the two columns the similarity uses, so that later
# column assignments write to `dataset` itself rather than to a selection of
# `original_dataset` (which is what raises SettingWithCopyWarning).
dataset = original_dataset[["Authors", "KMeansTags"]].copy()
dataset

Unnamed: 0,Authors,KMeansTags
0,"Pablo Castro Rivadeneira, Kim Stachenfeld, Kev...","['programs', 'reward', 'rl', 'tasks', 'models']"
1,"Xiao Wang, Ibrahim Alabdulmohsin, Daniel Salz,...","['scale', 'vision', 'image', 'tasks', 'embeddi..."
2,"Simon Schmitt, John Shawe-Taylor, Hado van Has...","['neural', 'uncertainty', 'delta', 'weather', ..."
3,"Sishuai Gong, Wang Rui, Deniz Altinbüken, Pedr...","['kernel', 'snowplow', 'coverage', 'program', ..."
4,"Kevis-Kokitsi Maninis, Kaifeng Chen, Soham Gho...","['image', 'tasks', 'vision', 'video', 'scale']"
...,...,...
273,"Jason Milionis, Christos Papadimitriou, Georg...","['game', 'nash', 'games', 'equilibria', 'regret']"
274,"AJ Piergiovanni, Isaac Noble, Dahun Kim, Micha...","['video', 'audio', 'vision', 'point', 'models']"
275,"David Lindner, János Kramár, Sebastian Farquha...","['programs', 'cache', 'ai', 'users', 'reasoning']"
276,"Meredith Ringel Morris, Jascha Sohl-dickstein,...","['agi', 'ai', 'hci', 'evaluation', 'intelligen..."


In [30]:
def str_to_auth_list(s: str, delimiter = ","):
    """Split `s` on `delimiter` and return the pieces with surrounding whitespace stripped.

    An empty input string yields a one-element list containing the empty string,
    because that is what `str.split` with an explicit delimiter produces.
    """
    return [piece.strip() for piece in s.split(delimiter)]

In [31]:
# Parse both columns from their CSV string form into Python lists.
# The frame is rebuilt from the raw string columns of `original_dataset`, so
# re-running this cell does not try to split already-parsed lists, and `assign`
# returns a new frame instead of writing through a column selection.
dataset = original_dataset[["Authors", "KMeansTags"]].assign(
    Authors=lambda frame: frame["Authors"].map(str_to_auth_list),
    # KMeansTags stores the repr of a Python list, e.g. "['a', 'b']".
    # literal_eval parses that repr directly, so it also copes with tags that
    # are quoted with double quotes (which slicing off "['" / "']" and
    # splitting on "', '" would mangle).
    KMeansTags=lambda frame: frame["KMeansTags"].map(ast.literal_eval),
)
dataset["KMeansTags"]

0                [programs, reward, rl, tasks, models]
1             [scale, vision, image, tasks, embedding]
2      [neural, uncertainty, delta, weather, networks]
3            [kernel, snowplow, coverage, program, ml]
4                 [image, tasks, vision, video, scale]
                            ...                       
273            [game, nash, games, equilibria, regret]
274              [video, audio, vision, point, models]
275            [programs, cache, ai, users, reasoning]
276           [agi, ai, hci, evaluation, intelligence]
277       [comedy, ai, participants, dramatron, group]
Name: KMeansTags, Length: 278, dtype: object

In [32]:
def _jaccard(first, second):
    '''Jaccard similarity of two sets; defined as 0.0 when both sets are empty.'''
    union_size = len(first | second)
    if union_size == 0:
        # No elements on either side: treat as "no evidence of similarity"
        # instead of dividing by zero.
        return 0.0
    return len(first & second) / union_size


def n_most_similar_papers(ref_idx = 0, n = 10, data = None):
    '''
    Rank papers by similarity to a reference paper.

    The similarity of two papers is the Jaccard similarity of their author
    sets plus the Jaccard similarity of their keyword sets (range 0 to 2).

    Parameters:
        ref_idx: positional index (row number) of the reference paper.
        n: maximum number of results to return.
        data: table with an "Authors" and a "KMeansTags" column, each row
            holding a list of strings. Defaults to the notebook-level `dataset`.

    Returns:
        list of tuples: (paper_index, similarity score), highest score first.
        Ties keep row order. The reference paper itself and any paper whose
        author set and keyword set both equal the reference's are left out.

    Raises:
        AssertionError: if ref_idx is outside the table or n exceeds its size.
    '''
    if data is None:
        data = dataset  # fall back to the frame prepared earlier in the notebook

    paper_count = data.shape[0]

    assert 0 <= ref_idx < paper_count, "ref_idx must be a valid row position"
    assert n <= paper_count, "n cannot exceed the number of papers"

    # Build every set once; iterating the columns gives rows in positional order,
    # which matches how callers use the returned indices (with .iloc).
    author_sets = [set(authors) for authors in data["Authors"]]
    kw_sets = [set(keywords) for keywords in data["KMeansTags"]]
    ref_authors = author_sets[ref_idx]
    ref_kw = kw_sets[ref_idx]

    similarity_tups = []
    for paper, (authors_set, kw_set) in enumerate(zip(author_sets, kw_sets)):
        # Skip the reference paper and exact duplicates of it. Comparing the
        # sets directly replaces the earlier "score != 2" float comparison.
        if paper == ref_idx or (authors_set == ref_authors and kw_set == ref_kw):
            continue
        similarity_score = _jaccard(authors_set, ref_authors) + _jaccard(kw_set, ref_kw)
        similarity_tups.append((paper, similarity_score))

    # list.sort is stable, so equally scored papers stay in row order.
    similarity_tups.sort(key = lambda tup: tup[1], reverse=True)
    return similarity_tups[:n]

In [38]:
# Pick a reference paper and fetch its most similar papers.
ref_idx = 1
res = n_most_similar_papers(ref_idx)
# Keep only the row positions; the similarity scores are not displayed here.
indices = [i for i, _ in res]

# Single quotes inside the f-string: reusing the outer double quotes within the
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"Recommendations for paper {ref_idx} ({original_dataset.loc[ref_idx, 'Title']}):")
original_dataset.iloc[indices]

Recommendations for paper 1 (Scaling Pre-training to One Hundred Billion Data for Vision Language Models):


Unnamed: 0,URL,HTML,Title,Abstract,Publish_date,Authors,KMeansTags
4,https://deepmind.google/research/publications/...,TIPS: Text-Image Pretraining with Spatial awar...,Whisk with Veo 2 15 April 2025 ...,While image-text representation learning has b...,10 March 2025,"Kevis-Kokitsi Maninis, Kaifeng Chen, Soham Gho...","['image', 'tasks', 'vision', 'video', 'scale']"
204,http://deepmind.google/research/publications/1...,TIPS: Text-Image Pretraining with Spatial awar...,Whisk with Veo 2 15 April 2025 ...,While image-text representation learning has b...,10 March 2025,"Kevis-Kokitsi Maninis, Kaifeng Chen, Soham Gho...","['image', 'tasks', 'vision', 'video', 'scale']"
7,https://deepmind.google/research/publications/...,Are vision-language models shape or texture bi...,Are vision-language models shape or texture bi...,"Unlike traditional vision-only models, vision ...",22 January 2025,"Paul Gavrikov, Jovita Lukasik, Steffen Jung, R...","['vision', 'scale', 'image', 'visual', 'models']"
115,https://deepmind.google/research/publications/...,Unsupervised Keypoints with Stable Diffusion -...,Whisk with Veo 2 15 April 2025 ...,We present an innovative approach to infer sem...,29 November 2023,"Eric Hedlin, Gopal Sharma, Shweta Mahajan, Hos...","['image', 'video', 'tasks', 'vision', 'neural']"
121,https://deepmind.google/research/publications/...,Towards In-context Scene Understanding - Googl...,Gemini and Whisk with Veo 2 15 April 2025 ...,In-context learning—the ability to configure a...,10 December 2023,"Ivana Balazevic, David Steiner, Nikhil Parthas...","['image', 'tasks', 'models', 'vision', 'video']"
199,http://deepmind.google/research/publications/8...,Are vision-language models shape or texture bi...,Are vision-language models shape or texture bi...,"Unlike traditional vision-only models, vision ...",22 January 2025,"Paul Gavrikov, Jovita Lukasik, Steffen Jung, R...","['vision', 'scale', 'image', 'visual', 'models']"
122,https://deepmind.google/research/publications/...,SODA: Bottleneck Diffusion Models for Represen...,Veo 2 15 April 2025 SODA: Bo...,"We introduce SODA, a self-supervised diffusion...",29 November 2023,"Drew A. Hudson, Daniel Zoran, Mateusz Malinows...","['image', 'neural', 'scene', 'video', 'tasks']"
134,https://deepmind.google/research/publications/...,MingOfficial: A Ming Official Career Dataset a...,A Ming Official Career Dataset and a Historica...,"In Chinese studies, understanding the nuanced ...",6 December 2023,"You-Jun Chen*, Hsin-Yi Hsieh*, Yu-Tung Lin*, Y...","['embedding', 'retrieval', 'language', 'scale'..."
157,https://deepmind.google/research/publications/...,Improving neural network representations using...,Improving neural network representations using...,Deep neural networks have reached human-level ...,10 December 2023,"Lukas Muttenthaler, Lorenz Linhardt, Jonas Dip...","['neural', 'image', 'vision', 'representations..."
171,https://deepmind.google/research/publications/...,Self-supervised video pretraining yields stron...,April 2025 Self-supervised v...,Humans learn powerful representations of objec...,29 February 2024,"Nikhil Parthasarathy, Ali Eslami, Joao Carreir...","['video', 'image', 'tasks', 'models', 'scene']"
