In [74]:
from pathlib import Path

import gdown
import numpy as np
import pandas as pd


# Google Drive sources, keyed by the local filename each one is saved under.
DATA_FILES = {
    "arxiv_tokenized_balanced.csv": "https://drive.google.com/uc?id=13fdkm8OqrzcxB93ADhFw-VWt95C-__XG",
    "arxiv_minilm_embeddings.npy": "https://drive.google.com/uc?id=13j3Lx5-qmByx4HYL4K2jIlP38Y1xXhmk",
}

# 📁 Download CSV and NPY — skipped when a local copy already exists, so that
# re-running this cell does not fetch both large files again.
for filename, url in DATA_FILES.items():
    if not Path(filename).exists():
        gdown.download(url, filename, quiet=False)


# 📊 Load data
df = pd.read_csv("arxiv_tokenized_balanced.csv")
embeddings = np.load("arxiv_minilm_embeddings.npy")

# The recommender below pairs embedding row i with DataFrame row i, so a length
# mismatch would silently produce wrong titles; fail loudly here instead.
assert len(df) == len(embeddings), (
    f"row mismatch: {len(df)} papers vs {len(embeddings)} embeddings"
)


Downloading...
From (original): https://drive.google.com/uc?id=13fdkm8OqrzcxB93ADhFw-VWt95C-__XG
From (redirected): https://drive.google.com/uc?id=13fdkm8OqrzcxB93ADhFw-VWt95C-__XG&confirm=t&uuid=6d59ce06-b562-4e2b-9d10-dbcd58204d9d
To: /content/arxiv_tokenized_balanced.csv
100%|██████████| 715M/715M [00:09<00:00, 73.0MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=13j3Lx5-qmByx4HYL4K2jIlP38Y1xXhmk
From (redirected): https://drive.google.com/uc?id=13j3Lx5-qmByx4HYL4K2jIlP38Y1xXhmk&confirm=t&uuid=0a0760f2-4a28-4a51-9179-9223f2f20a28
To: /content/arxiv_minilm_embeddings.npy
100%|██████████| 174M/174M [00:01<00:00, 137MB/s]


In [75]:
# Only the last expression of a cell is rendered automatically, so the preview
# is shown explicitly (`display` is provided by the IPython kernel); as a bare
# expression on a non-final line its result was silently discarded.
display(df.head())
df.columns


Index(['id', 'title', 'abstract', 'submitter', 'authors', 'comments',
       'journal-ref', 'doi', 'report-no', 'categories', 'license',
       'update_date', 'authors_parsed', 'main_category', 'combined_text',
       'input_ids', 'attention_mask'],
      dtype='object')

In [76]:
# Number of papers (rows) in the loaded table
len(df)


113463

In [77]:
# Normalise the free-text columns (title, abstract, authors): cast to str,
# turn embedded newlines into spaces and trim surrounding whitespace.
# NOTE: astype(str) also turns missing values into the literal string 'nan'.
for text_column in ('title', 'abstract', 'authors'):
    as_text = df[text_column].astype(str)
    df[text_column] = as_text.str.replace('\n', ' ').str.strip()



In [78]:
from sklearn.metrics.pairwise import cosine_similarity

# Example: get top matches for a selected paper index
def recommend_by_index(idx, top_n=None, min_similarity=0.3):
    """Return the papers most similar to the paper at row ``idx``.

    Similarity is the cosine similarity between ``embeddings[idx]`` and every
    row of the module-level ``embeddings`` array. Row ``i`` of ``embeddings``
    is assumed to describe row ``i`` of ``df`` (positional alignment).

    Args:
        idx: Row position of the query paper in ``embeddings``.
        top_n: Maximum number of results; ``None`` returns every paper that
            passes the threshold.
        min_similarity: Papers scoring below this value are dropped.

    Returns:
        A list of ``(title, score)`` tuples sorted by descending score; the
        query paper itself is never included.

    Raises:
        IndexError: If ``idx`` is outside the bounds of ``embeddings``.
    """
    scores = cosine_similarity([embeddings[idx]], embeddings)[0]

    # Vectorised threshold filter instead of looping over every row in Python.
    keep = scores >= min_similarity
    # Masking by index (rather than comparing i != idx) also excludes the
    # query paper when idx is given as a negative position.
    keep[idx] = False
    candidates = np.flatnonzero(keep)

    # A stable sort on the negated scores yields descending order while
    # keeping tied scores in their original row order.
    ranked = candidates[np.argsort(-scores[candidates], kind="stable")][:top_n]

    # Positional lookup: embedding rows are positions, not index labels.
    titles = df['title']
    return [(titles.iloc[i], scores[i]) for i in ranked]


In [79]:
def recommend(title, top_n=10, min_similarity=0.5):
    """Recommend papers similar to the paper whose title equals ``title``.

    Args:
        title: Exact title to look up in ``df['title']``; when several rows
            share it, the first one is used.
        top_n: Maximum number of recommendations to return.
        min_similarity: Minimum cosine similarity a recommendation must reach.

    Returns:
        A list of ``(title, score)`` tuples as produced by
        ``recommend_by_index``, or an empty list when no row has that title.
    """
    # Row positions (not index labels) of exact title matches, because the
    # embeddings are addressed by position.
    hits = np.flatnonzero((df['title'] == title).to_numpy())

    # Explicit "not found" check instead of a try/except around the whole call:
    # the broad handler also swallowed any IndexError raised while computing
    # the recommendations and reported it as "no results".
    if hits.size == 0:
        return []

    return recommend_by_index(int(hits[0]), top_n, min_similarity)


In [80]:
# Step 7: Try it!
#paper = 'Deep Learning'
#print(f"📚 Recommendations for: {paper}")
#for rec in recommend(paper):
    #print(f"➡️ {rec[0]} (Similarity: {rec[1]:.2f})")


In [82]:
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output

# Lower-cased copy of the titles, computed a single time so that each search
# can match case-insensitively without re-normalising the column.
df['title_lower'] = df['title'].str.lower()

# Free-text box for the title / keyword query
search_input = widgets.Text(
    description='Search:',
    placeholder='Enter a paper title or keyword...',
    style=dict(description_width='initial'),
    layout=widgets.Layout(width='75%'),
)

# Area the results are rendered into, and the button that triggers a search
search_output = widgets.Output()
search_button = widgets.Button(button_style='primary', description="Recommend Papers")


# Click handler
def on_search_click(b):
    """Button callback: find a paper by title substring and list similar papers.

    Reads the query from ``search_input``, picks the first row of ``df`` whose
    lower-cased title contains it, and renders that paper's recommendations
    (or a warning message) into ``search_output``.

    Args:
        b: The clicked button instance passed by ipywidgets; unused.
    """
    with search_output:
        clear_output()
        query = search_input.value.strip().lower()

        if len(query) < 3:
            display(Markdown("⚠️ Please enter at least 3 characters."))
            return

        # Literal substring match. With the default regex=True the user's text
        # is compiled as a regular expression, so input such as "c++" or "("
        # raises re.error and characters like "." match unintentionally.
        match = df[df['title_lower'].str.contains(query, na=False, regex=False)]

        if match.empty:
            display(Markdown("❌ No paper title contains those keywords."))
            return

        # Recommendations are computed for the first matching title only.
        matched_title = match.iloc[0]['title']
        results = recommend(
            title=matched_title,
            top_n=10,
            min_similarity=0.5
        )
        display(Markdown(f"### 📚 Recommendations for: *{search_input.value.strip()}*"))

        if not results:
            display(Markdown("⚠️ No relevant papers found above the similarity threshold."))
            return

        for title, score in results:
            display(Markdown(f"➡️ **{title}**  \n*Similarity:* `{score:.2f}`"))

# Run the search handler whenever the button is pressed
search_button.on_click(on_search_click)

# Render the heading followed by the query box, the button and the results area
display(
    Markdown("## 🔍 Paper Recommender System"),
    search_input,
    search_button,
    search_output,
)


## 🔍 Paper Recommender System

Text(value='', description='Search:', layout=Layout(width='75%'), placeholder='Enter a paper title or keyword.…

Button(button_style='primary', description='Recommend Papers', style=ButtonStyle())

Output()