In [1]:
!pip install arxiv semanticscholar pyalex habanero wbdata sentence-transformers joblib pandas numpy requests





[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:

import arxiv
import pandas as pd
import numpy as np
import time
import requests
import wbdata
from pyalex import Works
from habanero import Crossref
from semanticscholar import SemanticScholar
from sentence_transformers import SentenceTransformer
import joblib


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
crossref = Crossref()
sch = SemanticScholar()

def get_rnd_expenditure(country="USA"):
    """Fetch latest R&D expenditure (% of GDP) from World Bank"""
    try:
        url = f"https://api.worldbank.org/v2/country/{country}/indicator/GB.XPD.RSDV.GD.ZS?format=json"
        data = requests.get(url).json()
        return float(data[1][0]["value"])
    except Exception:
        return np.nan

def get_openalex_metadata(doi):
    """Fetch citations and fields from OpenAlex"""
    try:
        work = Works()[doi]
        return {
            "citations": work["cited_by_count"],
            "fields": [f["display_name"] for f in work["concepts"]],
        }
    except Exception:
        return {"citations": np.nan, "fields": []}

def get_crossref_data(doi):
    """Get publication year and journal info"""
    try:
        cr_data = crossref.works(ids=doi)
        pub_date = cr_data["message"]["issued"]["date-parts"][0][0]
        journal = cr_data["message"]["container-title"][0]
        return {"journal": journal, "pub_year": pub_date}
    except Exception:
        return {"journal": None, "pub_year": None}

def get_semanticscholar_citations(title):
    """Fetch citation count via Semantic Scholar"""
    try:
        paper = sch.search_paper(title)
        if paper and len(paper) > 0:
            return paper[0]["citationCount"]
        return np.nan
    except Exception:
        return np.nan


In [4]:
client = arxiv.Client()
topics = ["quantum", "superconductivity", "semiconductor"]

papers = []
for topic in topics:
    print(f"üîç Fetching papers for: {topic}")
    search = arxiv.Search(
        query=topic,
        max_results=1000,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )
    try:
        for result in client.results(search):
            papers.append({
                "title": result.title,
                "summary": result.summary,
                "published": result.published,
                "doi": result.doi,
                "pdf_url": result.pdf_url
            })
        print(f"‚úÖ Collected {len(papers)} papers for '{topic}'")
    except arxiv.UnexpectedEmptyPageError:
        print(f"‚ö†Ô∏è No more pages for {topic}")
    except Exception as e:
        print(f"‚ùå Error: {e}")
    time.sleep(2)

df = pd.DataFrame(papers)
print(f"üìò Total papers: {len(df)}")
df.head()


üîç Fetching papers for: quantum
‚úÖ Collected 1000 papers for 'quantum'
üîç Fetching papers for: superconductivity
‚úÖ Collected 2000 papers for 'superconductivity'
üîç Fetching papers for: semiconductor
‚úÖ Collected 3000 papers for 'semiconductor'
üìò Total papers: 3000


Unnamed: 0,title,summary,published,doi,pdf_url
0,Thermal State Simulation with Pauli and Majora...,We introduce a propagation-based approach to t...,2026-02-04 18:59:02+00:00,,https://arxiv.org/pdf/2602.04878v1
1,"Epitaxial growth optimization, measurement and...",Interface roughness scattering is an important...,2026-02-04 18:57:43+00:00,,https://arxiv.org/pdf/2602.04874v1
2,Requirements for Teleportation in an Intercity...,We investigate the hardware requirements for q...,2026-02-04 18:56:48+00:00,,https://arxiv.org/pdf/2602.04869v1
3,From Evaluation to Design: Using Potential Ene...,Machine Learning Interatomic Potentials (MLIPs...,2026-02-04 18:50:10+00:00,,https://arxiv.org/pdf/2602.04861v1
4,Digital signatures with classical shadows on n...,Quantum mechanics provides cryptographic primi...,2026-02-04 18:48:12+00:00,,https://arxiv.org/pdf/2602.04859v1


In [6]:
import concurrent.futures
import json
import os
import time
from tqdm import tqdm   # ‚úÖ Use normal tqdm (NOT notebook)

CACHE_FILE = "paper_cache.json"

# Load previous cache if it exists
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r") as f:
        cache = json.load(f)
else:
    cache = {}


def safe_call(func, *args, **kwargs):
    """Wrapper to safely call APIs with fallback."""
    try:
        return func(*args, **kwargs)
    except Exception:
        return {}


def enrich_paper(row):

    doi = str(row.get("doi", ""))
    title = row.get("title", "")

    # Use cached data if available
    if doi in cache:
        return cache[doi]

    crossref_meta = safe_call(get_crossref_data, doi)
    openalex_meta = safe_call(get_openalex_metadata, doi)
    rnd = safe_call(get_rnd_expenditure, "USA")
    citations = safe_call(get_semanticscholar_citations, title)

    data = {
        "journal": crossref_meta.get("journal"),
        "pub_year": crossref_meta.get("pub_year"),
        "fields": ", ".join(openalex_meta.get("fields", [])),
        "citations": citations or openalex_meta.get("citations"),
        "rnd_gdp": rnd,
    }

    cache[doi] = data
    return data


print("‚öôÔ∏è Enriching paper metadata (parallel mode)...")

extra_info = []

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:

    futures = [
        executor.submit(enrich_paper, row)
        for _, row in df.iterrows()
    ]

    for future in tqdm(
        concurrent.futures.as_completed(futures),
        total=len(futures),
        desc="Processing Papers"
    ):
        try:
            extra_info.append(future.result())
        except Exception as e:
            print("‚ö†Ô∏è Skipping one paper:", e)


# Save cache
with open(CACHE_FILE, "w") as f:
    json.dump(cache, f, indent=2)


meta_df = pd.DataFrame(extra_info)

df = pd.concat([df, meta_df], axis=1)

df.to_csv("physics_papers_enriched_fast.csv", index=False)

print("‚úÖ Metadata enrichment complete in parallel mode!")


‚öôÔ∏è Enriching paper metadata (parallel mode)...


Processing Papers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3000/3000 [36:25<00:00,  1.37it/s]


‚úÖ Metadata enrichment complete in parallel mode!


In [7]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")

print("üî¢ Generating embeddings...")
embeddings = embedder.encode(df["summary"].tolist(), show_progress_bar=True)

# Combine numerical features
numeric_features = np.array([
    df["citations"].fillna(0).values,
    df["pub_year"].fillna(0).values,
    df["rnd_gdp"].fillna(0).values
]).T

X = np.hstack((embeddings, numeric_features))
print("‚úÖ Feature matrix shape:", X.shape)


üî¢ Generating embeddings...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 94/94 [02:12<00:00,  1.41s/it]

‚úÖ Feature matrix shape: (3000, 387)





In [8]:
# Ensure citations column exists and has valid numeric values
if "citations" not in df.columns:
    df["citations"] = 0

# Convert to numeric (in case some non-numeric values slipped in)
df["citations"] = pd.to_numeric(df["citations"], errors="coerce").fillna(0)

# Create robust impact label
if "impact_label" not in df.columns:
    unique_vals = df["citations"].nunique()

    # Case 1: At least 3 unique citation values ‚Üí 3 groups
    if unique_vals >= 3:
        try:
            df["impact_label"] = pd.qcut(
                df["citations"],
                q=3,
                labels=["Low", "Medium", "High"],
                duplicates="drop"
            )
        except ValueError:
            # fallback if bins collapse due to duplicates
            df["impact_label"] = pd.cut(
                df["citations"],
                bins=3,
                labels=["Low", "Medium", "High"]
            )

    # Case 2: Exactly 2 unique values ‚Üí 2 groups
    elif unique_vals == 2:
        df["impact_label"] = pd.cut(
            df["citations"],
            bins=2,
            labels=["Low", "High"]
        )

    # Case 3: All citation values identical ‚Üí default label
    else:
        df["impact_label"] = "Low"

print("‚úÖ Impact labels successfully created.")
print(df["impact_label"].value_counts())


‚úÖ Impact labels successfully created.
impact_label
Low       2999
High         1
Medium       0
Name: count, dtype: int64


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

if not "impact_label" in df.columns:
    unique_vals = df["citations"].nunique()
    if unique_vals >= 3:
        df["impact_label"] = pd.qcut(df["citations"].fillna(0), q=3,
                                     labels=["Low", "Medium", "High"], duplicates="drop")
    elif unique_vals == 2:
        df["impact_label"] = pd.cut(df["citations"].fillna(0), bins=2,
                                    labels=["Low", "High"])
    else:
        df["impact_label"] = "Low"

le = LabelEncoder()
y = le.fit_transform(df["impact_label"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# ‚úÖ Save trained model & encoder
joblib.dump(clf, "physics_paper_model.pkl")
joblib.dump(le, "label_encoder.pkl")

print("‚úÖ Model retrained and saved successfully!")


‚úÖ Model retrained and saved successfully!


In [10]:
search = arxiv.Search(id_list=["2006.12440"])
result = next(client.results(search))

title, abstract, doi = result.title, result.summary, result.doi
crossref_meta = get_crossref_data(doi)
openalex_meta = get_openalex_metadata(doi)
rnd = get_rnd_expenditure("USA")
# Embed
emb = embedder.encode([abstract])
num_feats = np.array([[openalex_meta["citations"] or 0,
                       crossref_meta["pub_year"] or 0,
                       rnd or 0]])

X_new = np.hstack((emb, num_feats))
clf = joblib.load("physics_paper_model.pkl")
le = joblib.load("label_encoder.pkl")

pred = le.inverse_transform(clf.predict(X_new))[0]
print(f"\nüìò Title: {title}")
print(f"üîÆ Predicted Future Potential: {pred}")



üìò Title: A polynomial time and space heuristic algorithm for T-count
üîÆ Predicted Future Potential: Low
