# 📚 Scholar.py — Google Scholar Data Extraction

Upload `scholar.py` to Colab, then run the cells below in order — later cells reuse variables (`profile`, `pubs`, `hist`, …) created by earlier ones.

**Features:**
- Profile & citation metrics
- Publication lists with pagination
- Citation history (scholar-level & per-article)
- Compare multiple scholars
- Predict future h-index
- Co-author network analysis & visualization

## 0. Setup

In [None]:
# Install dependencies
!pip install requests beautifulsoup4 pandas numpy networkx matplotlib -q

In [None]:
# Upload scholar.py — run this cell and select the file.
# The result of files.upload() is kept in `uploaded`; no later cell reads it.
# NOTE(review): presumably the upload lands in the working directory so that
# `from scholar import *` in the next cell can find it — confirm in Colab.
from google.colab import files
uploaded = files.upload()  # select scholar.py

In [None]:
# Import everything
from scholar import *
import pandas as pd
import matplotlib.pyplot as plt
import time

print("✅ scholar.py loaded successfully")

## 1. Set Your Scholar ID

Find your Google Scholar ID from your profile URL:  
`https://scholar.google.com/citations?user=XXXXXXX` → the `XXXXXXX` part is your ID.

In [None]:
# ═══════════════════════════════════════════════
# 🔧 CHANGE THIS to your own Scholar ID
# ═══════════════════════════════════════════════
# Every later cell passes this value to the scholar.py helpers, and the NIW
# export in section 10 embeds it in each paper's Scholar URL.
SCHOLAR_ID = "mG4imMEAAAAJ"  # Demo: Andrew Ng


## 2. Profile & Metrics

In [None]:
# ── Quick summary ────────────────────────────────
# Prints whatever scholar_summary() returns for the configured ID.
# NOTE(review): scholar_summary is presumably supplied by `from scholar import *`.
print(scholar_summary(SCHOLAR_ID))

In [None]:
# ── Full profile dict ────────────────────────────
# `profile` is reused by the plotting, position-analysis and export cells below.
profile = get_profile(SCHOLAR_ID)

# Print every scalar field; the coauthor list is summarised by its length only.
for field, value in profile.items():
    if field == "coauthors":
        continue
    print(f"  {field:16s}: {value}")

n_coauthors_listed = len(profile["coauthors"])
print(f"  {'coauthors':16s}: {n_coauthors_listed} listed")

In [None]:
# ── Profile coauthors table ──────────────────────
# The table must be the cell's final *top-level* expression to be rendered:
# a bare expression nested inside an `if` block is evaluated but never shown
# by the notebook. An empty coauthor list simply renders as an empty table.
coauthors_table = pd.DataFrame(profile["coauthors"])
coauthors_table

## 3. Publications

In [None]:
# ── Get all publications ─────────────────────────
# `pubs` is the table every later section works from (top-cited list,
# author-position analysis, CSV/Excel export, NIW export).
pubs = get_publications(SCHOLAR_ID)
print(f"Total publications: {len(pubs)}")
# Trailing expression: the notebook renders the first 15 rows
pubs.head(15)

In [None]:
# ── Sort by year ─────────────────────────────────
# Second fetch with sortby="year"; kept in its own variable so `pubs` from the
# previous cell is left untouched.
pubs_by_year = get_publications(SCHOLAR_ID, sortby="year")
# Trailing expression: show three columns of the first 10 rows
pubs_by_year[["title", "year", "cites"]].head(10)

In [None]:
# ── Quick stats ──────────────────────────────────
# (label, helper) pairs, reported in this order. The trailing padding inside
# each label is part of the printed output, so it is kept verbatim.
quick_stats = (
    ("Number of articles:          ", get_num_articles),
    ("Distinct journals:           ", get_num_distinct_journals),
    ("Oldest article year:         ", get_oldest_article),
    ("Publications in top journals: ", get_num_top_journals),
)
for label, helper in quick_stats:
    print(f"{label}{helper(SCHOLAR_ID)}")

In [None]:
# ── Top cited publications ───────────────────────
# Ten highest-cited rows, trimmed to the columns worth showing.
shown_columns = ["title", "journal", "cites", "year"]
ten_most_cited = pubs.nlargest(10, "cites")
top = ten_most_cited[shown_columns]

# Trailing expression: styled table with an in-cell bar for the citation count
top.style.bar(subset=["cites"], color="#4CAF50")

## 4. Publication Details

In [None]:
# ── Pick the most-cited publication ───────────────
# nlargest(1, "cites") keeps the single highest-cited row; .iloc[0] pulls it
# out as one row whose fields are read by name below.
top_pub = pubs.nlargest(1, "cites").iloc[0]
pub_id = top_pub["pubid"]  # reused by the detail cells that follow
print(f"Title: {top_pub['title']}")
print(f"Pub ID: {pub_id}")
print(f"Scholar URL: {get_article_scholar_url(SCHOLAR_ID, pub_id)}")

time.sleep(1)  # rate limit courtesy

In [None]:
# ── Abstract ──────────────────────────────────────
abstract = get_publication_abstract(SCHOLAR_ID, pub_id)
print("Abstract:")
# Any falsy result (e.g. None or an empty string) is shown as a placeholder
print(abstract if abstract else "(not available)")

# Pause before the next cell's request (see Rate Limiting Notes at the end)
time.sleep(1)

In [None]:
# ── Full metadata ─────────────────────────────────
# `meta` is iterated with .items(), i.e. it is treated as a key → value mapping;
# each pair is printed on its own indented line.
meta = get_publication_data_extended(SCHOLAR_ID, pub_id)
for k, v in meta.items():
    print(f"  {k}: {v}")

# Pause before the next cell's request
time.sleep(1)

In [None]:
# ── Publication date & URL ────────────────────────
# Two separate helper calls for the same publication, with a one-second pause
# between them so they are not issued back to back.
print(f"Publication date: {get_publication_date(SCHOLAR_ID, pub_id)}")
time.sleep(1)
print(f"Full-text URL:    {get_publication_url(SCHOLAR_ID, pub_id)}")

In [None]:
# ── Complete author list (with initials) ──────────
# The preceding cell ends on a request without a trailing pause, so wait here
# first — the same one-second spacing the other detail cells keep, to avoid
# being throttled by Google Scholar.
time.sleep(1)
authors = get_complete_authors(SCHOLAR_ID, pub_id)
print(f"Authors: {authors}")

## 5. Citation History

In [None]:
# ── Scholar-level citation history ────────────────
# `hist` (columns "year" and "cites") is reused by the Excel and NIW exports.
hist = get_citation_history(SCHOLAR_ID)

# One bar per year, labelled with the scholar's name from `profile`
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(hist["year"], hist["cites"], color="#2196F3", edgecolor="white")
ax.set(
    xlabel="Year",
    ylabel="Citations",
    title=f"Annual Citations — {profile['name']}",
)
plt.tight_layout()
plt.show()

# Trailing expression: also show the raw table under the chart
hist

In [None]:
# ── Per-article citation history ──────────────────
time.sleep(1)  # pause before the next Scholar request
article_hist = get_article_cite_history(SCHOLAR_ID, pub_id)

# Shorten long titles for the chart heading. The ellipsis is appended only
# when the title was actually cut, so short titles are shown unchanged.
_title_limit = 60
_full_title = top_pub["title"]
if len(_full_title) > _title_limit:
    _chart_title = _full_title[:_title_limit] + "..."
else:
    _chart_title = _full_title

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(article_hist["year"], article_hist["cites"], color="#FF9800", edgecolor="white")
ax.set_xlabel("Year")
ax.set_ylabel("Citations")
ax.set_title(f"Citation History — {_chart_title}")
plt.tight_layout()
plt.show()

# Trailing expression: show the raw per-year table under the chart
article_hist

## 6. Author Position Analysis

In [None]:
# ── Where does the author appear in author lists? ─
# `positions` and `valid` are both reused further down (histograms, exports).
positions = author_position(pubs["author"], profile["name"])

# Rows without a position are the author lists where the name was not found
valid = positions.dropna(subset=["position"])
print(f"Found in {len(valid)}/{len(positions)} author lists")

if not valid.empty:
    n_first = (valid["position"] == 1).sum()
    n_last = (valid["position"] == valid["n_authors"]).sum()
    mean_normalized = valid["position_normalized"].mean()
    print(f"First author:  {n_first} times")
    print(f"Last author:   {n_last} times")
    print(f"Mean position: {mean_normalized:.2f} (0=first, 1=last)")

# Trailing expression: preview the per-publication table
positions.head(10)

In [None]:
# ── Visualize position distribution ───────────────
# Two histograms side by side: raw author position (one integer-wide bin per
# position) and the normalized position in 20 bins. Skipped when `valid` is empty.
if not valid.empty:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    absolute_ax, normalized_ax = axes[0], axes[1]

    max_position = int(valid["position"].max())
    absolute_ax.hist(
        valid["position"].dropna(),
        bins=range(1, max_position + 2),
        color="#9C27B0",
        edgecolor="white",
        alpha=0.8,
    )
    absolute_ax.set(
        xlabel="Author Position",
        ylabel="Count",
        title="Absolute Position",
    )

    normalized_ax.hist(
        valid["position_normalized"].dropna(),
        bins=20,
        color="#009688",
        edgecolor="white",
        alpha=0.8,
    )
    normalized_ax.set(
        xlabel="Normalized Position (0=first, 1=last)",
        ylabel="Count",
        title="Normalized Position",
    )

    plt.suptitle(f"Author Position — {profile['name']}", fontsize=14)
    plt.tight_layout()
    plt.show()

## 7. Predict h-index

In [None]:
# ── 10-year h-index prediction (Acuna et al.) ────
import warnings

# Inside this context every warning raised by the predictor is reported
# ("always" filter); the previous filter state is restored on exit.
with warnings.catch_warnings():
    warnings.simplefilter("always")
    prediction = predict_h_index(SCHOLAR_ID)

# Predicted trajectory, with the current h-index as a dashed reference line
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    prediction["years_ahead"],
    prediction["h_index"],
    marker="o",
    color="#E91E63",
    linewidth=2,
)
ax.axhline(
    y=profile["h_index"],
    color="gray",
    linestyle="--",
    alpha=0.5,
    label="Current",
)
ax.set(
    xlabel="Years Ahead",
    ylabel="Predicted h-index",
    title=f"h-index Prediction — {profile['name']}",
)
ax.legend()
plt.tight_layout()
plt.show()

# Trailing expression: show the prediction table (also exported in section 9)
prediction

## 8. Co-author Network

In [None]:
# ── Get co-author network ────────────────────────
print("⏳ Building co-author network (may take 30-60s)...")
# NOTE(review): n_coauthors / n_deep presumably cap the breadth and depth of
# the crawl — confirm against get_coauthors in scholar.py before raising them.
network = get_coauthors(SCHOLAR_ID, n_coauthors=5, n_deep=1)
print(f"✅ {len(network)} co-author connections found")
# Trailing expression: preview the first 15 rows
network.head(15)

In [None]:
# ── Visualize network ────────────────────────────
# Only draw when the previous cell returned at least one connection.
if len(network) > 0:
    fig = plot_coauthors(network, size_labels=9, figsize=(14, 10))
    plt.show()

## 9. Export Results

In [None]:
# ── Save publications to CSV ─────────────────────
# Write the full publications table (without the index column) next to the
# notebook, then hand the file to the browser.
csv_name = "publications.csv"
pubs.to_csv(csv_name, index=False)
print("📁 Saved publications.csv")

# Download from Colab
from google.colab import files
files.download(csv_name)

In [None]:
# ── Save all results to Excel (multi-sheet) ──────
# Sheet name → table, written in this order; every sheet omits the index column.
report_name = "scholar_report.xlsx"
report_sheets = {
    "Publications": pubs,
    "Citation History": hist,
    "Author Position": positions,
    "h-index Prediction": prediction,
}

with pd.ExcelWriter(report_name) as writer:
    for sheet_name, table in report_sheets.items():
        table.to_excel(writer, sheet_name=sheet_name, index=False)

print("📁 Saved scholar_report.xlsx")
# `files` is the google.colab module imported in an earlier cell
files.download(report_name)

## 10. NIW Petition Export

Generates `gs_summary.json` and `gs_papers.csv` in the exact format
consumed by the **NIW Petition Skill System** (`vera-niw-assemble`,
`vera-niw-evaluate`, `vera-niw-pillar`).

**What gets built:**
- `gs_summary.json` — profile metrics + papers array + notable citers
- `gs_papers.csv` — flat table with NIW-required columns

**Manual steps after export:**
1. Verify `indexed_in` / `indexed` for each paper (ISI, Scopus, PubMed)
2. Add `notable_citers` entries for government/industry citations (Prong 2 evidence)

In [None]:
# ── Build gs_summary.json ────────────────────────
# Assembles profile metrics, 5-year metrics and a per-paper array into one
# dict and writes it to gs_summary.json. Reads `profile`, `pubs`, `hist` and
# `positions` from earlier cells; defines `_first_authored`, which the
# gs_papers.csv cell reuses.
import json as _json
from datetime import datetime as _dt


def _text(value):
    """Return ``str(value)``, or "" when the cell is missing (None / NaN / NA).

    Plain ``str()`` would turn a missing cell into the literal text "nan" or
    "None", which would then leak into the exported titles, venues and
    co-author lists.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


_current_year = _dt.now().year
# Rolling window: the current calendar year plus the four before it
_last_5 = set(range(_current_year - 4, _current_year + 1))

# ── Last-5-year citation count from citation history ──
cites_last_5 = int(hist[hist["year"].isin(_last_5)]["cites"].sum())

# ── Last-5-year h-index and i10-index from publications ──
_recent = pubs[pubs["year"].isin(_last_5)].copy()
_rc = sorted(_recent["cites"].dropna().astype(int).tolist(), reverse=True)
# Walk the counts in descending order: h is the last rank whose paper still
# has at least that many citations.
h_last_5 = 0
for _i, _c in enumerate(_rc, 1):
    if _c < _i:
        break
    h_last_5 = _i
i10_last_5 = int((_recent["cites"] >= 10).sum())

# ── Merge first-author flag from positions analysis ──
# NOTE(review): this matches rows of `positions` to rows of `pubs` by index
# label — assumes author_position() preserves the index of pubs["author"]; confirm.
_first_authored = {
    idx
    for idx, row in positions.iterrows()
    if pd.notna(row.get("position")) and row["position"] == 1
}

# ── Build papers array ──
_papers = []
for idx, row in pubs.iterrows():
    _authors_raw = _text(row.get("author"))
    _co_authors = [a.strip() for a in _authors_raw.split(",") if a.strip()]
    _pubid = _text(row.get("pubid"))
    _papers.append({
        "title": _text(row.get("title")),
        "venue": _text(row.get("journal")),
        "year": int(row["year"]) if pd.notna(row.get("year")) else None,
        "citation_count": int(row["cites"]) if pd.notna(row.get("cites")) else 0,
        "is_first_authored": idx in _first_authored,
        "co_authors": _co_authors,
        "scholar_url": f"https://scholar.google.com/citations?view_op=view_citation&hl=en&user={SCHOLAR_ID}&citation_for_view={SCHOLAR_ID}:{_pubid}",
        "indexed_in": [],  # Manual: verify against ISI / Scopus / PubMed
    })

gs_summary = {
    "scholar_id": SCHOLAR_ID,
    "name": profile["name"],
    "total_citations": profile["total_cites"],
    "citations_last_5_years": cites_last_5,
    "h_index": profile["h_index"],
    "h_index_last_5_years": h_last_5,
    "i10_index": profile["i10_index"],
    "i10_index_last_5_years": i10_last_5,
    "papers": _papers,
    "notable_citers": [],  # Manual: add government/industry citers for Prong 2
}

# Explicit encoding so the file does not depend on the platform default
with open("gs_summary.json", "w", encoding="utf-8") as _f:
    _json.dump(gs_summary, _f, indent=2)

_n_first = sum(1 for p in _papers if p["is_first_authored"])
print("gs_summary.json saved")
print(f"  Scholar:    {gs_summary['name']}")
print(f"  Citations:  {gs_summary['total_citations']:,} (last 5yr: {cites_last_5:,})")
print(f"  h-index:    {gs_summary['h_index']} (last 5yr: {h_last_5})")
print(f"  i10-index:  {gs_summary['i10_index']} (last 5yr: {i10_last_5})")
print(f"  Papers:     {len(_papers)} ({_n_first} first-authored)")

In [None]:
# ── Build gs_papers.csv ─────────────────────────
# Flat per-paper table: four columns of `pubs` renamed to the export's
# names, plus a first-author flag and an empty column to fill in by hand.
source_columns = ["title", "journal", "year", "cites"]
export_names = {"journal": "venue", "cites": "citation_count"}
gs_papers = pubs[source_columns].copy().rename(columns=export_names)

# Flag rows whose index label was collected in `_first_authored` by the
# gs_summary.json cell (must be done before the index is reset below).
gs_papers["is_first_authored"] = [
    label in _first_authored for label in gs_papers.index
]
gs_papers["indexed"] = ""  # Manual: verify against ISI / Scopus / PubMed

# Most-cited first, with a fresh 0..n-1 index
gs_papers = gs_papers.sort_values("citation_count", ascending=False)
gs_papers = gs_papers.reset_index(drop=True)

gs_papers.to_csv("gs_papers.csv", index=False)
print(f"gs_papers.csv saved ({len(gs_papers)} papers)")
print(f"Columns: {list(gs_papers.columns)}")
print()

# Trailing expression: preview the ten most-cited rows
gs_papers.head(10)

In [None]:
# ── Download NIW files ───────────────────────────
from google.colab import files

# Send both export files to the browser, JSON first
for export_name in ("gs_summary.json", "gs_papers.csv"):
    files.download(export_name)

# Confirmation plus the manual follow-up checklist, printed as one block
# (the empty entry produces the blank line after the confirmation).
checklist = [
    "Downloaded gs_summary.json + gs_papers.csv",
    "",
    "Before filing, manually verify:",
    "  1. indexed_in (JSON) / indexed (CSV)",
    "     Check each venue against ISI Web of Science, Scopus, PubMed",
    "  2. notable_citers in gs_summary.json",
    "     Add government/industry citations — these are Prong 2 evidence",
    "  3. Papers list completeness",
    "     GS may miss some publications; add any missing ones manually",
]
print("\n".join(checklist))

---

## ⚠️ Rate Limiting Notes

- Google Scholar will block you (HTTP 429) if you make too many requests
- The per-publication detail cells call `time.sleep(1)` between requests — don't remove these pauses
- For bulk operations, add longer delays
- If blocked, wait 5-10 minutes and try again
- Consider `set_scholar_mirror()` for alternative endpoints