# 📚 Scholar.py — Google Scholar Data Extraction

Upload `scholar.py` to Colab, then run these cells.

**Features:**
- Profile & citation metrics
- Publication lists with pagination
- Citation history (scholar-level & per-article)
- Compare multiple scholars
- Predict future h-index
- Co-author network analysis & visualization

## 0. Setup

In [None]:
# Install dependencies (uncomment on a fresh runtime).
# openpyxl is the engine pandas uses to write the .xlsx report in section 8.
#!pip install requests beautifulsoup4 pandas numpy networkx matplotlib openpyxl -q

In [None]:
# Upload scholar.py — run this cell and select the file.
# NOTE(review): the recorded output below shows the upload being saved as
# "scholar (1).py", which suggests a scholar.py already existed in the working
# directory. `from scholar import *` would then presumably still load the older
# copy — remove the stale file before re-uploading. TODO confirm.
from google.colab import files
uploaded = files.upload()  # select scholar.py

Saving scholar.py to scholar (1).py


In [None]:
# Import everything
from scholar import *
import pandas as pd
import matplotlib.pyplot as plt
import time

# Visible confirmation that the imports above completed without raising.
print("✅ scholar.py loaded successfully")

✅ scholar.py loaded successfully


## 1. Set Your Scholar ID

Find your Google Scholar ID from your profile URL:  
`https://scholar.google.com/citations?user=XXXXXXX` → the `XXXXXXX` part is your ID.

In [None]:
# ═══════════════════════════════════════════════
# 🔧 CHANGE THIS to your own Scholar ID
#    (the value of the `user=` parameter in your profile URL)
# ═══════════════════════════════════════════════
SCHOLAR_ID = "XXXXXXX"


## 2. Profile & Metrics

In [None]:
# ── Quick summary ────────────────────────────────
# scholar_summary is provided by the star-import of scholar.py; its return
# value is printed as-is.
print(scholar_summary(SCHOLAR_ID))

In [None]:
# ── Full profile dict ────────────────────────────
# `profile` is reused by later cells (plot titles, h-index baseline).
profile = get_profile(SCHOLAR_ID)

# Print each field on its own aligned line; the co-author entry is skipped
# here and summarised by its length afterwards.
for k, v in profile.items():
    if k == "coauthors":
        continue
    print(f"  {k:16s}: {v}")

print(f"  {'coauthors':16s}: {len(profile['coauthors'])} listed")

In [None]:
# ── Profile coauthors table ──────────────────────
# A notebook only auto-renders the last *top-level* expression of a cell, so a
# bare DataFrame inside an `if` body is silently discarded. Render it
# explicitly instead.
from IPython.display import display

if profile["coauthors"]:
    display(pd.DataFrame(profile["coauthors"]))
else:
    print("No co-authors listed on this profile")

## 3. Publications

In [None]:
# ── Get all publications ─────────────────────────
# `pubs` is reused by the later sections (top-cited table, author positions,
# CSV/Excel export).
pubs = get_publications(SCHOLAR_ID)
print(f"Total publications: {len(pubs)}")
# Last expression of the cell: the notebook renders the first 15 rows.
pubs.head(15)

In [None]:
# ── Sort by year ─────────────────────────────────
# Separate request with sortby="year"; the result is kept apart from `pubs`.
pubs_by_year = get_publications(SCHOLAR_ID, sortby="year")
# Show only the three columns of interest for the first 10 rows.
pubs_by_year[["title", "year", "cites"]].head(10)

In [None]:
# ── Quick stats ──────────────────────────────────
# Each helper is called with the scholar ID and its result printed next to a
# pre-padded label, in the order listed.
quick_stats = (
    ("Number of articles:          ", get_num_articles),
    ("Distinct journals:           ", get_num_distinct_journals),
    ("Oldest article year:         ", get_oldest_article),
    ("Publications in top journals: ", get_num_top_journals),
)
for label, fetch in quick_stats:
    print(f"{label}{fetch(SCHOLAR_ID)}")

In [None]:
# ── Top cited publications ───────────────────────
# Ten rows with the largest "cites" value, restricted to four display columns.
top = pubs.nlargest(10, "cites")[["title", "journal", "cites", "year"]]
# Styler output: draws an in-cell green bar proportional to the citation count.
top.style.bar(subset=["cites"], color="#4CAF50")

## 4. Publication Details

In [None]:
# ── Pick the most-cited publication ──────────────
# `top_pub` and `pub_id` are reused by every detail cell in this section and by
# the per-article citation history.
# NOTE(review): `.iloc[0]` will raise if `pubs` has no rows.
top_pub = pubs.nlargest(1, "cites").iloc[0]
pub_id = top_pub["pubid"]
print(f"Title: {top_pub['title']}")
print(f"Pub ID: {pub_id}")
print(f"Scholar URL: {get_article_scholar_url(SCHOLAR_ID, pub_id)}")

time.sleep(1)  # rate limit courtesy

In [None]:
# ── Abstract ─────────────────────────────────────
abstract = get_publication_abstract(SCHOLAR_ID, pub_id)

print("Abstract:")
# Fall back to a placeholder when the lookup yields nothing truthy.
print(abstract or "(not available)")

# Pause before the next detail request (see the rate-limiting notes).
time.sleep(1)

In [None]:
# ── Full metadata ────────────────────────────────
meta = get_publication_data_extended(SCHOLAR_ID, pub_id)
# Dump every key/value pair, one per line.
for k, v in meta.items():
    print(f"  {k}: {v}")

# Pause before the next detail request (see the rate-limiting notes).
time.sleep(1)

In [None]:
# ── Publication date & URL ───────────────────────
print(f"Publication date: {get_publication_date(SCHOLAR_ID, pub_id)}")
# Space out the two lookups (see the rate-limiting notes).
time.sleep(1)
print(f"Full-text URL:    {get_publication_url(SCHOLAR_ID, pub_id)}")

In [None]:
# ── Complete author list (with initials) ─────────
authors = get_complete_authors(SCHOLAR_ID, pub_id)
print(f"Authors: {authors}")

## 5. Citation History

In [None]:
# ── Scholar-level citation history ───────────────
# `hist` is reused by the Excel export; it is indexed by its "year" and
# "cites" columns for the bar chart below.
hist = get_citation_history(SCHOLAR_ID)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(hist["year"], hist["cites"], color="#2196F3", edgecolor="white")
ax.set_xlabel("Year")
ax.set_ylabel("Citations")
ax.set_title(f"Annual Citations — {profile['name']}")
plt.tight_layout()
plt.show()

# Last expression of the cell: render the underlying table under the chart.
hist

In [None]:
# ── Per-article citation history ─────────────────
time.sleep(1)  # pause before another request (see the rate-limiting notes)
article_hist = get_article_cite_history(SCHOLAR_ID, pub_id)

# Shorten long titles for the plot heading; only add an ellipsis when the
# title was actually cut.
full_title = top_pub["title"]
short_title = full_title if len(full_title) <= 60 else f"{full_title[:60]}..."

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(article_hist["year"], article_hist["cites"], color="#FF9800", edgecolor="white")
ax.set_xlabel("Year")
ax.set_ylabel("Citations")
ax.set_title(f"Citation History — {short_title}")
plt.tight_layout()
plt.show()

# Last expression of the cell: render the underlying table under the chart.
article_hist

## 6. Author Position Analysis

In [None]:
# ── Where does the author appear in author lists? ─
positions = author_position(pubs["author"], profile["name"])

# Rows without a resolved position are excluded from the summary; `valid` is
# reused by the visualisation cell.
valid = positions.dropna(subset=["position"])
print(f"Found in {len(valid)}/{len(positions)} author lists")

if len(valid) > 0:
    n_first = (valid["position"] == 1).sum()
    n_last = (valid["position"] == valid["n_authors"]).sum()
    mean_norm = valid["position_normalized"].mean()
    print(f"First author:  {n_first} times")
    print(f"Last author:   {n_last} times")
    print(f"Mean position: {mean_norm:.2f} (0=first, 1=last)")

# Last expression of the cell: render the first 10 rows.
positions.head(10)

In [None]:
# ── Visualize position distribution ──────────────
# Skipped entirely when the author was not found in any author list.
if len(valid) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # One bin per integer position. align="left" centres each bar on its
    # integer tick rather than drawing it between k and k+1.
    abs_pos = valid["position"].dropna()
    axes[0].hist(abs_pos, bins=range(1, int(abs_pos.max()) + 2), align="left",
                 color="#9C27B0", edgecolor="white", alpha=0.8)
    axes[0].set_xlabel("Author Position")
    axes[0].set_ylabel("Count")
    axes[0].set_title("Absolute Position")

    axes[1].hist(valid["position_normalized"].dropna(), bins=20,
                 color="#009688", edgecolor="white", alpha=0.8)
    axes[1].set_xlabel("Normalized Position (0=first, 1=last)")
    axes[1].set_ylabel("Count")
    axes[1].set_title("Normalized Position")

    plt.suptitle(f"Author Position — {profile['name']}", fontsize=14)
    plt.tight_layout()
    plt.show()

## 7. Predict h-index

In [None]:
# ── 10-year h-index prediction (Acuna et al.) ────
import warnings

# Show every warning raised during the prediction; the previous warning
# filters are restored when the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter("always")
    prediction = predict_h_index(SCHOLAR_ID)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(prediction["years_ahead"], prediction["h_index"],
        marker="o", color="#E91E63", linewidth=2)
# Dashed reference line at the h-index currently reported on the profile.
ax.axhline(y=profile["h_index"], color="gray", linestyle="--", alpha=0.5, label="Current")
ax.set_xlabel("Years Ahead")
ax.set_ylabel("Predicted h-index")
ax.set_title(f"h-index Prediction — {profile['name']}")
ax.legend()
plt.tight_layout()
plt.show()

# Last expression of the cell: render the prediction table under the chart.
prediction

## 8. Export Results

In [None]:
# ── Save publications to CSV ─────────────────────
pubs.to_csv("publications.csv", index=False)
print("📁 Saved publications.csv")

# Download from Colab (imported here so the cell also works on its own)
from google.colab import files
files.download("publications.csv")

In [None]:
# ── Save all results to Excel (multi-sheet) ──────
# Imported locally so the cell does not depend on another cell having
# imported `files` first.
from google.colab import files

# One sheet per result table; the writer is closed (and the file flushed)
# when the `with` block exits.
with pd.ExcelWriter("scholar_report.xlsx") as writer:
    pubs.to_excel(writer, sheet_name="Publications", index=False)
    hist.to_excel(writer, sheet_name="Citation History", index=False)
    positions.to_excel(writer, sheet_name="Author Position", index=False)
    prediction.to_excel(writer, sheet_name="h-index Prediction", index=False)

print("📁 Saved scholar_report.xlsx")
files.download("scholar_report.xlsx")

---

## ⚠️ Rate Limiting Notes

- Google Scholar will block you (HTTP 429) if you make too many requests
- `time.sleep(1)` is added between detail calls — don't remove it
- For bulk operations, add longer delays
- If blocked, wait 5-10 minutes and try again
- Consider `set_scholar_mirror()` for alternative endpoints