In [30]:
from googlesearch import search

def search_google_scholar_url(name, university, max_results=10):
    """Look up a researcher's Google Scholar profile URL through a web search.

    The query "<name> <university> google scholar" is passed to ``search`` and
    the results are scanned in order for the first URL that contains
    ``scholar.google.com/citations?``.

    Args:
        name: Researcher name used in the query.
        university: Institution name appended to the query.
        max_results: Forwarded to ``search`` as ``num_results``.

    Returns:
        The first matching URL, or None when no result matches.
    """
    search_terms = f"{name} {university} google scholar"
    print(f"🔍 Searching: {search_terms}")

    # Materialise every hit before scanning, as the search call is the only I/O here.
    hits = list(search(search_terms, num_results=max_results))
    profile_url = next(
        (hit for hit in hits if "scholar.google.com/citations?" in hit),
        None,
    )
    if profile_url is not None:
        print(f"✅ Found Google Scholar profile: {profile_url}")
    return profile_url

In [31]:
import requests
from bs4 import BeautifulSoup

def fetch_scholar_profile(url):
    """Download ``url`` and parse the response body as HTML.

    The request is sent with a browser-like User-Agent header and a 10 second
    timeout. Any ``requests.RequestException`` (including the one raised by
    ``raise_for_status`` for an error status) is reported with ``print`` and
    turned into a None result instead of propagating.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built with the 'html.parser' backend,
        or None when the request failed.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        parsed_page = BeautifulSoup(page.text, 'html.parser')
    except requests.RequestException as fetch_error:
        print(f"❌ Failed to fetch page: {fetch_error}")
        parsed_page = None
    return parsed_page

In [32]:
def _stripped_text(tag, default):
    """Return the whitespace-stripped text of ``tag``, or ``default`` when the tag is missing."""
    return tag.text.strip() if tag is not None else default


def parse_scholar_profile(soup, max_publications=5):
    """Extract profile details and the first few publications from a parsed page.

    Args:
        soup: Parsed profile page (an object offering ``find`` / ``find_all``),
            or None when fetching failed.
        max_publications: Maximum number of publication rows to read.
            Defaults to 5.

    Returns:
        None when ``soup`` is None. Otherwise a dict with the keys
        ``name``, ``affiliation``, ``citations``, ``h_index`` (all strings,
        with "Unknown" / "N/A" fallbacks when the element is absent) and
        ``publications``: a list of dicts with ``title``, ``link``,
        ``authors`` (strings) and ``year`` (int, or None when no purely
        numeric year is present).
    """
    if soup is None:
        return None

    # Metric cells are read by position: index 0 for citations, index 2 for h-index.
    # NOTE(review): this presumes the stats table alternates all-time / recent
    # columns; confirm against the current page markup.
    metrics = soup.find_all('td', class_='gsc_rsb_std')

    data = {
        'name': _stripped_text(soup.find('div', id='gsc_prf_in'), "Unknown"),
        'affiliation': _stripped_text(soup.find('div', class_='gsc_prf_il'), "N/A"),
        'citations': metrics[0].text.strip() if len(metrics) >= 1 else "N/A",
        'h_index': metrics[2].text.strip() if len(metrics) >= 3 else "N/A",
    }

    publications = []
    for article in soup.find_all('tr', class_='gsc_a_tr')[:max_publications]:
        title_tag = article.find('a', class_='gsc_a_at')

        # Use .get so an anchor without an href yields an empty link instead
        # of raising KeyError and aborting the whole parse.
        href = title_tag.get('href') if title_tag is not None else None

        year_text = _stripped_text(article.find('span', class_='gsc_a_h'), "")

        publications.append({
            'title': _stripped_text(title_tag, "Untitled"),
            'link': f"https://scholar.google.com{href}" if href else "",
            'authors': _stripped_text(article.find('div', class_='gs_gray'), ""),
            # Only purely numeric text is treated as a year; anything else maps to None.
            'year': int(year_text) if year_text.isdigit() else None,
        })

    data['publications'] = publications
    return data

In [33]:
def format_as_markdown(profile):
    """Render a parsed profile dict as a markdown summary.

    Args:
        profile: Dict with ``name``, ``affiliation``, ``citations``,
            ``h_index`` and ``publications`` (each publication a dict with
            ``title``, ``link``, ``authors`` and optionally ``year``), or None.

    Returns:
        The markdown text: a heading with the name, the affiliation and
        metrics, then one bullet per publication (title as a link, the year
        in parentheses when it is truthy, authors on an indented sub-bullet).
        When ``profile`` is None, a fixed error message is returned instead.
    """
    if profile is None:
        return "❌ Could not parse Google Scholar profile."

    chunks = [
        f"# {profile['name']}\n",
        f"**Affiliation**: {profile['affiliation']}\n\n",
        f"- **Citations**: {profile['citations']}\n",
        f"- **h-index**: {profile['h_index']}\n\n",
        "## Recent Publications\n",
    ]

    for paper in profile['publications']:
        chunks.append(f"- [{paper['title']}]({paper['link']})")
        year = paper.get('year')
        if year:
            # The year is optional; entries without one get no parenthetical.
            chunks.append(f" ({year})")
        chunks.append("\n")
        chunks.append(f"  - _{paper['authors']}_\n")

    return "".join(chunks)


In [34]:
from urllib.parse import urlparse, parse_qs, urlencode

def transform_scholar_url(original_url):
    """Rewrite a profile URL so it requests the works list sorted by publication date.

    The existing query parameters are kept (first value only for repeated
    keys), and ``view_op=list_works`` and ``sortby=pubdate`` are added or
    overwritten. The result is rebuilt from scheme, host, path and the new
    query string only, so any URL fragment is not carried over.

    Args:
        original_url: The profile URL to rewrite.

    Returns:
        The rewritten URL as a string.
    """
    parts = urlparse(original_url)

    # parse_qs maps each key to a list of values; keep just the first of each.
    params = {key: values[0] for key, values in parse_qs(parts.query).items()}

    # Force the listing view and date ordering, replacing any existing values.
    params.update(view_op='list_works', sortby='pubdate')

    return "{}://{}{}?{}".format(
        parts.scheme, parts.netloc, parts.path, urlencode(params)
    )


In [35]:
def get_scholar_markdown(name, university, min_year=2023):
    """Find a researcher's Google Scholar profile and summarise it as markdown.

    The steps are: search for the profile URL, rewrite it to the
    date-sorted works list, fetch and parse the page, then format the result.

    Args:
        name: Researcher name to search for.
        university: Institution name used to narrow the search.
        min_year: Earliest acceptable year for the most recent parsed
            publication. Defaults to 2023.

    Returns:
        The markdown summary, or None when no profile URL is found, the page
        cannot be fetched/parsed, or no parsed publication has an integer
        year of at least ``min_year``.
    """
    profile_url = search_google_scholar_url(name, university)
    if not profile_url:
        return None

    works_url = transform_scholar_url(profile_url)
    profile_data = parse_scholar_profile(fetch_scholar_profile(works_url))

    # A failed fetch yields None from the parser; bail out here rather than
    # hitting a TypeError when subscripting it below.
    if profile_data is None:
        return None

    # Only publications with an integer year take part in the recency check.
    years = [
        pub['year']
        for pub in profile_data['publications']
        if isinstance(pub['year'], int)
    ]
    if not years or max(years) < min_year:
        return None

    return format_as_markdown(profile_data)



In [36]:
# Example input: the researcher and institution to look up.
name, university = "Rafiou Agoro", "Tufts"

# Build the markdown summary (None when get_scholar_markdown gives up) and show it
# under a banner line.
markdown = get_scholar_markdown(name, university)
print("\n--- Google Scholar Markdown ---\n", markdown, sep="\n")

🔍 Searching: Rafiou Agoro Tufts google scholar
✅ Found Google Scholar profile: https://scholar.google.com/citations?user=m9-daTcAAAAJ&hl=en

--- Google Scholar Markdown ---

# Rafiou Agoro
**Affiliation**: The Jackson Laboratory

- **Citations**: 745
- **h-index**: 14

## Recent Publications
- [Dynamic single cell transcriptomics defines kidney FGF23/KL bioactivity and novel segment-specific inflammatory targets.](https://scholar.google.com/citations?view_op=view_citation&hl=en&user=m9-daTcAAAAJ&sortby=pubdate&citation_for_view=m9-daTcAAAAJ:aqlVkmm33-oC) (2025)
  - _R Agoro, J Myslinski, YG Marambio, D Janosevic, KN Jennings, S Liu, ..._
- [Phosphorus-independent role of FGF23 in erythropoiesis and iron homeostasis](https://scholar.google.com/citations?view_op=view_citation&hl=en&user=m9-daTcAAAAJ&sortby=pubdate&citation_for_view=m9-daTcAAAAJ:9ZlFYXVOiuMC) (2024)
  - _MY Park, R Agoro, SS Jankauskas, C Le Henaff, D Sitara_
- [Challenges and opportunities for conceiving genetically dive

In [37]:
import re

def extract_titles(markdown: str) -> str:
    """Collect every square-bracketed span in ``markdown`` into one string.

    Args:
        markdown: Markdown text to scan. None or an empty string is
            tolerated and treated as "no titles".

    Returns:
        The bracketed spans (without the brackets), in order of appearance,
        joined with ". ". An empty string when there is nothing to extract.
    """
    # Guard against a missing summary so callers get "" rather than a
    # TypeError from re.findall.
    if not markdown:
        return ""

    # Non-greedy match: each span ends at the first closing bracket.
    titles = re.findall(r'\[(.*?)\]', markdown)
    return '. '.join(titles)

# Join every bracketed title from the generated markdown into one string.
# `markdown` may be None here, since get_scholar_markdown returns None on its failure paths.
titles = extract_titles(markdown)

In [38]:
import re

def extract_first_paper_info(markdown: str):
    """Return the title and URL of the first markdown link in ``markdown``.

    Args:
        markdown: Markdown text to scan. None or an empty string is
            tolerated and treated as "no link".

    Returns:
        A ``(title, link)`` tuple taken from the first ``[Title](URL)``
        occurrence, or ``(None, None)`` when there is no such link.
    """
    # Guard against a missing summary so callers get (None, None) rather
    # than a TypeError from re.search.
    if not markdown:
        return None, None

    # Match the first markdown-style link: [Title](URL)
    match = re.search(r'\[(.*?)\]\((.*?)\)', markdown)
    if match is None:
        return None, None
    return match.group(1), match.group(2)