In [1]:
from datetime import datetime
import requests
from rich import print
from bs4 import BeautifulSoup
from markdown_table_generator import (
    generate_markdown, table_from_string_list, Alignment
)

from jinja2 import Environment, FileSystemLoader

In [2]:
def _element_text(element):
    """Return ``element.text``, or an empty string when the element is missing."""
    return element.text if element is not None else ''


def crawl_paper_info(user_id, timeout=30):
    """Scrape the publication rows from a Google Scholar profile page.

    Args:
        user_id: Value sent as the ``user`` query parameter of the
            ``https://scholar.google.com/citations`` URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list with one dict per ``<tr class="gsc_a_tr">`` row found in the
        page. Each dict has the string fields ``title``, ``authors``,
        ``journal_info``, ``citations`` and ``year``. A field whose HTML
        element is absent from the row is returned as an empty string
        instead of aborting the whole crawl.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    # NOTE(review): only one page is requested; if the profile lists more
    # papers than the first page contains, the rest are presumably not
    # returned -- confirm against the live page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Let requests build the query string so the id is URL-encoded properly.
    response = requests.get(
        'https://scholar.google.com/citations',
        params={'user': user_id},
        headers=headers,
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception('Failed to retrieve the page.')

    soup = BeautifulSoup(response.text, 'html.parser')

    info = []
    # Each paper is one <tr class="gsc_a_tr"> row of the publications table.
    for tr_tag in soup.find_all('tr', class_='gsc_a_tr'):
        # The first gray <div> holds the authors, the second the venue text;
        # either may be missing, so never index blindly.
        gray_divs = tr_tag.find_all('div', class_='gs_gray')
        authors_div = gray_divs[0] if len(gray_divs) > 0 else None
        journal_div = gray_divs[1] if len(gray_divs) > 1 else None

        info.append({
            'title': _element_text(tr_tag.find('a', class_='gsc_a_at')),
            'authors': _element_text(authors_div),
            'journal_info': _element_text(journal_div),
            'citations': _element_text(tr_tag.find('a', class_='gsc_a_ac')),
            'year': _element_text(tr_tag.find('span', class_='gsc_a_hc')),
        })

    return info

In [3]:
# Download the publication list of the hard-coded Scholar profile and show it.
papers = crawl_paper_info(user_id='-TInzSAAAAAJ')
print(papers)

In [4]:
# Titles marked with a leading '*' (optionally inside an opening parenthesis)
# go into the main list; every other paper goes into the secondary list.
STAR_PREFIXES = ('*', '(*')

main_papers = [
    paper for paper in papers
    if paper['title'].startswith(STAR_PREFIXES)
]
other_papers = [
    paper for paper in papers
    if not paper['title'].startswith(STAR_PREFIXES)
]

print(main_papers)
print('-' * 80)
print(other_papers)

In [5]:
def make_paper_text(paper, max_authors=6, shown_authors=3):
    """Build the one-line Markdown citation text for a paper.

    Args:
        paper: Dict with the string keys ``title``, ``authors`` and
            ``journal_info``.
        max_authors: Author lists with more comma-separated names than this
            are shortened.
        shown_authors: Number of leading names kept when the list is
            shortened.

    Returns:
        ``"<title>, <authors>, *<journal_info>*"`` where every ``*`` marker
        has been removed from the title, a trailing ``", ..."`` in the
        authors is replaced by ``" et al."``, and over-long author lists are
        cut to the first ``shown_authors`` names followed by ``" et al."``.
    """
    # Drop the '*' markers but keep an opening parenthesis that carried one.
    title = paper['title'].replace('(*', '(').replace('*', '')
    authors = paper['authors'].replace(', ...', ' et al.')

    # Strip each name: splitting "A, B" on ',' leaves a leading space on "B",
    # and re-joining with ', ' would otherwise yield double spaces.
    names = [name.strip() for name in authors.split(',')]
    if len(names) > max_authors:
        authors = ', '.join(names[:shown_authors]) + ' et al.'

    return f"{title}, {authors}, *{paper['journal_info']}*"

In [6]:
def _papers_to_markdown(paper_list):
    """Turn a list of paper dicts into a left-aligned Markdown table.

    Args:
        paper_list: Iterable of dicts accepted by ``make_paper_text`` that
            also carry the keys ``citations`` and ``year``.

    Returns:
        A ``(table, markdown)`` tuple: the object returned by
        ``table_from_string_list`` and the string produced from it by
        ``generate_markdown``.
    """
    rows = [['Title', 'Citations', 'Year']]
    for paper in paper_list:
        # Every cell is passed through str() before it is handed to
        # table_from_string_list.
        row = [
            str(cell)
            for cell in (
                make_paper_text(paper),
                paper['citations'],
                paper['year'],
            )
        ]
        print(row)
        rows.append(row)

    table = table_from_string_list(rows, Alignment.LEFT)
    return table, generate_markdown(table)


# Table of the starred papers.
table1, markdown1 = _papers_to_markdown(main_papers)
print(markdown1)

# Table of all remaining papers.
table2, markdown2 = _papers_to_markdown(other_papers)
print(markdown2)

In [34]:
# Take the current local date and time once.
today = datetime.now()

# Render it as abbreviated month name, zero-padded day and four-digit year,
# e.g. "Jan 05, 2024".
formatted_date = f"{today:%b %d, %Y}"

print(formatted_date)

In [39]:
def _blue_bold(text):
    """Wrap ``text`` in the bold, blue (#0F52BA) HTML span used by the template."""
    return f'<span style="color:#0F52BA;"><b>{text}</b></span>'


# Templates are looked up relative to the current working directory.
env = Environment(loader=FileSystemLoader('.'))
template = env.get_template('index_template.md')

# Render BEFORE opening the output file: opening with 'w' truncates index.md
# immediately, so a template error raised inside the `with` block would
# leave an empty file behind.
rendered_index = template.render(
    table1=markdown1,
    table2=markdown2,
    today=formatted_date,
    L1=_blue_bold('o'),
    L2=_blue_bold('oo'),
    L3=_blue_bold('ooo'),
)

with open('index.md', 'w', encoding='utf-8') as f:
    f.write(rendered_index)

In [40]:
# Shell escape: run pandoc to convert index.md into a standalone (-s) HTML
# page index.html that references github-pandoc.css and has its title
# metadata set to "Sangwon Lee".
# NOTE(review): the exit status of the command is not checked here, so a
# pandoc failure would presumably go unnoticed by later cells -- confirm.
!pandoc index.md -s --css=github-pandoc.css --metadata title="Sangwon Lee" -o index.html  

In [43]:
import shutil

# Duplicate the generated page under the human-readable CV file name.
# The call is the last expression, so the destination path is displayed.
generated_html = 'index.html'
cv_html = 'Curriculum Vitae - Sangwon Lee.html'
shutil.copy(generated_html, cv_html)

'Curriculum Vitae - Sangwon Lee.html'