In [22]:
import os
from os import path as op
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd

# selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def get_request(url, timeout=30):
    """Fetch ``url`` and return the response body as text.

    The request is retried indefinitely, with a one-second pause between
    attempts, whenever ``requests`` raises a ``RequestException`` (this
    includes timeouts and the ``HTTPError`` raised for non-404 error statuses).

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the server before the attempt is abandoned
            and retried. Without a timeout a stalled connection would block
            forever and the retry loop would never get a chance to run.

    Returns:
        The response text, or ``None`` when the server answers with HTTP 404.
    """
    while True:
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 404:
                return None
            # Raises HTTPError for 4xx/5xx statuses so they are retried below.
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as err:
            print(f"Network error occurred: {err}. Retrying...")
            time.sleep(1)  # Wait for 1 second before retrying

def get_authors(li):
    """Return the authors of a dblp entry as one comma-separated string.

    Looks inside the entry's ``cite`` element for every ``span`` tagged with
    ``itemprop="author"`` and joins their text with ``', '``.
    """
    cite = li.find('cite')
    author_spans = cite.find_all('span', attrs={'itemprop': 'author'})
    return ', '.join(span.text for span in author_spans)

def get_paper_title(li):
    """Return the title of a dblp entry without its trailing period.

    The title is read from the entry's ``span`` carrying ``itemprop="name"``
    and ``class="title"``. A single trailing ``'.'`` is removed; any other
    text, including an empty title, is returned unchanged.
    """
    title = li.find('span', attrs={'itemprop': 'name', 'class': 'title'}).text
    # endswith() also works on an empty title, where title[-1] would raise.
    if title.endswith('.'):
        title = title[:-1]
    return title

def get_journal_conf_dblp_abbr(li):
    """Return the second ``/``-separated segment of the entry's ``id``.

    For an id such as ``'journals/tkde/Foo23'`` this yields ``'tkde'``.
    """
    id_segments = li['id'].split('/')
    return id_segments[1]

def get_year(li):
    """Return the text of the entry's ``itemprop="datePublished"`` span."""
    date_span = li.find('span', attrs={'itemprop': 'datePublished'})
    return date_span.text

# text excluding children's text
def get_text_excluding_children(driver, element):
    """Return the text held directly by ``element``, ignoring nested elements.

    Runs a script in the browser through ``driver.execute_script`` that keeps
    only the element's own text nodes and returns their combined text. The
    script calls ``jQuery``, so it relies on the loaded page providing it.
    """
    own_text_script = """
    return jQuery(arguments[0]).contents().filter(function() {
        return this.nodeType == Node.TEXT_NODE;
    }).text();
    """
    return driver.execute_script(own_text_script, element)

# EI
def get_ei_result(title, k=5, time_limit=60):
    """Check whether ``title`` appears among Engineering Village search results.

    Opens a headless Edge browser, types ``title`` into the quick-search box
    and compares it with the titles of the first ``k`` results. Results whose
    document type reads ``'Preprint (PP)'`` are skipped.

    Args:
        title: Paper title to look up; surrounding whitespace is stripped.
        k: Number of leading search results to inspect.
        time_limit: Maximum number of one-second retries while waiting for the
            search box, and again while waiting for the first result.

    Returns:
        ``True`` if an inspected result's title equals ``title`` exactly,
        ``False`` if none does, ``None`` if the page was not ready in time.
    """
    def _poll(locate):
        # Call `locate` once per second until it stops raising; None on timeout.
        # `except Exception` (not a bare except) keeps Ctrl-C working here.
        waited = 0
        while True:
            try:
                return locate()
            except Exception:
                if waited > time_limit - 1:
                    return None
                time.sleep(1)
                waited += 1

    title = title.strip()
    ev_url = 'https://www.engineeringvillage.com/search/quick.url' # Engineering Village search url

    # set options
    options = webdriver.EdgeOptions()
    # NOTE(review): the user agent is empty, so this argument carries no value.
    useragent = ''
    options.add_argument("user-agent:{}".format(useragent))
    options.add_argument('--headless')

    driver = webdriver.Edge(options=options)
    try:
        driver.get(ev_url)

        search_box = _poll(lambda: driver.find_element(By.ID, "search-word-1"))
        if search_box is None:
            return None
        time.sleep(1)
        # typing and searching simulation
        search_box.send_keys(title)
        search_box.send_keys(Keys.ENTER)

        # "result_0" is used as the signal that the result page has rendered.
        search_list = _poll(lambda: driver.find_element(By.ID, "result_0"))
        if search_list is None:
            return None
        time.sleep(1)
        search_titles = search_list.find_elements(By.XPATH, '//h3[@class="result-title"]')[:k]

        for search_title in search_titles:
            # NOTE(review): an XPath starting with '//' is resolved against the
            # whole document, so this probably always reads the first result's
            # document type -- confirm against the page structure.
            doc_type = search_title.find_element(By.XPATH, '//div[@class="row db-doctype-info"]').find_element(By.TAG_NAME, 'div')
            if get_text_excluding_children(driver, doc_type).strip() == 'Preprint (PP)':
                # skip the PrePrint paper
                continue
            # Compare the current result, not always the first one.
            search_t_ele = search_title.find_element(By.TAG_NAME, 'a')
            search_t = get_text_excluding_children(driver, search_t_ele).strip()
            if search_t == title:
                return True
        return False
    finally:
        # quit() ends the whole browser session on every exit path, including
        # the timeout returns and exceptions raised above.
        driver.quit()

# SCI
def get_sci_result(title, k=5, time_limit=60):
    """Check whether ``title`` appears among Web of Science search results.

    Opens a headless Edge browser, accepts the cookie banner, submits ``title``
    on the basic-search page and compares it with the text of the first ``k``
    result links.

    Args:
        title: Paper title to look up; surrounding whitespace is stripped.
        k: Number of leading search results to inspect.
        time_limit: Maximum number of one-second retries while waiting for the
            search controls; also used as the implicit wait once the search
            has been submitted.

    Returns:
        ``True`` if an inspected result's text equals ``title`` exactly,
        ``False`` if none does or the search did not leave the search page,
        ``None`` if the page was not ready in time.
    """
    def _poll(locate):
        # Call `locate` once per second until it stops raising; None on timeout.
        # `except Exception` (not a bare except) keeps Ctrl-C working here.
        waited = 0
        while True:
            try:
                return locate()
            except Exception:
                if waited > time_limit - 1:
                    return None
                time.sleep(1)
                waited += 1

    # Strip like get_ei_result does; results are compared after .strip() too.
    title = title.strip()
    wos_search_url = 'https://www.webofscience.com/wos/alldb/basic-search'

    # set options
    options = webdriver.EdgeOptions()
    # NOTE(review): the user agent is empty, so this argument carries no value.
    useragent = ''
    options.add_argument("user-agent:{}".format(useragent))
    options.add_argument('--headless')

    driver = webdriver.Edge(options=options)
    try:
        # navigate to search page
        driver.get(wos_search_url)

        def _locate_search_controls():
            # All three controls must be present before the search can start.
            return (
                driver.find_element(By.XPATH, '//button[@id="onetrust-accept-btn-handler"]'),
                driver.find_element(By.ID, "mat-input-0"),
                driver.find_element(By.XPATH, '//button[@data-ta="run-search"]'),
            )

        controls = _poll(_locate_search_controls)
        if controls is None:
            return None
        accept_cookies_btn, search_box, search_btn = controls
        time.sleep(1)
        accept_cookies_btn.click() # accept cookies (if do not do so, the search button will be intercepted)
        search_box.send_keys(title) # send title to search box

        # Remember where we are so the "URL did not change" check below does
        # not depend on which host the site redirected the search page to.
        search_page_url = driver.current_url
        search_btn.click() # click search

        # if url is not change : return False (no result in wos)
        time.sleep(1)
        if driver.current_url == search_page_url:
            return False

        driver.implicitly_wait(time_limit)
        # NOTE(review): raises if this guide pop-up never appears -- confirm
        # that it is always shown on the result page.
        close_tips = driver.find_element(By.XPATH, '//button[@aria-label="Close" and @id="pendo-close-guide-30f847dd"]')
        close_tips.click()

        search_list = _poll(
            lambda: driver.find_elements(By.XPATH, '//a[@data-ta="summary-record-title-link"]')
        )
        if search_list is None:
            return None
        time.sleep(1)

        # check if the first k related search result is the input title
        return any(cand.text.strip() == title for cand in search_list[:k])
    finally:
        # quit() ends the whole browser session on every exit path, including
        # the timeout returns and exceptions raised above.
        driver.quit()

In [13]:
# dblp person ids of the author pages to scan; the trailing comments are the
# short tags the original list used for each person.
dblp_pids = [
    '59/1007',     # hxn
    '31/2864-10',  # wx
    '183/9198',    # ffl
    '63/1591-8',   # ws
    '96/1538',     # hyb
    '257/4945',    # wjc
]
urls_dblp = ['https://dblp.uni-trier.de/pid/{}.html'.format(pid) for pid in dblp_pids]

# Fetch and parse the first author page.
hxn_html = get_request(urls_dblp[0])
soup = BeautifulSoup(hxn_html, 'html.parser')

In [14]:
# Collect the journal and conference entries of TARGET_YEAR from every author
# page, keeping each title only once per category.
TARGET_YEAR = '2023'

journal_paper_list = []
journal_paper_title_set = set()
conference_paper_list = []
conference_paper_title_set = set()

# Entry kind (second CSS class of the <li>) -> (collected entries, seen titles).
# Kinds not listed here, e.g. 'informal', are ignored.
collectors = {
    'article': (journal_paper_list, journal_paper_title_set),
    'inproceedings': (conference_paper_list, conference_paper_title_set),
}

for url in urls_dblp:
    dblp_html = get_request(url)
    if dblp_html is None:
        # get_request returns None on HTTP 404; there is nothing to parse.
        continue
    soup = BeautifulSoup(dblp_html, 'html.parser')
    paper_list_cell = soup.find_all('ul', attrs={'class': 'publ-list'})[0]
    for li in paper_list_cell.children:
        if isinstance(li, str):
            # Bare text between tags (BeautifulSoup yields it as a str
            # subclass) has no attributes to inspect.
            continue
        if len(li['class']) <= 1 and li['class'][0] == 'year':
            # Year heading: keep going inside TARGET_YEAR, stop at the first
            # other year (presumably the list is newest-first -- confirm).
            if li.text == TARGET_YEAR:
                continue
            break
        collector = collectors.get(li['class'][1])
        if collector is None:
            continue
        paper_list, seen_titles = collector
        title = get_paper_title(li)
        if title in seen_titles:
            continue
        seen_titles.add(title)
        paper_list.append(li)


KeyboardInterrupt: 

In [None]:
def _paper_info(entry):
    """Gather the exported fields of one dblp entry into a dict."""
    return {
        'authors': get_authors(entry),
        'title': get_paper_title(entry),
        'year': get_year(entry),
        'abbr': get_journal_conf_dblp_abbr(entry),
        # 'article' entries are journal papers; everything else counts as a conference paper
        'type': 'journal' if entry['class'][1] == 'article' else 'conference',
    }


# Journal papers first, then conference papers.
paper_infos = [_paper_info(entry) for entry in journal_paper_list + conference_paper_list]

print(len(paper_infos))

74


In [None]:
# Distinct venue abbreviations, split by publication type.
jour_abbrs = {info['abbr'] for info in paper_infos if info['type'] == 'journal'}
conf_abbrs = {info['abbr'] for info in paper_infos if info['type'] != 'journal'}
print(conf_abbrs)
print(jour_abbrs)

{'icde', 'aaai', 'www', 'kdd', 'iclr', 'recsys', 'ijcai', 'sigir', 'cikm', 'cvpr', 'mm', 'wsdm', 'icse', 'acl', 'kddcd'}
{'pami', 'tkde', 'prl', 'tcss', 'tmm', 'tsmc', 'ijautcomp', 'tois', 'fcsc', 'tomccap'}


In [23]:
import traceback

# Look up every paper in EI and SCI. A result of None means "not determined
# yet", so re-running this cell only repeats the lookups that are missing.
for p_info in paper_infos:
    title = p_info['title']
    try:
        # Each lookup opens its own browser, so a result that is already known
        # is kept rather than being fetched again or lost when the other
        # lookup fails.
        if p_info.get('ei') is None:
            p_info['ei'] = get_ei_result(title, time_limit=30)
        if p_info.get('sci') is None:
            p_info['sci'] = get_sci_result(title, time_limit=30)
    except Exception:
        with open('running_exception.log', 'a', encoding='utf-8') as f:
            print('-'*100, file=f)
            print('Error occurred with paper : {}'.format(p_info['title']), file=f)
            # Record what actually went wrong, not only which paper failed.
            traceback.print_exc(file=f)
    print(p_info)


{'authors': 'Yuan Gao, Xiang Wang, Xiangnan He, Huamin Feng, Yong-Dong Zhang', 'title': 'Rumor detection with self-supervised learning on texts and social graph', 'year': '2023', 'abbr': 'fcsc', 'type': 'journal', 'ei': True, 'sci': True}
{'authors': 'Xiang Wang, Yingxin Wu, An Zhang, Fuli Feng, Xiangnan He, Tat-Seng Chua', 'title': 'Reinforced Causal Explainer for Graph Neural Networks', 'year': '2023', 'abbr': 'pami', 'type': 'journal', 'ei': True, 'sci': True}
{'authors': 'Kang Liu, Feng Xue, Xiangnan He, Dan Guo, Richang Hong', 'title': 'Joint Multi-Grained Popularity-Aware Graph Convolution Collaborative Filtering for Recommendation', 'year': '2023', 'abbr': 'tcss', 'type': 'journal', 'ei': True, 'sci': False}
{'authors': 'Jintang Li, Tao Xie, Liang Chen, Fenfang Xie, Xiangnan He, Zibin Zheng', 'title': 'Adversarial Attack on Large Scale Graph', 'year': '2023', 'abbr': 'tkde', 'type': 'journal', 'ei': True, 'sci': False}
{'authors': 'Fuli Feng, Xiangnan He, Hanwang Zhang, Tat-Seng

In [27]:
# add full name and level
import json

# Read the lookup tables as UTF-8 explicitly instead of relying on the
# platform's default text encoding.
with open('conf_abbr2ref.json', 'r', encoding='utf-8') as f:
    conf_ref = json.load(f)
with open('jour_abbr2ref.json', 'r', encoding='utf-8') as f:
    jour_ref = json.load(f)

for p_info in paper_infos:
    abbr = p_info['abbr']
    p_type = p_info['type']

    if p_type == 'journal':
        ref = jour_ref[abbr]
        level = ref['level']
        # Label reads "CAS zone <level>".
        level_save = '中科院{}区'.format(level)
    elif p_type == 'conference':
        ref = conf_ref[abbr]
        level = ref['level']
        # Label reads "none" for unranked venues, otherwise "CCF class <level>".
        level_save = '无' if level == 'No' else 'CCF {}类'.format(level)
    else:
        # Fail loudly instead of reusing the previous paper's level and name.
        raise ValueError('Unknown publication type: {!r}'.format(p_type))
    full_name = ref['full_name']
    p_info.update({'level': level_save, 'full_name': full_name})


In [34]:
# save as pd
def _index_tags(p_info):
    """Return 'SCI', 'EI', 'SCI,EI' or '' depending on the paper's lookup flags."""
    tags = []
    if p_info['sci']:
        tags.append('SCI')
    if p_info['ei']:
        tags.append('EI')
    return ','.join(tags)


# Column headers, in order: No., authors, paper title, journal/conference
# name, category, SCI/EI indexed, year.
pd_data = {
    '序号': list(range(1, len(paper_infos) + 1)),
    '作者': [p_info['authors'] for p_info in paper_infos],
    '文章名称': [p_info['title'] for p_info in paper_infos],
    '期刊/会议名称': [p_info['full_name'] for p_info in paper_infos],
    '类别': [p_info['level'] for p_info in paper_infos],
    '是否SCI/EI': [_index_tags(p_info) for p_info in paper_infos],
    '年份': [p_info['year'] for p_info in paper_infos],
}

# Sanity check: every column should have the same number of rows.
for key, column in pd_data.items():
    print('{} : {}'.format(key, len(column)))
df = pd.DataFrame(pd_data)
df.to_excel('out.xlsx', sheet_name='2023', index=False)
    

序号 : 74
作者 : 74
文章名称 : 74
期刊/会议名称 : 74
类别 : 74
是否SCI/EI : 74
年份 : 74


In [None]:
# Debug dump: CSS classes, authors, title, year and venue abbreviation of
# every collected journal entry, one value per line.
for paper in journal_paper_list:
    print(paper['class'])
    for extract in (get_authors, get_paper_title, get_year, get_journal_conf_dblp_abbr):
        print(extract(paper))

['entry', 'article', 'toc']
Yuan Gao, Xiang Wang, Xiangnan He, Huamin Feng, Yong-Dong Zhang
Rumor detection with self-supervised learning on texts and social graph
2023
fcsc
['entry', 'article', 'toc']
Xiang Wang, Yingxin Wu, An Zhang, Fuli Feng, Xiangnan He, Tat-Seng Chua
Reinforced Causal Explainer for Graph Neural Networks
2023
pami
['entry', 'article', 'toc']
Kang Liu, Feng Xue, Xiangnan He, Dan Guo, Richang Hong
Joint Multi-Grained Popularity-Aware Graph Convolution Collaborative Filtering for Recommendation
2023
tcss
['entry', 'article', 'toc']
Jintang Li, Tao Xie, Liang Chen, Fenfang Xie, Xiangnan He, Zibin Zheng
Adversarial Attack on Large Scale Graph
2023
tkde
['entry', 'article', 'toc']
Fuli Feng, Xiangnan He, Hanwang Zhang, Tat-Seng Chua
Cross-GCN: Enhancing Graph Convolutional Network with $k$k-Order Feature Interactions
2023
tkde
['entry', 'article', 'toc']
Yu Zheng, Chen Gao, Xiangnan He, Depeng Jin, Yong Li
Incorporating Price into Recommendation With Graph Convolutional

In [None]:
# Debug dump: CSS classes, authors, title, year and venue abbreviation of
# every collected conference entry, one value per line.
for paper in conference_paper_list:
    print(paper['class'])
    for extract in (get_authors, get_paper_title, get_year, get_journal_conf_dblp_abbr):
        print(extract(paper))

['entry', 'inproceedings', 'toc']
Changyi Xiao, Xiangnan He, Yixin Cao
Knowledge Graph Embedding by Normalizing Flows
2023
aaai
['entry', 'inproceedings', 'toc']
Xun Deng, Wenjie Wang, Fuli Feng, Hanwang Zhang, Xiangnan He, Yong Liao
Counterfactual Active Learning for Out-of-Distribution Generalization
2023
acl
['entry', 'inproceedings', 'toc']
Wenjie Wang, Yong Liu, Yang Zhang, Weiwen Liu, Fuli Feng, Xiangnan He, Aixin Sun
The 1st Workshop on Recommendation with Generative Models
2023
cikm
['entry', 'inproceedings', 'toc']
Zhicai Wang, Yanbin Hao, Tingting Mu, Ouxiang Li, Shuo Wang, Xiangnan He
Bi-Directional Distribution Alignment for Transductive Zero-Shot Learning
2023
cvpr
['entry', 'inproceedings', 'toc']
Meng Jiang, Yang Zhang, Yuan Gao, Yansong Wang, Fuli Feng, Xiangnan He
LightMIRM: Light Meta-learned Invariant Risk Minimization for Trustworthy Loan Default Prediction
2023
icde
['entry', 'inproceedings', 'toc']
Yang Liu, Liang Chen, Xiangnan He, Jiaying Peng, Zibin Zheng, Jie 