# 数据获取部分教程

### 任务要求：
 * 创建一个名为`hello_world`的文献集
 * 分别在IEEE以"good evening"为检索词，在Scopus以"good morning"为检索词，获取检索结果前5条（两个数据源共10条）文献元数据，加入`hello_world`文献集
 * 获取这10篇文献的原文，保存在`./data/stay_home`目录下，并将每篇文献的原文地址记录在元数据中。

In [1]:
# Initialise the session: create the base objects that the later cells share.
import os
# NOTE(review): hard-coded, machine-specific project root -- change it to your own
# checkout. The working directory is switched *before* the project imports below,
# presumably so that `data_fetcher` and `data_platform` resolve from the project root.
os.chdir('D:\\大三下\\a\\系统\\Project-KnowNet')
from data_fetcher.id_manager import IDManager
from data_platform.config import ConfigManager
from data_platform.datasource.mongodb import MongoDBDS
from pathlib import Path
import logging
# Show INFO-level log records in the cell output.
logging.basicConfig(level=logging.INFO)

# Current working directory (the project root set above) as a Path object.
current_location = Path(os.getcwd())
# Configuration handed to the MongoDB data source here and to the ID manager later.
# NOTE(review): `'uri': None` presumably selects a default MongoDB instance --
# confirm against ConfigManager / MongoDBDS.
config = ConfigManager({
    'init':{
        'uri': None,
        'database': 'db'
    }
})

mgdbds = MongoDBDS(config=config)
# Empty the existing database contents first so the tutorial starts from a clean state.
# WARNING: this is destructive; do not run it against a database you want to keep.
mgdbds.clear()

In [2]:
# Create the ID manager that assigns an ID to every paper.
# NOTE(review): judging by the recorded output, `key` names the collection/field
# that identifies a paper ('paper_id' documents carry a 'title'), and `auto_inc`
# names the counter collection and the counter document ('id_inc' holds
# {'_id': 'paper_id', 'sequence_value': ...}) -- confirm against IDManager.
pim = IDManager(
    config=config,
    key=('paper_id', 'title'),
    auto_inc=('id_inc', 'paper_id'),
)

# Inspect the database right after the ID manager has been created:
# first the collection names, then the content of the auto-increment collection.
collection_names = mgdbds.get_db().list_collection_names()
print(collection_names)
id_counters = list(mgdbds.get_db()['id_inc'].find())
print(id_counters)

['id_inc']
[{'_id': 'paper_id', 'sequence_value': 0}]


In [4]:
# Crawl IEEE first.
from data_fetcher.ieee.ieee_retrieval import IEEERetrieval

# Search IEEE for "good evening" and attach the results to the `hello_world`
# paper set; paper IDs are assigned through the shared ID manager `pim`.
ir = IEEERetrieval(
    query='good evening',
    offset=0,
    num_result=5,
    paper_id_manager=pim,
    paper_set='hello_world',
)

# Run the search; the returned records are reused by the full-text step.
ir_res = ir.retrieve()

# Record the search results in the database.
ir.save(mgdbds)

crawling metadata page: 0 / 1
finished crawling metadata pages.
and get metadata of 100 papers
INFO:root:adding paper with id 0 in paper_set hello_world
INFO:root:paper_set hello_world not found, creating a new one.
INFO:root:adding paper with id 1 in paper_set hello_world
INFO:root:adding paper with id 2 in paper_set hello_world
INFO:root:adding paper with id 3 in paper_set hello_world
INFO:root:adding paper with id 4 in paper_set hello_world


In [6]:
from data_fetcher.ieee.ieee_fulltext_spider import IEEEFulltextSpider

# IEEE article numbers of the retrieved records; each one identifies a full text to fetch.
article_numbers = [item['IEEEArticleNumber'] for item in ir_res]

for article_number in article_numbers:
    # NOTE: the crawl only succeeds from inside the campus network!
    ifs = IEEEFulltextSpider(article_number=article_number, request_interval=5)

    # Crawl the PDF; the result is the path where the crawled file was stored.
    ifs_result = ifs.execute()

    # Not every full text can be fetched. A failed crawl emits an ERROR log
    # record and yields a falsy result, in which case there is nothing to record.
    if not ifs_result:
        continue

    # Update the `uri` field of the matching metadata document in the database.
    # This could be folded into the FullTextSpider class, but that would increase
    # coupling, so the decision is still open.
    mgdbds.query_and_update_doc(
        docset='metadata',
        query={'IEEEArticleNumber': article_number},
        val={'$set': {'uri': ifs_result}},
    )
    # Future idea: switch to multithreading so that the rest of the program can
    # keep running while IEEE is being crawled.

# At this point ./data should already contain a few PDFs.
# Inspect what is in the database now.
print(mgdbds.get_db().list_collection_names())
for collection_name in ('paper_id', 'paper_set'):
    print(list(mgdbds.get_db()[collection_name].find()))
# Only the last metadata document is shown -- note its `uri` field.
print(list(mgdbds.get_db()['metadata'].find())[-1])
print(list(mgdbds.get_db()['id_inc'].find()))

ERROR:root:IEEEFulltextSpider, articleNumber = 4091621 PDF URL not found. Exception: list index out of range
['metadata', 'paper_set', 'paper_id', 'id_inc']
[{'_id': 0, 'title': 'Diurnal Variations of Atmospheric Noise in the Evening Transition Period'}, {'_id': 1, 'title': 'Towards Miss Universe automatic prediction: The evening gown competition'}, {'_id': 2, 'title': 'Circuits evening panel discussion 1: Is university circuit design research and education keeping up with industry needs?'}, {'_id': 3, 'title': 'Technology/circuits joint evening panel discussion semiconductor industry in 2020: Evolution or revolution?'}, {'_id': 4, 'title': 'Technology / circuits joint evening panel discussion semiconductor industry in 2020: Evolution or revolution? Tuesday, June 16, 20:00–22:00'}, {'_id': 5, 'title': 'Short-term FFBS demand prediction with multi-source data in a hybrid deep learning framework'}, {'_id': 6, 'title': 'BBC World Service: schedule operation and the new Bush House control 

In [7]:
# Then crawl Scopus.
from data_fetcher.scopus.scopus_retrieval import ScopusRetrieval

# Before crawling, set up the config file under ./data_fetcher/scopus; see ./data_fetcher/README.md for details.
sr = ScopusRetrieval(query='good morning', num_result=5)  # initialise the Scopus search interface object
sr.retrieve()       # run the search
sr_doi_list = sr.get_doi_list()     # collect the DOIs of the search results; they drive the metadata and full-text crawls below
print(sr_doi_list)

INFO:data_fetcher.dependencies.elsapy.elsclient:Module loaded.
INFO:data_fetcher.dependencies.elsapy.elssearch:Module loaded.
INFO:data_fetcher.dependencies.elsapy.elsclient:Sending GET request to https://api.elsevier.com/content/search/scopus?query=good%20morning
Number of results got with query good%20morning : 25
['10.1080/20008198.2020.1723857', '10.1038/s41598-020-61386-4', '10.1038/s41533-020-0163-5', '10.1038/s41598-020-57976-x', '10.1038/s41598-020-57661-z']


In [10]:
from data_fetcher.scopus.scopus_metadata_spider import ScopusMetadataSpider
from data_fetcher.scopus.scopus_fulltext_spider import ScopusFulltextSpider

# For every DOI returned by the Scopus search: crawl and store its metadata in
# the `hello_world` paper set, then try to fetch the full text and record where
# it was stored.
for sr_doi in sr_doi_list:
    # Crawl the metadata and save it to the database.
    sms = ScopusMetadataSpider(
        doi=sr_doi,
        paper_id_manager=pim,
        paper_set='hello_world',
    )
    sms_result = sms.execute()
    sms.save(mgdbds)

    # Crawl the full text, then update the metadata.
    # Some articles may not have a full text available.
    # NOTE: the crawl only succeeds from inside the campus network!
    # Not every full text can be fetched; on failure the service returns an error JSON such as:
    # {"status":{"statusCode":"RESOURCE_NOT_FOUND","statusText":"The resource specified cannot be found."}
    sfs = ScopusFulltextSpider(doi=sr_doi)
    sfs_result = sfs.execute()

    # Only record a URI when the full-text crawl produced a result.
    if sfs_result:
        mgdbds.query_and_update_doc(
            docset='metadata',
            query={'doi': sr_doi},
            val={'$set': {'uri': sfs_result}},
        )

INFO:data_fetcher.dependencies.elsapy.elsclient:Sending GET request to https://api.elsevier.com/content/abstract/doi/10.1080/20008198.2020.1723857
INFO:data_fetcher.dependencies.elsapy.elsentity:Data loaded for https://api.elsevier.com/content/abstract/doi/10.1080/20008198.2020.1723857
INFO:data_fetcher.dependencies.elsapy.elsclient:Sending GET request to https://api.elsevier.com/content/article/doi/10.1080/20008198.2020.1723857
and using headers {'X-ELS-APIKey': '<REDACTED>', 'User-Agent': 'elsapy-v0.4.6', 'Accept': 'application/json'}:
{"service-error":{"status":{"statusCode":"RESOURCE_NOT_FOUND","statusText":"The resource specified cannot be found."}}}
ERROR:root:ScopusFulltextSpider, getting doi = 10.1080/20008198.2020.1723857failed. Exception: ['HTTP 404 Error from https://api.elsevier.com/content/article/doi/10.1080/20008198.2020.1723857\nand using headers {\'X-ELS-APIKey\': \'<REDACTED>\', \'User-Agent\': \'elsapy-v0.4.6\', \'Accept\':

In [11]:
# Final check of the database contents.
print(mgdbds.get_db().list_collection_names())
print(list(mgdbds.get_db()['id_inc'].find()))
print(list(mgdbds.get_db()['paper_set'].find()))
# Printing every document would be far too long, so only the totals are shown.
# count_documents({}) lets the server count instead of pulling every document
# into a client-side list just to take its length (get_db() is used as a PyMongo
# Database here, as the list_collection_names()/find() calls above indicate).
print(mgdbds.get_db()['paper_id'].count_documents({}))
print(mgdbds.get_db()['metadata'].count_documents({}))

['metadata', 'paper_set', 'paper_id', 'id_inc']
[{'_id': 'paper_id', 'sequence_value': 372}]
[{'_id': ObjectId('5ea275c469801f65e85522fb'), 'set_name': 'hello_world', 'paper': [0, 1, 2, 3, 4, 158, 193, 269, 334, 371]}]
372
10


In [12]:
# Suppose the metadata retrieved for "good morning" must also be added to another paper_set, "hello_python".
# The spider does not need to know whether these records already exist in the metadata collection or in some paper_set (duplicate detection could be added later).
# So everything is crawled again, saved to the metadata collection, and then added to the "hello_python" paper_set.
# NOTE(review): per the recorded output of the next cell, the re-crawled papers keep
# their existing ids and the metadata total stays at 10 -- confirm how save() de-duplicates.

from data_fetcher.scopus.scopus_metadata_spider import ScopusMetadataSpider
from data_fetcher.scopus.scopus_fulltext_spider import ScopusFulltextSpider

for sr_doi in sr_doi_list:
    # Crawl the metadata and save it, this time into the "hello_python" paper set.
    sms = ScopusMetadataSpider(
        doi=sr_doi,
        paper_id_manager=pim,
        paper_set='hello_python'
    )
    sms_result = sms.execute()
    sms.save(mgdbds)

INFO:data_fetcher.dependencies.elsapy.elsclient:Sending GET request to https://api.elsevier.com/content/abstract/doi/10.1080/20008198.2020.1723857
INFO:data_fetcher.dependencies.elsapy.elsentity:Data loaded for https://api.elsevier.com/content/abstract/doi/10.1080/20008198.2020.1723857
INFO:data_fetcher.dependencies.elsapy.elsclient:Sending GET request to https://api.elsevier.com/content/abstract/doi/10.1038/s41598-020-61386-4
INFO:data_fetcher.dependencies.elsapy.elsentity:Data loaded for https://api.elsevier.com/content/abstract/doi/10.1038/s41598-020-61386-4
INFO:data_fetcher.dependencies.elsapy.elsclient:Sending GET request to https://api.elsevier.com/content/abstract/doi/10.1038/s41533-020-0163-5
INFO:data_fetcher.dependencies.elsapy.elsentity:Data loaded for https://api.elsevier.com/content/abstract/doi/10.1038/s41533-020-0163-5
INFO:data_fetcher.dependencies.elsapy.elsclient:Sending GET request to https://api.elsevier.com/content/abstract/doi/10.1038/s41598-020-57976-x
INFO:data

In [13]:
# Final check of the database contents.
print(mgdbds.get_db().list_collection_names())
print(list(mgdbds.get_db()['id_inc'].find()))
print(list(mgdbds.get_db()['paper_set'].find()))
# Printing every document would be far too long, so only the totals are shown.
# count_documents({}) lets the server count instead of pulling every document
# into a client-side list just to take its length (get_db() is used as a PyMongo
# Database here, as the list_collection_names()/find() calls above indicate).
print(mgdbds.get_db()['paper_id'].count_documents({}))
print(mgdbds.get_db()['metadata'].count_documents({}))

['metadata', 'paper_set', 'paper_id', 'id_inc']
[{'_id': 'paper_id', 'sequence_value': 372}]
[{'_id': ObjectId('5ea275c469801f65e85522fb'), 'set_name': 'hello_world', 'paper': [0, 1, 2, 3, 4, 158, 193, 269, 334, 371]}, {'_id': ObjectId('5ea2787069801f65e85522fc'), 'set_name': 'hello_python', 'paper': [158, 193, 269, 334, 371]}]
372
10
