# 根据已经获得的诺奖得主的ID和姓名对应关系，找出其合作者信息

## （1）导入需要的库

In [1]:
import json
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



## （2）读取文件

In [2]:
# Load the disambiguated-author table. The functions in this file treat its keys
# as author IDs and read the 'author_name' and 'files_doi' entries of each value.
# 'utf-8-sig' makes the decoder skip a leading UTF-8 BOM if the file has one.
with open('authors_disambiguation_id2doi_addDOI.json', 'r', encoding='utf-8-sig') as fr_1:
    authors_disambiguation_dict = json.load(fr_1)

# Load the per-paper author table. It is indexed by DOI elsewhere in this file,
# and each value is iterated as a collection of author-name strings.
with open('allJournals_doi2authorsName.json', 'r', encoding='utf-8-sig') as fr_2:
    allPaper_doi2authorsName_dict = json.load(fr_2)

## （3）创建函数

#### 判断两个姓名的姓、名的首字母是否一致

In [5]:
def judge_sameCapital_funct(given_name, choice_name):
    """Decide whether two author names are compatible by tokens and initials.

    Both names are split on ",", "." or whitespace; empty tokens and the
    literal token 'jr' are discarded. The comparison is case-sensitive, so
    callers are expected to normalise case beforehand.

    The names are considered compatible when:
      * their token lists are identical; or
      * they have the same number of tokens, every token starts with the
        same character pairwise, and the last tokens are equal; or
      * they have a different number of tokens, the first tokens start with
        the same character, and the last tokens are equal.

    Args:
        given_name: The name to test.
        choice_name: The candidate name to compare against.

    Returns:
        True if the names are compatible under the rules above, else False.
        False is also returned when either name contains no usable token
        (for example an empty string or just "jr."); there is nothing to
        compare in that case.
    """
    # Separator: ",", "." or one whitespace char, followed by any whitespace.
    pattern = r'[,.\s]\s*'
    given_spl = [e for e in re.split(pattern, given_name) if e and e != 'jr']
    choice_spl = [e for e in re.split(pattern, choice_name) if e and e != 'jr']

    # Guard against names that reduce to nothing; indexing [0] / [-1] below
    # would otherwise raise IndexError.
    if not given_spl or not choice_spl:
        return False

    if given_spl == choice_spl:
        return True

    # Every remaining rule requires the last tokens to be identical.
    if given_spl[-1] != choice_spl[-1]:
        return False

    if len(given_spl) == len(choice_spl):
        # Same token count: every token must share its first character.
        return [e[0] for e in given_spl] == [e[0] for e in choice_spl]

    # Different token counts (e.g. a middle name is missing on one side):
    # only the first token's initial is compared.
    return given_spl[0][0] == choice_spl[0][0]

#### 利用fuzzywuzzy算法，找出去重文件中，与所给姓名最相似的姓名及其ID

In [6]:
def fuzzy_match_func(given_name, choices_authorDict_list):
    """Return the ID of the candidate whose name scores highest against given_name.

    Args:
        given_name: The name to match.
        choices_authorDict_list: Candidate records; each one is read through
            its 'author_id' and 'author_name' keys.

    Returns:
        The 'author_id' of the candidate with the highest ``fuzz.ratio``
        score. On a tie, the ID that was inserted first wins.

    Raises:
        ValueError: If choices_authorDict_list is empty (raised by ``max``).
    """
    # Map candidate ID -> similarity score. If an ID occurs more than once,
    # the score of its last occurrence is the one that is kept.
    score_by_id = {
        candidate['author_id']: fuzz.ratio(given_name, candidate['author_name'])
        for candidate in choices_authorDict_list
    }

    # Pick the key with the largest score.
    return max(score_by_id, key=score_by_id.get)

#### 根据给的论文doi和该论文的作者姓名列表，获得该论文的作者ID列表

In [7]:
def author_name2id_func(paper_doi, authors_name_list):
    """Map the author names of one paper to IDs from the disambiguation table.

    Only entries of ``authors_disambiguation_dict`` whose 'files_doi' contains
    ``paper_doi`` are considered. For each (lower-cased) input name:

      * if an entry's 'author_name' equals the name exactly, that entry's ID
        is used (the first such entry in table order);
      * otherwise, among the entries accepted by ``judge_sameCapital_funct``,
        the one chosen by ``fuzzy_match_func`` is used;
      * if neither applies, the name contributes nothing to the result.

    Args:
        paper_doi: DOI of the paper whose authors are being resolved.
        authors_name_list: Author names of that paper.

    Returns:
        A list of author IDs. Because unmatched names are skipped, it can be
        shorter than ``authors_name_list`` and is not index-aligned with it.
    """
    authors_id_list = []

    # The set of table entries attached to this paper does not depend on the
    # name being resolved, so filter the table once instead of once per name.
    paper_authors = [
        (author_id, info)
        for author_id, info in authors_disambiguation_dict.items()
        if paper_doi in info['files_doi']
    ]

    for name in (raw_name.lower() for raw_name in authors_name_list):
        choices_list = []
        for author_id, info in paper_authors:
            author_name = info['author_name']
            if name == author_name:
                # Exact match wins outright; any fuzzy candidates are dropped.
                authors_id_list.append(author_id)
                break
            if judge_sameCapital_funct(name, author_name):
                choices_list.append(info)
        else:
            # No exact match: fall back to the best fuzzy candidate, if any.
            if choices_list:
                authors_id_list.append(fuzzy_match_func(name, choices_list))

    return authors_id_list

#### 获取某作者相关论文的所有合作者ID列表

In [None]:
def authorId_to_collasId_func(author_id_i, authorInfo_dict_i, name_parameter, dict_parameter=None):
    """Collect the IDs of everyone who co-authored a paper with one author.

    Args:
        author_id_i: ID of the author; only used in the diagnostic message.
        authorInfo_dict_i: The author's record.
        name_parameter: Key in ``authorInfo_dict_i`` holding the author's name.
        dict_parameter: Key in ``authorInfo_dict_i`` holding the author's list
            of paper DOIs.

    Returns:
        A list of unique co-author IDs in first-seen order. If
        ``dict_parameter`` is not a key of ``authorInfo_dict_i`` (always the
        case for the default ``None`` unless the record has a ``None`` key),
        a diagnostic line is printed and an empty list is returned.

    Raises:
        KeyError: If ``name_parameter`` is missing from the record, or a DOI
            is missing from ``allPaper_doi2authorsName_dict``.
    """
    authorName_json_i = authorInfo_dict_i[name_parameter].lower()

    # Fetch this author's DOI list. Without it there is nothing to iterate,
    # so report the problem and return early rather than continuing with an
    # unbound variable.
    try:
        doi_list_i = authorInfo_dict_i[dict_parameter]
    except KeyError:
        print("Wrong with author {0} and his information <{1}>".format(author_id_i, dict_parameter))
        return []

    collasId_list = []
    for doi_j in doi_list_i:
        # All authors of this paper except the author in question
        # (compared case-insensitively).
        collasName_list_j = [
            lowered
            for lowered in (e.lower() for e in allPaper_doi2authorsName_dict[doi_j])
            if lowered != authorName_json_i
        ]
        # Only resolve IDs when the paper has at least one other author.
        if collasName_list_j:
            collasId_list.extend(author_name2id_func(doi_j, collasName_list_j))

    # De-duplicate while keeping first-seen order, so the result is
    # deterministic from run to run.
    return list(dict.fromkeys(collasId_list))