In [13]:
import pandas as pd
import unicodedata
import json
from collections import Counter,defaultdict

## Datos

In [2]:
# Load the list of author names to search for; the names live in the "Author" column.
autores = pd.read_csv("Authors.csv")
# Show five random rows as a sanity check. No random_state is passed, so the
# rows displayed differ from run to run.
autores.sample(5)

Unnamed: 0,Author
176,jose jaime camacho escoto
179,jose rufino diaz uribe
202,lucia medina gomez
263,rosa maria quispe siccha
276,susana ramirez vizcaya


In [3]:
# Load the DBLP authorship records; the preview below shows one row per
# (article id, author, year) combination.
dblp = pd.read_csv("authorships_15_20_example_only.csv")
dblp.head()

Unnamed: 0,id_article,author,year
0,journals/apjor/LuoLFC17,Xiaoxiao Luo,2017
1,journals/apjor/LuoLFC17,Minqiang Li,2017
2,journals/apjor/LuoLFC17,Nan Feng,2017
3,journals/apjor/LuoLFC17,Fuzan Chen,2017
4,journals/apjor/YangKJ19,Feng Yang 0005,2019


## Preprocesamiento de datos

In [4]:
def remove_accents(input_str):
    """Return ``input_str`` reduced to plain ASCII characters.

    The text is first decomposed with Unicode NFKD normalisation so that an
    accented letter becomes its base letter followed by combining marks.
    Encoding to ASCII with ``'ignore'`` then discards the marks, together
    with any other character that has no ASCII form.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    return decomposed.encode('ASCII', 'ignore').decode("utf-8")

In [5]:
def preprocesamiento_author_name(data):
    """Normalise the ``author`` column of ``data`` and return the same frame.

    Each author name is trimmed, lower-cased, stripped of accents with
    ``remove_accents`` and has its hyphens replaced by spaces. The column is
    overwritten on the frame that was passed in, so the caller's object is
    modified as well as returned.
    """
    def _normalise(raw_name):
        lowered = raw_name.strip().lower()
        return remove_accents(lowered).replace("-", " ")

    data["author"] = data["author"].map(_normalise)
    return data

In [6]:
# Normalise the DBLP author names (lowercase, accents removed, hyphens turned
# into spaces). The function rewrites the "author" column of the frame it is given.
dblp = preprocesamiento_author_name(dblp)

In [7]:
# Preview the frame after normalisation; author names are now lowercase.
dblp.head()

Unnamed: 0,id_article,author,year
0,journals/apjor/LuoLFC17,xiaoxiao luo,2017
1,journals/apjor/LuoLFC17,minqiang li,2017
2,journals/apjor/LuoLFC17,nan feng,2017
3,journals/apjor/LuoLFC17,fuzan chen,2017
4,journals/apjor/YangKJ19,feng yang 0005,2019


## Función de búsqueda

In [8]:
def validate_sing(df,firma,name):
    """Select the rows of ``df`` whose author is written as the signature ``firma``.

    A row matches when the whitespace-separated tokens of its ``author``
    value form the same multiset as the tokens of ``firma``: the same tokens
    with the same multiplicities, in any order. Rows whose ``author`` is not
    a string (for example a missing value) never match.

    Args:
        df: DataFrame with an ``author`` column holding name strings.
        firma: Signature variant to look for, e.g. ``"c. gershenson garcia"``.
            Repeated, leading or trailing spaces are ignored.
        name: Canonical author name written to the ``author_`` column of
            every matching row.

    Returns:
        A ``(reg, matches)`` tuple. ``reg`` is the number of matching rows
        and ``matches`` is a copy of those rows (original index preserved)
        with an added ``author_`` column. When nothing matches, ``reg`` is 0
        and ``matches`` is an empty frame with the columns
        ``id_article``, ``author``, ``year`` and ``author_``.

    Raises:
        KeyError: if a non-empty ``df`` has no ``author`` column.
    """
    no_match = pd.DataFrame(columns = ['id_article', 'author', 'year', 'author_'])

    # Token multiset of the signature; split() without arguments collapses
    # any run of whitespace, so no empty tokens are produced.
    target = Counter(firma.split())
    if not target or df.shape[0] == 0:
        return 0, no_match

    def _same_tokens(author):
        # Equal Counters means same tokens, same counts and same total length.
        return isinstance(author, str) and Counter(author.split()) == target

    mask = df["author"].map(_same_tokens).astype(bool)
    # Work on an explicit copy so adding a column never touches the caller's
    # frame and does not trigger chained-assignment warnings.
    matches = df[mask].copy()
    if matches.shape[0] == 0:
        return 0, no_match

    matches["author_"] = name
    return matches.shape[0], matches


def _signature_variants(tokens):
    """Build the de-duplicated list of signature variants for a tokenised name."""
    n_tokens = len(tokens)
    if n_tokens == 0:
        return []

    full = " ".join(tokens)
    if n_tokens <= 2:
        # One- and two-token names are only searched as written.
        variants = [full]
    elif n_tokens == 3:
        first, middle, last = tokens
        variants = [
            full,
            f"{first} {last}",
            f"{first} {middle}",
            f"{first} {middle} {last[0]}.",
            f"{first} {middle[0]}. {last}",
            f"{first[0]}. {middle} {last}",
        ]
    else:
        first, second, third = tokens[0], tokens[1], tokens[2]
        penultimate, last = tokens[-2], tokens[-1]
        variants = [
            f"{first} {second} {penultimate[0]}.",
            f"{first} {second[0]}. {penultimate}",
            f"{first} {penultimate} {last}",
            f"{second} {penultimate} {last}",
            f"{first[0]}. {second} {third}",
            f"{first[0]}. " + " ".join(tokens[1:]),
            # A space follows the initial's period, as in every other variant.
            f"{first} {second[0]}. " + " ".join(tokens[2:]),
            " ".join(tokens[:-1]),
            full,
        ]

    # Repeated tokens (e.g. two identical surnames) can yield the same variant
    # twice; searching it twice would duplicate the matched rows.
    return list(dict.fromkeys(variants))


def find_sings_dblp(data,name):
    """Search ``data`` for every signature variant of the author ``name``.

    The name is split on whitespace and expanded into a list of plausible
    signatures (full name, dropped middle/last names, initials). Each variant
    is looked up with ``validate_sing`` and all matches are concatenated.

    Args:
        data: DataFrame of authorships with an ``author`` column.
        name: Full author name; it is also the label stored in ``author_``.
            A value that is not a string, or that contains no tokens, yields
            no variants and therefore no matches.

    Returns:
        A ``(final_articles, info)`` tuple. ``final_articles`` holds the
        matching rows for all variants (empty, with columns ``id_article``,
        ``author``, ``year`` and ``author_``, when nothing matched). ``info``
        is a dict with ``"Total"`` (number of rows in ``final_articles``) and
        ``"Resumen"`` (number of matches per variant).
    """
    # split() without arguments tolerates repeated/leading/trailing spaces,
    # which would otherwise produce empty tokens and break the initials.
    tokens = name.split() if isinstance(name, str) else []

    d = {}
    lst_df = []
    for firma in _signature_variants(tokens):
        # validate_sing only filters `data` and returns new frames, so the
        # source frame can be passed directly without copying it per variant.
        reg, matches = validate_sing(data, firma, name)
        d[firma] = reg
        lst_df.append(matches)

    if any(reg > 0 for reg in d.values()):
        final_articles = pd.concat(lst_df,ignore_index=True)
    else:
        final_articles = pd.DataFrame(columns = ['id_article', 'author', 'year', 'author_'])

    info = {}
    info["Total"] = final_articles.shape[0]
    info["Resumen"] = d

    return final_articles,info

## Búsqueda de autores

In [9]:
# Plain Python list of the author names to search for.
lst_autores = autores["Author"].tolist()

In [10]:
# Run the signature search for every author, keeping the matched rows in
# lst_data and the per-variant summary in d_mapeo (keyed by author name).
lst_data = []
d_mapeo = {}
for indice, name in enumerate(lst_autores):
    data, info = find_sings_dblp(dblp, name)
    d_mapeo[name] = info
    lst_data.append(data)

    n_hits = len(data)
    if n_hits > 0:
        # Only authors with at least one matching authorship are reported.
        print(f"{indice}   {name}:{n_hits}")

33   andrew adamatzky:1
60   carlos gershenson garcia:2
115   francisco hernandez quiroz:1
143   hyobin kim:1
278   tom froese:2


In [14]:
# Persist the per-author search summary (total hits plus hits per signature
# variant) as indented JSON.
with open('sings_example_unam.json', 'w') as outfile:
    json.dump(d_mapeo, outfile,indent=2)

In [15]:
# Stack the per-author match tables into one table with a fresh 0..n-1 index.
filter_dblp = pd.concat(lst_data,ignore_index=True)

In [17]:
# Save the matched authorships; index=False keeps the row index out of the file.
filter_dblp.to_csv("Authors_UNAM_dblp.csv",index=False)

In [18]:
# Display the combined table of matched authorships.
filter_dblp

Unnamed: 0,id_article,author,year,author_
0,journals/alife/Adamatzky15,andrew adamatzky,2015,andrew adamatzky
1,journals/alife/GershensonTWS20,carlos gershenson,2020,carlos gershenson garcia
2,journals/alife/Siqueiros-Garcia18,carlos gershenson,2018,carlos gershenson garcia
3,journals/alife/Hernandez-Orozco18,francisco hernandez quiroz,2018,francisco hernandez quiroz
4,journals/alife/KimS18,hyobin kim,2018,hyobin kim
5,journals/alife/TaylorBCABBDFHI16,tom froese,2016,tom froese
6,journals/alife/Siqueiros-Garcia18,tom froese,2018,tom froese
