
# IMPORTANDO ARQUIVOS BIBTEX e EXPORTANDO EM FORMATAÇÃO YAML
- Arquivo texto em formato BIBTEX
- Objetivo: qualidade de dados (data quality) de arquivos BibTeX
- Extensão: .bib
- Biblioteca: pybtex

In [257]:

from pybtex.database.input import bibtex
from pybtex.database import BibliographyData, Entry
import os

# #################################################################
## Inicia importação do arquivo
# #################################################################

In [258]:
def arquivos(path, fendwith=''):
    """Map every entry name found in a directory to its full path.

    Args:
        path: Directory to list with ``os.listdir``.
        fendwith: Optional suffix filter. Either a string or a tuple of
            strings, as accepted by ``str.endswith``. An empty value (the
            default) disables filtering.

    Returns:
        A dict mapping each entry name to ``os.path.join(path, name)``.
    """
    # A plain truthiness test replaces the former ``fendwith not in ''``
    # substring check, which raised TypeError for a tuple of suffixes.
    return {
        nome: os.path.join(path, nome)
        for nome in os.listdir(path)
        if not fendwith or nome.endswith(fendwith)
    }

def author_names(author):
    """Return the author persons attached to a bibliography entry.

    Args:
        author: Object exposing a ``persons`` mapping with an ``'author'``
            key (presumably a pybtex ``Entry`` -- confirm against callers).

    Returns:
        A sliced copy of ``author.persons['author']``. When the object has no
        usable author list, the placeholder ``{'author': ['none, none']}`` is
        returned instead.
    """
    try:
        return author.persons['author'][:]
    except (AttributeError, KeyError, TypeError):
        # Only "no author information available" is tolerated here; any
        # other error now propagates instead of being silently swallowed.
        # NOTE(review): the placeholder is a dict while the normal result is
        # a sequence -- kept unchanged for backward compatibility.
        return {'author': ['none, none']}

def join_names(person):
    """Format a person as ``'Last, First'``.

    Only the first element of ``person.last_names`` and of
    ``person.first_names`` is used; additional name parts are ignored.

    Args:
        person: Object exposing ``last_names`` and ``first_names`` sequences
            of strings.

    Returns:
        The formatted string, or an empty list when either name part is
        missing or empty.
    """
    try:
        return person.last_names[0] + ', ' + person.first_names[0]
    except (AttributeError, IndexError, TypeError):
        # Missing attribute, empty name list or non-string part. The empty
        # list fallback is kept as-is because callers store it directly.
        return []

def type_output_file(f_type, filename, path, data=None, fields=None):
    """Write the collected records to ``<path>/<filename>.<f_type>``.

    Args:
        f_type: Output format, also used as the file extension. One of
            ``'yaml'``, ``'json'`` or ``'csv'``.
        filename: File name without extension.
        path: Destination directory.
        data: Mapping of entry key -> record dict. When omitted, the
            module-level variable named ``dict`` is used.
        fields: Ordered field names that form the leading CSV columns. When
            omitted, the module-level ``list_file_fields`` is used. Only
            consulted for ``'csv'``.

    Raises:
        ValueError: If ``f_type`` is not a supported format.
        NameError: If ``data`` is omitted and no module-level ``dict``
            variable has been defined.
    """
    if f_type not in ('yaml', 'json', 'csv'):
        raise ValueError('unsupported output type: {0!r}'.format(f_type))

    if data is None:
        # Backward compatibility: the records live in a module-level variable
        # that shadows the builtin ``dict``. ``globals()`` does not contain
        # builtins, so a missing variable is detected reliably.
        data = globals().get('dict')
        if data is None:
            raise NameError("module-level variable 'dict' is not defined")

    path_complete = os.path.join(path, filename + '.' + f_type)

    if f_type == 'yaml':
        # Imported lazily so the other formats do not require PyYAML.
        import yaml

        with open(path_complete, 'w') as nfile:
            # Dump the whole mapping as one document; dumping each
            # (key, value) tuple separately produced an unparsable stream.
            yaml.dump(data, nfile)

    elif f_type == 'json':
        import json

        with open(path_complete, 'w') as nfile:
            # A single dump yields one valid JSON document.
            json.dump(data, nfile)

    else:  # 'csv'
        if fields is None:
            fields = list_file_fields

        # Records may carry keys beyond ``fields``; append them in first-seen
        # order so that the header and every row share the same columns.
        columns = list(fields)
        known = set(columns)
        for record in data.values():
            for column in record:
                if column not in known:
                    known.add(column)
                    columns.append(column)

        with open(path_complete, 'w', -1, "utf-8") as nfile:
            # header
            nfile.write('{0}\n'.format('§ '.join(str(x) for x in columns)))
            # rows
            for record in data.values():
                nfile.write('{0}\n'.format(
                    '§ '.join(str(record.get(column, '')) for column in columns)))


## Inicia tratamento dos dados BIBTEX


ref bibtex doc: http://paginapessoal.utfpr.edu.br/jamhour/publicacoes/arquivos/00_Compilado_JabRef_dez2015.pdf





In [259]:
# Directory that holds the exported .bib files.
source_path = "C:\\Users\\victo\\PycharmProjects\\BibtexDataQuality\\source\\"

lst_files = arquivos(source_path, '.bib')
# Not used by the visible cells; kept in case later cells rely on it.
dict_file_fields = {}
list_file_fields = []

# Collect, in first-seen order, every field name used by any entry of any
# source file. The resulting list defines the columns of each record.
_seen_fields = set()  # O(1) membership test alongside the ordered list
for f in lst_files:
    # A new parser is created for every file, as before.
    # NOTE(review): presumably because a Parser keeps the entries of earlier
    # parse_file calls -- confirm against the pybtex documentation.
    parser = bibtex.Parser()
    file = parser.parse_file(lst_files[f])
    for i in file.entries.values():
        for a in sorted(i.fields.keys()):
            if a not in _seen_fields:
                _seen_fields.add(a)
                list_file_fields.append(a)

# 'ISSN' is excluded from the column list. The guard avoids the ValueError
# that list.remove raises when no source file uses that field.
# NOTE(review): presumably excluded because a lower-case 'issn' field carries
# the same information -- confirm.
if 'ISSN' in list_file_fields:
    list_file_fields.remove('ISSN')

In [260]:
def _build_record(entry, field_names):
    """Return the flat record for one parsed entry.

    The record holds one key per name in ``field_names`` (empty string when
    the entry lacks the field), plus ``'type_publication'``, ``'author'``
    (list produced by ``join_names``) and ``'keywords'`` split on ``';'``.
    """
    # The key is the field name itself. The previous expression
    # ``entry.fields.get('fields', name)`` collapsed every key into one
    # whenever an entry happened to define a field called 'fields'.
    record = {name: entry.fields.get(name, '') for name in field_names}
    record['type_publication'] = entry.type
    record['author'] = [join_names(person)
                        for person in entry.persons.get('author', [])]
    # .get keeps this working when no source file defines 'keywords'.
    record['keywords'] = record.get('keywords', '').split(';')
    return record


# Only .bib files are parsed, matching the cell that built list_file_fields.
lst_files = arquivos(source_path, '.bib')

# NOTE: this name shadows the builtin ``dict``. It is kept because other
# cells (the export function and the spot-check print) read it by this name.
dict = {}

_DOI_PREFIX = 'https://doi.org/'

for f in lst_files:
    # Source detection is case-insensitive so that e.g. 'ACM.bib' and
    # 'acm.bib' are both recognised.
    source = f.lower()
    is_ieee = 'ieee' in source
    is_acm = 'acm' in source
    is_sciencedirect = 'sciencedirect' in source
    if not (is_ieee or is_acm or is_sciencedirect):
        continue

    parser = bibtex.Parser()
    file = parser.parse_file(lst_files[f])

    for i in file.entries.values():
        record = _build_record(i, list_file_fields)

        # ACM: 'inbook' entries get their DOI derived from the 'url' field.
        if is_acm and i.type == 'inbook':
            record['doi'] = record.get('url', '').replace(_DOI_PREFIX, '')

        # ScienceDirect: strip the resolver prefix from the DOI; entries
        # without a DOI get an empty string instead of raising KeyError.
        if is_sciencedirect:
            record['doi'] = i.fields.get('doi', '').replace(_DOI_PREFIX, '')

        dict[i.key] = record


In [261]:
# Spot-check: print the record stored under one specific entry key.
print(dict['10.1145/3502771.3502781'])
# print({key : dict['10.1145/3411764.3445130'][key] for key in ['author', 'title', 'keywords', 'year', 'type_publication', 'doi']})

{'abstract': 'Cyber-physical systems (CPS) have been developed in many industrial sectors and application domains in which the quality requirements of data acquired are a common factor. Data quality in CPS can deteriorate because of several factors such as sensor faults and failures due to operating in harsh and uncertain environments. How can software engineering and artificial intelligence (AI) help manage and tame data quality issues in CPS? This is the question we aimed to investigate in the SEA4DQ workshop. Emerging trends in software engineering need to take data quality management seriously as CPS are increasingly datacentric in their approach to acquiring and processing data along the edge-fog-cloud continuum. This workshop provided researchers and practitioners a forum for exchanging ideas, experiences, understanding of the problems, visions for the future, and promising solutions to the problems in data quality in CPS. Examples of topics include software/hardware architecture

## Output

In [262]:

# Destination directory for the exported file (ends with a path separator).
output_path = "C:\\Users\\victo\\PycharmProjects\\BibtexDataQuality\\output\\"

# Write the records to "output.csv" inside output_path.
type_output_file(f_type='csv', filename='output', path=output_path)

## Inicia tratamento dos dados CSV

In [263]:
import hashlib
import re
import unicodedata

import pandas as pd

# Directory with the CSV files read by the following cells; every entry in
# it is indexed by name.
source_excel = "C:\\Users\\victo\\PycharmProjects\\BibtexDataQuality\\source\\excel\\"
lst_excel = arquivos(path=source_excel)

def rmscaract(text):
    """Normalise each string to lower-case ASCII letters and digits only.

    Every item is NFKD-decomposed, stripped of non-ASCII code points (which
    removes accents), lower-cased, and finally stripped of every character
    that is not ``a-z`` or ``0-9``. Strings that differ only in case,
    accents, spacing or punctuation therefore normalise to the same value.

    Args:
        text: Iterable of strings (for example a pandas Series of titles).

    Returns:
        A list with one normalised string per input item.
    """
    # Compiled once per call instead of once per item.
    strip_non_alnum = re.compile(r"[^a-zA-Z0-9]").sub
    result = []
    for words in text:
        # ``normalize`` was referenced without ever being imported; the
        # function is now taken explicitly from ``unicodedata``.
        ascii_only = (unicodedata.normalize('NFKD', words)
                      .encode('ASCII', 'ignore')
                      .decode('ASCII'))
        result.append(strip_non_alnum("", ascii_only.lower()))
    return result

def hashkey(strkeys):
    """Return the MD5 hex digest of each normalised input string.

    The inputs are first normalised with ``rmscaract``; the digest of each
    normalised string is then computed over its encoded bytes.

    Args:
        strkeys: Iterable of strings to turn into keys.

    Returns:
        A list of hexadecimal digest strings, one per input item.
    """
    digests = []
    for normalised in rmscaract(strkeys):
        digests.append(hashlib.md5(normalised.encode()).hexdigest())
    return digests
    

In [278]:
# Load 'jcs_2020.csv' (semicolon-separated) and discard fully duplicated rows.
dfjcs = pd.read_csv(lst_excel['jcs_2020.csv'], delimiter=';').drop_duplicates()

# Keep only the columns of interest. The explicit copy makes the subset an
# independent frame, so the ID assignment below does not trigger pandas'
# SettingWithCopyWarning.
dfjcs = dfjcs[['Rank', 'Full Journal Title', 'Total Cites', 'Journal Impact Factor', 'Eigenfactor Score']].copy()

# Join key: hash of the normalised journal title.
dfjcs['ID'] = hashkey(dfjcs['Full Journal Title'])

# Number of rows that received an ID (displayed as the cell result).
dfjcs['ID'].count()

12312

In [279]:
# Load 'scimagojr 2020.csv' (semicolon-separated), then discard fully
# duplicated rows.
dfscimago = pd.read_csv(lst_excel['scimagojr 2020.csv'], sep=';')
dfscimago = dfscimago.drop_duplicates()

# Join key: hash of the normalised title, appended as the 'ID' column.
# (An ISSN-based key was sketched in an earlier version but is not used.)
dfscimago = dfscimago.assign(ID=hashkey(dfscimago['Title']))

# Number of rows that received an ID (displayed as the cell result).
dfscimago['ID'].count()

  dfscimago = pd.read_csv(lst_excel['scimagojr 2020.csv'], delimiter=';').drop_duplicates()


32952

In [280]:
# Outer join of both tables on the hashed-title key: rows without a match on
# the other side are kept, and an ID that occurs more than once on one side
# repeats the matching rows of the other side.
dfjoin = pd.merge(dfjcs, dfscimago, how='outer', left_on=['ID'], right_on=['ID'])

# Non-null title counts coming from each side of the join.
dfjoin[['Full Journal Title', 'Title']].count()

Full Journal Title    12325
Title                 32953
dtype: int64

In [284]:
# Inspect every joined row that shares one specific hashed-title ID.
dfjoin.query('ID == "0332c7885a2d9aab68a0e23797804576"')

Unnamed: 0,Rank_x,Full Journal Title,Total Cites,Journal Impact Factor,Eigenfactor Score,ID,Rank_y,Sourceid,Title,Type,...,Total Refs.,Total Cites (3years),Citable Docs. (3years),Cites / Doc. (2years),Ref. / Doc.,Country,Region,Publisher,Coverage,Categories
2214,2324.0,SCIENCE EDUCATION,6562,4.593,0.00397,0332c7885a2d9aab68a0e23797804576,575.0,23626.0,Science Education,journal,...,3360.0,782.0,158.0,453,8615,United States,Northern America,Wiley-Liss Inc.,1930-2020,Education (Q1); History and Philosophy of Scie...
2215,7390.0,Science & Education,1588,2.114,0.00132,0332c7885a2d9aab68a0e23797804576,575.0,23626.0,Science Education,journal,...,3360.0,782.0,158.0,453,8615,United States,Northern America,Wiley-Liss Inc.,1930-2020,Education (Q1); History and Philosophy of Scie...


In [283]:
# Per ID, count the non-null values of every column.
dup = dfjoin.groupby('ID').count()
# Show the IDs that have exactly two non-null 'Issn' values.
display(dup.query('Issn == 2'))

Unnamed: 0_level_0,Rank_x,Full Journal Title,Total Cites,Journal Impact Factor,Eigenfactor Score,Rank_y,Sourceid,Title,Type,Issn,...,Total Refs.,Total Cites (3years),Citable Docs. (3years),Cites / Doc. (2years),Ref. / Doc.,Country,Region,Publisher,Coverage,Categories
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0332c7885a2d9aab68a0e23797804576,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
06b095b2a66594c6d44d0a851915589b,0,0,0,0,0,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
09c68831108ccf2db33965049e8f3e77,0,0,0,0,0,2,2,2,2,2,...,2,2,2,2,2,2,2,0,2,2
0a4afb62d070038e592d30ac7506647a,0,0,0,0,0,2,2,2,2,2,...,2,2,2,2,2,2,2,1,2,2
2298212571a0a041848fa0cc8a1da3c2,0,0,0,0,0,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
23028451d2f36dccb077e51f22d9f59c,0,0,0,0,0,2,2,2,2,2,...,2,2,2,2,2,2,2,0,2,2
318488a77c3d884c8762283e7db20e61,0,0,0,0,0,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
41fcb72c4a491f1fcf32798c310a2d84,0,0,0,0,0,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
51e61ad6b640b69f24e92d4fbce9e391,0,0,0,0,0,2,2,2,2,2,...,2,2,2,2,2,2,2,0,2,2
5d554bc5f3d2cd182cdd0952b1fb87ca,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
