
# DATA QUALITY (BibTeX e CSV)
- Arquivo texto em formato BibTeX
- Objetivo: data quality de inputs diversos (CSV, BibTeX e APIs)

In [1]:
import os
import pandas as pd
import re
import hashlib
import yaml
import json
import csv
from numpy import nan
from pybtex.database.input import bibtex
from pybtex.database import BibliographyData, Entry
from unicodedata import normalize

# Show up to 50 columns and 100 rows when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)

##### Criação de funções

In [2]:
######################## FILE LISTING
def arquivos(path, fendwith=''):
    """Map every entry name found in ``path`` to its joined path.

    Args:
        path: Directory to list (handed to ``os.listdir``).
        fendwith: Optional suffix filter. Either a string or a tuple of
            strings, as accepted by ``str.endswith``. When empty (the
            default) every entry is returned.

    Returns:
        dict mapping entry name -> ``os.path.join(path, name)``.
    """
    # Plain truthiness instead of ``fendwith not in ''``: the substring test
    # only worked for str and raised TypeError for a tuple of suffixes.
    return {
        nome: os.path.join(path, nome)
        for nome in os.listdir(path)
        if not fendwith or nome.endswith(fendwith)
    }

######################## AUTHOR LISTING
def author_names(author):
    """Return the list stored under ``author.persons['author']``.

    Args:
        author: Object exposing a ``persons`` mapping (e.g. a pybtex entry).

    Returns:
        A copy (``[0:]`` slice) of the ``'author'`` list, or the placeholder
        ``{'author': ['none, none']}`` when the entry has no usable authors.
    """
    try:
        return author.persons['author'][0:]
    # Only the lookup failures are treated as "no authors"; a bare ``except``
    # would also swallow KeyboardInterrupt / SystemExit.
    except (AttributeError, KeyError, IndexError, TypeError):
        return {'author': [(u'none, none')]}

######################## JOIN AUTHOR NAME PARTS
def join_names(person):
    """Format a person as ``"<first last name>, <first first name>"``.

    Args:
        person: Object exposing ``last_names`` and ``first_names`` sequences
            (e.g. a pybtex ``Person``).

    Returns:
        The formatted string, or an empty list when either name part is
        missing (the original fallback value, kept for compatibility).
    """
    try:
        return person.last_names[0] + ', ' + person.first_names[0]
    # Missing attribute, empty name list or non-string part -> fallback.
    # Narrowed from a bare ``except`` so interpreter-exit signals propagate.
    except (AttributeError, IndexError, TypeError):
        return []

######################## OUTPUT BY FILE EXTENSION (JSON, YAML AND CSV)
def type_output_file(o_data, f_type, filename, path):
    """Write a DataFrame to ``<path>/<filename>.<f_type>``.

    Args:
        o_data: pandas DataFrame to export.
        f_type: Output format and file extension: ``'yaml'``, ``'json'`` or
            ``'csv'``.
        filename: File name without extension.
        path: Target directory, with or without a trailing separator.

    Raises:
        ValueError: If ``f_type`` is not one of the supported formats
            (previously nothing was written and no error was reported).
    """
    if f_type not in ('yaml', 'json', 'csv'):
        raise ValueError("unsupported output type: " + repr(f_type))

    # os.path.join instead of string concatenation, so a directory given
    # without a trailing separator no longer yields "dirfilename.ext".
    path_complete = os.path.join(path, filename + '.' + f_type)

    if f_type == 'yaml':
        with open(path_complete, 'w') as nfile:
            yaml.dump(o_data.to_dict(orient='records'), nfile)
    elif f_type == 'json':
        with open(path_complete, 'w') as nfile:
            # Round-trip through to_json so pandas/numpy values become plain
            # JSON types before pretty-printing.
            parsed = json.loads(o_data.to_json(orient="records"))
            json.dump(parsed, nfile, indent=4)
    else:
        o_data.to_csv(path_complete, sep='§')

######################## LOAD THE CONFIGURATION FILE
def load_config(n_file):
    """Parse a YAML configuration file with the safe loader.

    Args:
        n_file: Path of the YAML file.

    Returns:
        The parsed YAML document (whatever Python object it maps to).
    """
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # would garble accented characters in the configuration values.
    with open(n_file, 'r', encoding='utf-8') as config_file:
        return yaml.safe_load(config_file)

## Inicia importação dos arquivos

### BIBTEX
- ref bibtex doc: http://paginapessoal.utfpr.edu.br/jamhour/publicacoes/arquivos/00_Compilado_JabRef_dez2015.pdf

In [3]:
# Directory holding the exported .bib files.
source_path = "C:\\Users\\victo\\PycharmProjects\\BibtexDataQuality\\source\\"

lst_files = arquivos(source_path, '.bib')
parser = bibtex.Parser()
dict_file_fields = {}
list_file_fields = []

# Collect, in first-seen order, every field name used by any entry of any
# .bib file; this list later defines the columns built for each entry.
for f in lst_files:
    # A new parser is created per file (presumably so entries from one file
    # are not carried into the next — confirm against pybtex behaviour).
    parser = bibtex.Parser()
    file = parser.parse_file(lst_files[f])
    f_name = f.replace('.bib','')
    for i in file.entries.values():
        for a in sorted(i.fields.keys()):
            if a not in list_file_fields:
                list_file_fields.append(a)

# Drop the upper-case 'ISSN' spelling (presumably a duplicate of 'issn' —
# confirm). Guarded so the cell no longer raises ValueError when no file
# uses that spelling.
if 'ISSN' in list_file_fields:
    list_file_fields.remove('ISSN')

##### Importação e tratamento

In [4]:
lst_files = arquivos(source_path)
parser = bibtex.Parser()
dict_bib = {}


def _bib_record(entry, field_names):
    """Flatten one pybtex entry into a plain dict.

    The record has one key per name in ``field_names`` ('' when the entry
    lacks that field), plus 'type_publication' (the entry type), 'author'
    (list built with ``join_names``) and 'keywords' split on ';'.
    """
    # Key is the field name itself. The previous
    # ``entry.fields.get('fields', name)`` key collapsed the whole record
    # whenever an entry happened to define a field called "fields".
    record = {name: entry.fields.get(name, '') for name in field_names}
    record['type_publication'] = entry.type
    record['author'] = [join_names(person) for person in entry.persons.get('author', '')]
    # .get(): entries/sources without a keywords column give [''] rather
    # than a KeyError.
    record['keywords'] = record.get('keywords', '').split(';')
    return record


# Build one record per BibTeX entry, keyed by the entry key. The source is
# recognised by a case-sensitive substring of the file name.
# NOTE(review): 'acm' is lower-case while the other markers are not; a file
# named 'ACM.bib' would be skipped — confirm the real file names.
for f in lst_files:
    is_ieee = 'IEEE' in f
    is_acm = 'acm' in f
    is_science_direct = 'ScienceDirect' in f
    if not (is_ieee or is_acm or is_science_direct):
        continue

    parser = bibtex.Parser()
    file = parser.parse_file(lst_files[f])

    for i in file.entries.values():
        key = i.key
        record = _bib_record(i, list_file_fields)

        ############## ACM: 'inbook' entries take the DOI from the URL field
        if is_acm and i.type == 'inbook':
            record['doi'] = record.get('url', '').replace('https://doi.org/', '')

        ############## Science Direct: strip the resolver prefix from the DOI
        if is_science_direct:
            # .get(): an entry without a doi field no longer aborts the import.
            record['doi'] = i.fields.get('doi', '').replace('https://doi.org/', '')

        dict_bib[key] = record

df_bib = pd.DataFrame(data=dict_bib.values())

In [5]:
# Spot-check: print the record stored under one BibTeX entry key.
print(dict_bib['10.1145/3502771.3502781'])

{'abstract': 'Cyber-physical systems (CPS) have been developed in many industrial sectors and application domains in which the quality requirements of data acquired are a common factor. Data quality in CPS can deteriorate because of several factors such as sensor faults and failures due to operating in harsh and uncertain environments. How can software engineering and artificial intelligence (AI) help manage and tame data quality issues in CPS? This is the question we aimed to investigate in the SEA4DQ workshop. Emerging trends in software engineering need to take data quality management seriously as CPS are increasingly datacentric in their approach to acquiring and processing data along the edge-fog-cloud continuum. This workshop provided researchers and practitioners a forum for exchanging ideas, experiences, understanding of the problems, visions for the future, and promising solutions to the problems in data quality in CPS. Examples of topics include software/hardware architecture

#### Output BIBTEX

In [6]:
# Read the output settings (element [0] of each config entry) and export
# the BibTeX DataFrame in the configured format.
config = load_config('config.yaml')

output_path, f_name, file_ext = (
    config[setting][0] for setting in ('output_path', 'file_name', 'output_ext')
)

type_output_file(df_bib, file_ext, f_name, output_path)

### CSV SCIMAGO | JCS

##### Criando Funções

In [7]:
# Directory with the ranking CSV files; lst_excel maps each entry name in
# that directory to its joined path.
source_excel = "C:\\Users\\victo\\PycharmProjects\\BibtexDataQuality\\source\\excel\\"
lst_excel = arquivos(source_excel)

######################## STRIP SPECIAL CHARACTERS
def rmscaract(text):
    """Normalise every value of a pandas Series to lower-case ASCII alphanumerics.

    Missing values are treated as ''. Each value is NFKD-decomposed, stripped
    of non-ASCII characters, lower-cased, and then everything outside
    ``[a-zA-Z0-9]`` is removed.

    Args:
        text: pandas Series of strings (may contain missing values).

    Returns:
        list of cleaned strings, in the Series' order.
    """
    cleaned = []
    for raw in text.fillna(''):
        ascii_only = normalize('NFKD', raw).encode('ASCII', 'ignore').decode('ASCII')
        cleaned.append(re.sub(r"[^a-zA-Z0-9]", "", ascii_only.lower()))
    return cleaned

######################## BUILD A HASH FROM A STRING
def stringhash(instr):
    """Return the MD5 hex digest of each string in ``instr``.

    Args:
        instr: Iterable of strings.

    Returns:
        list of 32-character hexadecimal digests, one per input string.
    """
    def _md5_hex(value):
        return hashlib.md5(value.encode()).hexdigest()

    return list(map(_md5_hex, instr))

######################## BUILD A HASH FROM THE CLEANED STRING
def hashkey(strkeys):
    """Return the MD5 hex digest of each value after ``rmscaract`` cleaning.

    Args:
        strkeys: pandas Series of strings, passed straight to ``rmscaract``.

    Returns:
        list of hexadecimal digests, one per value of the Series.
    """
    digests = []
    for cleaned in rmscaract(strkeys):
        digests.append(hashlib.md5(cleaned.encode()).hexdigest())
    return digests

######################## FILTER HELPER
def filter_exp(df):
    """Filter ``df`` with the expression described by the global ``config``.

    The expression is ``<filter_field><search_operator>'<search_value>'``,
    each part being element [0] of the matching config entry, and it is
    evaluated with ``DataFrame.query``.

    Args:
        df: pandas DataFrame to filter.

    Returns:
        The DataFrame returned by ``df.query``.
    """
    field = config['filter_field'][0]
    operator = config['search_operator'][0]
    value = config['search_value'][0]
    expression = "".join((field, operator, "'", value, "'"))
    return df.query(expression)

##### Importação e tratamento

###### JCS

In [8]:
# Load the JCS ranking CSV, keeping one row per journal title.
df_jcs = pd.read_csv(lst_excel['jcs_2020.csv'], delimiter=';').drop_duplicates(subset='Full Journal Title')
cols = ['Rank', 'Full Journal Title','Total Cites', 'Journal Impact Factor', 'Eigenfactor Score']

# .copy(): the column selection must be an independent frame before a new
# column is assigned to it, otherwise pandas may emit SettingWithCopyWarning.
df_jcs = df_jcs[cols].copy()
# Join key: MD5 of the cleaned journal title.
df_jcs['hashid'] =  hashkey(df_jcs['Full Journal Title'])
df_jcs = df_jcs.rename(columns={'Journal Impact Factor' : 'jcr_value'})

df_jcs['hashid'].count()
# dfjcs

12312

###### SCIMAGO

In [9]:
# Load the Scimago ranking CSV and drop fully duplicated rows.
df_scimago = pd.read_csv(lst_excel['scimagojr 2020.csv'], delimiter=';')
df_scimago = df_scimago.drop_duplicates()

# Join keys: MD5 of the cleaned title, and the first 8 characters of Issn.
df_scimago['hashid'] = hashkey(df_scimago['Title'])
df_scimago['issnkey'] = df_scimago['Issn'].str[:8]
df_scimago = df_scimago.rename(columns={'SJR': 'scimago_value'})

df_scimago['hashid'].count()
# dfscimago

  df_scimago = pd.read_csv(lst_excel['scimagojr 2020.csv'], delimiter=';').drop_duplicates()


32952

###### JUNÇÃO JCS | SCIMAGO

In [10]:
# Outer-join both rankings on the title hash so journals present in only one
# source are kept.
dfjournalrank = df_scimago.merge(df_jcs, how='outer', left_on='hashid', right_on='hashid')

# Prefer the Scimago title, fall back to the JCS one, then lower-case it.
dfjournalrank['Title'] = dfjournalrank['Title'].combine_first(dfjournalrank['Full Journal Title']).str.lower()
print(dfjournalrank[['Full Journal Title', 'Title']].count())

dfjournalrank = dfjournalrank.drop(['Rank_x', 'Rank_y', 'Full Journal Title'], axis=1)
# dfjournalrank

Full Journal Title    12325
Title                 34727
dtype: int64


## JOIN BIBTEX | CSV SCIMAGO | CSV JCS

In [11]:
# Rebuild the BibTeX frame and turn empty / whitespace-only cells into NaN.
df_bib = pd.DataFrame(data=dict_bib.values()).replace(r'^\s*$', nan, regex=True)

# Join keys: cleaned ISSN, and MD5 of the cleaned title.
# Note that 'title' itself is overwritten with its cleaned form.
df_bib['issnkey'] = rmscaract(df_bib['issn'])
df_bib['title'] = rmscaract(df_bib['title'])
df_bib['hashid'] = hashkey(df_bib['title'])

In [17]:
# Columns kept in the final BibTeX + journal-ranking table.
cols = [
    'issnkey', 'isbn', 'journal', 'publisher', 'title', 'booktitle', 'doi',
    'author', 'keywords', 'abstract', 'year', 'type_publication',
    'jcr_value', 'scimago_value',
]

# Left join: every BibTeX record is kept, ranking values are attached when
# the ISSN key matches.
df_join_bibtex_csv = df_bib.merge(
    dfjournalrank, how='left', left_on='issnkey', right_on='issnkey'
)[cols]

In [16]:
def filter_exp(df):
    """Filter ``df`` with the expression described by the global ``config``.

    Builds ``<filter_field><search_operator>'<search_value>'`` from element
    [0] of each config entry and evaluates it with ``DataFrame.query``.
    """
    parts = (
        config['filter_field'][0],
        config['search_operator'][0],
        "'",
        config['search_value'][0],
        "'",
    )
    return df.query("".join(parts))

In [19]:
# Reload the settings, apply the configured filter to the joined table and
# export the result in the configured format.
config = load_config('config.yaml')

output_path, f_name, file_ext = (
    config[setting][0] for setting in ('output_path', 'file_name', 'output_ext')
)

df_filter = filter_exp(df_join_bibtex_csv)

type_output_file(df_filter, file_ext, f_name, output_path)

## API

In [24]:
import os
import requests
import json

# SECURITY NOTE(review): these API keys are committed in plain text. The
# literals are kept only as fallbacks so the cell keeps working; prefer the
# IEEE_API_KEY / ELSEVIER_API_KEY environment variables, then rotate the keys
# and remove them from the source.
accesskey_IEEE = os.environ.get('IEEE_API_KEY', 'efugu53u622asc77hu7h6hbq')
accesskey_SD = os.environ.get('ELSEVIER_API_KEY', 'ef8a7260c27897693d0fd7394a559726')

# Seconds to wait for each API before giving up; without a timeout a stalled
# connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# def buscar_dados():
# IEEE Xplore article search; the response body is parsed as JSON.
request = requests.get(
    "https://ieeexploreapi.ieee.org/api/v1/search/articles?parameter&apikey=" + accesskey_IEEE,
    timeout=REQUEST_TIMEOUT,
)
dict_IEEE = json.loads(request.content)
print(type(dict_IEEE))

# Elsevier Scopus search. HTTPS instead of HTTP so the API key in the query
# string is not sent in clear text.
request = requests.get(
    "https://api.elsevier.com/content/search/scopus?query=heart&apiKey=" + accesskey_SD,
    timeout=REQUEST_TIMEOUT,
)
dict_SD = json.loads(request.content)
print(type(dict_SD))

<class 'dict'>
<class 'dict'>
