
# DATA QUALITY (Bibtex e csv)
- Arquivo texto em formato BIBTEX
- Objetivo: Data quality de input diversos(csv, bibtex e APIs)

In [143]:
import os
import pandas as pd
import re
import hashlib
import yaml
import json
import csv
from pybtex.database.input import bibtex
from pybtex.database import BibliographyData, Entry
from unicodedata import normalize


##### Criação de funções

In [144]:
def arquivos(path, fendwith = ''):
    if fendwith not in '':
        caminhos = {nome : os.path.join(path, nome) for nome in os.listdir(path) if nome.endswith(fendwith)}
    else:
        caminhos = {nome : os.path.join(path, nome) for nome in os.listdir(path)}
    return caminhos

def author_names(author):
    try:
        return author.persons['author'][0:]
    except:
        return {'author': [(u'none, none')]}

def join_names(person):
    try:
        return person.last_names[0] + ', ' + person.first_names[0]
    except:
        return []

def type_output_file(o_data, f_type, filename, path):

    path_complete = path + filename + '.' + f_type

    if f_type == 'yaml':
        with open(path_complete, 'w') as nfile:
            yaml.dump(o_data.to_dict(orient='records'), nfile)

    if f_type == 'json':
        with open(path_complete, 'w') as nfile:
            result = o_data.to_json(orient="records")
            parsed = json.loads(result)
            json.dump(parsed, nfile, indent=4)  

    if f_type == 'csv':
        o_data.to_csv(path_complete, sep='§')  

# #################################################################
## Inicia importanção do arquivo
# #################################################################

### importação BIBTEX

ref bibtex doc: http://paginapessoal.utfpr.edu.br/jamhour/publicacoes/arquivos/00_Compilado_JabRef_dez2015.pdf





In [145]:
source_path = "C:\\Users\\victo\\PycharmProjects\\BibtexDataQuality\\source\\"

lst_files = arquivos(source_path, '.bib')
parser = bibtex.Parser()
# file = parser.parse_file(lst_files['ACM.bib'])
dict_file_fields = {}
list_file_fields = []

#verifica a estrutura de campos para cada tipo de publicação
#cria um dicionario usando como chave a fonte e o tipo de publicação
for f in lst_files:
    parser = bibtex.Parser()
    file = parser.parse_file(lst_files[f])
    f_name = f.replace('.bib','')
    for i in file.entries.values():
        
        for a in sorted(i.fields.keys()):
           
            if a not in list_file_fields:
                list_file_fields.append(a)
list_file_fields.remove('ISSN')

##### Tratamento BIBTEX

In [146]:
lst_files = arquivos(source_path)
parser = bibtex.Parser()
# file = parser.parse_file(lst_files['IEEE.bib'])
dict_bib = {}

for f in lst_files:
    
    ############# IEEE
    if 'IEEE' in f :
        parser = bibtex.Parser()
        file = parser.parse_file(lst_files[f])

        for i in file.entries.values():
            key = i.key
            dict_bib[key] = {i.fields.get('fields', fields) : i.fields.get(fields, '')\
                            for fields in list_file_fields}
            
            dict_bib[key]['type_publication'] = i.type
            dict_bib[key]['author'] = [join_names(person) for person in i.persons.get('author', '')]

            dict_bib[key].update({'keywords' : dict_bib[key]['keywords'].split(';')})
    
    ############## ACM
    if 'acm' in f:
        parser = bibtex.Parser()
        file = parser.parse_file(lst_files[f])

        for i in file.entries.values():
            key = i.key
            dict_bib[key] = {i.fields.get('fields', fields) : i.fields.get(fields, '')\
                            for fields in list_file_fields}
            
            dict_bib[key]['type_publication'] = i.type
            dict_bib[key]['author'] = [join_names(person) for person in i.persons.get('author', '')]

            dict_bib[key].update({'keywords' : dict_bib[key].get('keywords', '').split(';')})

            if i.type == 'inbook':
                dict_bib[key].update({'doi' : dict_bib[key].get('url', '').replace('https://doi.org/', '')})

    ############## Science Direct
    if 'ScienceDirect' in f:
        parser = bibtex.Parser()
        file = parser.parse_file(lst_files[f])

        for i in file.entries.values():
            key = i.key
            dict_bib[key] = {i.fields.get('fields', fields) : i.fields.get(fields, '')\
                        for fields in list_file_fields}
            
            dict_bib[key]['type_publication'] = i.type
            dict_bib[key]['author'] = [join_names(person) for person in i.persons.get('author', '')]
            dict_bib[key].update({'keywords' : dict_bib[key]['keywords'].split(';')})
            
            dict_bib[key].update({'doi' : i.fields['doi'].replace('https://doi.org/', '')})

df_bib = pd.DataFrame(data=dict_bib.values())

In [147]:
print(dict_bib['10.1145/3502771.3502781'])

{'abstract': 'Cyber-physical systems (CPS) have been developed in many industrial sectors and application domains in which the quality requirements of data acquired are a common factor. Data quality in CPS can deteriorate because of several factors such as sensor faults and failures due to operating in harsh and uncertain environments. How can software engineering and artificial intelligence (AI) help manage and tame data quality issues in CPS? This is the question we aimed to investigate in the SEA4DQ workshop. Emerging trends in software engineering need to take data quality management seriously as CPS are increasingly datacentric in their approach to acquiring and processing data along the edge-fog-cloud continuum. This workshop provided researchers and practitioners a forum for exchanging ideas, experiences, understanding of the problems, visions for the future, and promising solutions to the problems in data quality in CPS. Examples of topics include software/hardware architecture

## Output

In [148]:
with open('config.yaml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.SafeLoader)

output_path = config['output_path'][0]
f_name = config['file_name'][0]
file_ext = config['output_ext'][0]

type_output_file(df_bib, file_ext, f_name, output_path)

## Inicia tratamento dos dados CSV

##### Funções para tratamento do csv

In [149]:
source_excel = "C:\\Users\\victo\\PycharmProjects\\BibtexDataQuality\\source\\excel\\"
lst_excel = arquivos(source_excel)

def rmscaract(text):
    result = [re.sub(r"[^a-zA-Z0-9]","", normalize('NFKD', words).encode('ASCII','ignore').decode('ASCII').lower()) for words in text]
    return result

def stringhash(instr):
    hashnum = [hashlib.md5(ikeys.encode()).hexdigest() for ikeys in instr]
    return hashnum

def hashkey(strkeys):
    stringadjs = rmscaract(strkeys)
    hashresult = [hashlib.md5(ikeys.encode()).hexdigest() for ikeys in stringadjs]
    return hashresult

##### importanção e tratamento

##### jcs

In [154]:
dfjcs = pd.read_csv(lst_excel['jcs_2020.csv'], delimiter=';').drop_duplicates()

dfjcs = dfjcs[['Rank', 'Full Journal Title','Total Cites', 'Journal Impact Factor', 'Eigenfactor Score']]
dfjcs['hashid'] =  hashkey(dfjcs['Full Journal Title'])
dfjcs = dfjcs.rename(columns={'Journal Impact Factor' : 'jcr_value'})

dfjcs['hashid'].count()
# dfjcs

12312

##### scimago

In [272]:
import numpy as np

dfscimago = pd.read_csv(lst_excel['scimagojr 2020.csv'], delimiter=';').drop_duplicates()

dfscimago['hashid'] =  hashkey(dfscimago['Title'])
dfscimago['issnkey'] =  dfscimago.Issn.str.slice(stop=8).replace(r'^\s+$', None, regex=True)
dfscimago = dfscimago.rename(columns={'SJR' : 'scimago_value'})

dfscimago['hashid'].count()

dfscimago


  dfscimago = pd.read_csv(lst_excel['scimagojr 2020.csv'], delimiter=';').drop_duplicates()


Unnamed: 0,Rank,Sourceid,Title,Type,Issn,scimago_value,SJR Best Quartile,H index,Total Docs. (2020),Total Docs. (3years),...,Citable Docs. (3years),Cites / Doc. (2years),Ref. / Doc.,Country,Region,Publisher,Coverage,Categories,hashid,issnkey
0,1,28773,Ca-A Cancer Journal for Clinicians,journal,"15424863, 00079235",62937,Q1,168,47,119,...,80,12634,7345,United States,Northern America,Wiley-Blackwell,1950-2020,Hematology (Q1); Oncology (Q1),b5c78db79beb07825e8e8f61d07c63a4,15424863
1,2,19434,MMWR Recommendations and Reports,journal,"10575987, 15458601",40949,Q1,143,10,9,...,9,5000,12920,United States,Northern America,Centers for Disease Control and Prevention (CDC),1990-2020,Epidemiology (Q1); Health Information Manageme...,8cf8e45c2a2e1ed552ed8942836131dd,10575987
2,3,20315,Nature Reviews Molecular Cell Biology,journal,"14710072, 14710080",37461,Q1,431,115,338,...,167,3283,7338,United Kingdom,Western Europe,Nature Publishing Group,2000-2020,Cell Biology (Q1); Molecular Biology (Q1),84dd4c7a6e346225ffb862f41566b4c3,14710072
3,4,29431,Quarterly Journal of Economics,journal,"00335533, 15314650",34573,Q1,259,40,110,...,109,1600,6833,United Kingdom,Western Europe,Oxford University Press,1886-2020,Economics and Econometrics (Q1),637bd8e1943ccd45d086d84bc4fddc5e,00335533
4,5,21100812243,Nature Reviews Materials,journal,20588437,32011,Q1,108,92,264,...,138,3215,11557,United Kingdom,Western Europe,Nature Publishing Group,2016-2020,"Biomaterials (Q1); Electronic, Optical and Mag...",fd2c9ac590c5caf41145070ce35b1310,20588437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32947,32948,25412,Zhonghua kou qiang yi xue za zhi = Zhonghua ko...,journal,10020098,,-,14,150,0,...,0,000,000,China,Asiatic Region,Zhonghua Yixuehui Zazhishe,"1987-2016, 2020",Medicine (miscellaneous),837a54d2c76db8cc29217daa59876507,10020098
32948,32949,21485,Zhonghua liu xing bing xue za zhi = Zhonghua l...,journal,02546450,,-,31,292,0,...,0,000,000,China,Asiatic Region,Zhonghua Yixuehui Zazhishe,"1982-2016, 2020",Medicine (miscellaneous),19adc480aa0632ab145c0f7e27f28a47,02546450
32949,32950,26726,Zhonghua nei ke za zhi [Chinese journal of int...,journal,05781426,,-,18,5,0,...,0,000,000,China,Asiatic Region,Zhonghua Yixuehui Zazhishe,"1957-1959, 1979-1997, 1999-2016, 2020",Medicine (miscellaneous),694b02b4b72dff032cb068e1815c3355,05781426
32950,32951,19324,Zhonghua wai ke za zhi [Chinese journal of sur...,journal,05295815,,-,16,5,0,...,0,000,000,China,Asiatic Region,Zhonghua Yixuehui Zazhishe,"1957, 1959-1964, 1979-2016, 2020",Medicine (miscellaneous),48cceced9cae7e4525e8d514b015b860,05295815


##### junção CSV

In [273]:
dfjournalrank = dfscimago.merge(dfjcs, left_on=['hashid'], right_on=['hashid'],how='outer')

dfjournalrank['Title'] = (dfjournalrank.Title.combine_first(dfjournalrank['Title'])).str.lower()

print(dfjournalrank[['Full Journal Title','Title']].count())

dfjournalrank = dfjournalrank.drop(columns=['Rank_y', 'Full Journal Title'])\
                             .rename(columns={'Rank_x' : 'Rank'})

Full Journal Title    12325
Title                 32953
dtype: int64


In [274]:
dfjournalrank.count().all

<bound method NDFrame._add_numeric_operations.<locals>.all of Rank                      32953
Sourceid                  32953
Title                     32953
Type                      32953
Issn                      32953
scimago_value             32605
SJR Best Quartile         32953
H index                   32953
Total Docs. (2020)        32953
Total Docs. (3years)      32953
Total Refs.               32953
Total Cites (3years)      32953
Citable Docs. (3years)    32953
Cites / Doc. (2years)     32953
Ref. / Doc.               32953
Country                   32953
Region                    32953
Publisher                 26049
Coverage                  32953
Categories                32953
hashid                    34727
issnkey                   32953
Total Cites               12325
jcr_value                 12325
Eigenfactor Score         12325
dtype: int64>

# Unindo todos os dados

In [275]:
def get_first_non_null(dfrow, columns_to_search):
    for c in columns_to_search:
        if pd.notnull(dfrow[c]):
            return dfrow[c]
    return None

In [270]:
# def get_first_non_null(dfrow, columns_to_search):
dfrow = df_bib[df_bib.isbn == '9781450329972']
columns_to_search = ['issnkey', 'issn', 'isbn']
# print(dfrow)
for c in columns_to_search:
    a = pd.isnull(dfrow[c])
    print(a, dfrow[c])
    
    # #         return dfrow[c]
    # print(c)
    # # return None

2    False
Name: issnkey, dtype: bool 2    
Name: issnkey, dtype: object
2    False
Name: issn, dtype: bool 2    
Name: issn, dtype: object
2    False
Name: isbn, dtype: bool 2    9781450329972
Name: isbn, dtype: object


In [276]:
# cols = ['author', 'title', 'keywords', 'year', 'type_publication', 'doi']

cols = ['issnkey', 'issn', 'isbn', 'journal', 'publisher', 'title', 'booktitle', 'doi', 'author']
df_bib = pd.DataFrame(data=dict_bib.values())

df_bib['issnkey'] = rmscaract(df_bib['issn'])

# df_bib['issnkey'] = df_bib.apply(get_first_non_null(df_bib, ['issnkey', 'isbn']))

# display(df_bib[df_bib.issn == '10020098'])

df_bib.get(cols)

Unnamed: 0,issnkey,issn,isbn,journal,publisher,title,booktitle,doi,author
0,01635948,0163-5948,,SIGSOFT Softw. Eng. Notes,Association for Computing Machinery,Software Engineering and AI for Data Quality i...,,10.1145/3502771.3502781,"[Nguyen, Phu, Sen, Sagar, Jourdan, Nicolas, Ca..."
1,01635808,0163-5808,,SIGMOD Rec.,Association for Computing Machinery,Cleanix: A Parallel Big Data Cleaning System,,10.1145/2935694.2935702,"[Wang, Hongzhi, Li, Mingda, Bu, Yingyi, Li, Ji..."
2,,,9781450329972,,Association for Computing Machinery,"""Big Metadata"": The Need for Principled Metada...",Proceedings of Workshop on Data Analytics in t...,10.1145/2627770.2627776,"[Smith, Ken, Seligman, Len, Rosenthal, Arnon, ..."
3,19361955,1936-1955,,J. Data and Information Quality,Association for Computing Machinery,Editorial: Special Issue on Web Data Quality,,10.1145/3005395,"[Bizer, Christian, Dong, Luna, Ilyas, Ihab, Vi..."
4,,,9781450327589,,Association for Computing Machinery,BigDansing: A System for Big Data Cleansing,Proceedings of the 2015 ACM SIGMOD Internation...,10.1145/2723372.2747646,"[Khayyat, Zuhair, Ilyas, Ihab, Jindal, Alekh, ..."
...,...,...,...,...,...,...,...,...,...
6026,00165107,0016-5107,,Gastrointestinal Endoscopy,,Comprehensive review of publicly available col...,,10.1016/j.gie.2022.08.043,"[Houwen, Britt, Nass, Karlijn, Vleugels, Jaspe..."
6027,13618415,1361-8415,,Medical Image Analysis,,Surgical data science – from concepts toward c...,,10.1016/j.media.2021.102306,"[Maier-Hein, Lena, Eisenmann, Matthias, Sarika..."
6028,22149147,2214-9147,,Defence Technology,,Air combat target maneuver trajectory predicti...,,10.1016/j.dt.2022.06.006,"[Zhi-fei, Xi, Ying-xin, Kou, Zhan-wu, Li, Yue,..."
6029,19427867,1942-7867,,Transportation Letters,,A review of the critical elements and developm...,,10.1080/19427867.2020.1759852,"[Emami, Azadeh, Sarvi, Majid, {Asadi Bagloee},..."


In [None]:
df_join_df = df_bib.merge(dfjournalrank, )