In [None]:
import pandas as pd
import numpy as np

# Filtrar

## Carga inicial

In [None]:
# Columns kept from every export:
# "Key","Item Type","Publication Year","Author","Title","Publication Title","ISBN","ISSN","DOI","Url","Abstract Note"
columns = [
    'Key', 'Item Type', 'Publication Year', 'Author', 'Title',
    'Publication Title', 'ISBN', 'ISSN', 'DOI', 'Url', 'Abstract Note',
]


def _load_export(csv_path):
    """Read one exported CSV and keep only the shared ``columns``."""
    return pd.read_csv(csv_path)[columns]


# Scopus exports (files P1-P4).
df_scopus_p1 = _load_export('data/Scopus_P1.csv')
df_scopus_p2 = _load_export('data/Scopus_P2.csv')
df_scopus_p3 = _load_export('data/Scopus_P3.csv')
df_scopus_p4 = _load_export('data/Scopus_P4.csv')

# IEEE exports (files P1-P4).
df_ieee_p1 = _load_export('data/IEEE_P1.csv')
df_ieee_p2 = _load_export('data/IEEE_P2.csv')
df_ieee_p3 = _load_export('data/IEEE_P3.csv')
df_ieee_p4 = _load_export('data/IEEE_P4.csv')

# Web of Science exports (files P1-P4).
df_wos_p1 = _load_export('data/WoS_P1.csv')
df_wos_p2 = _load_export('data/WoS_P2.csv')
df_wos_p3 = _load_export('data/WoS_P3.csv')
df_wos_p4 = _load_export('data/WoS_P4.csv')



In [None]:
# Stack every per-source export into a single frame.
df_joined = pd.concat([
    df_scopus_p1, df_scopus_p2, df_scopus_p3, df_scopus_p4,
    df_ieee_p1, df_ieee_p2, df_ieee_p3, df_ieee_p4,
    df_wos_p1, df_wos_p2, df_wos_p3, df_wos_p4,
])

# Order by title; the stable Url sort below preserves this order among rows
# that share the same Url (or that all lack one).
df_joined = df_joined.sort_values(by=['Title'])

print("Before dropping duplicates by title: {}".format(df_joined.shape))
# Sort by 'Url' with NaN first, so that within a group of duplicates a row
# that has a URL ends up last. The sort must be stable: the default algorithm
# may reorder ties, which would discard the title ordering established above.
df_joined = df_joined.sort_values('Url', na_position='first', kind='mergesort')
# Keep the last occurrence (the one with a URL, if any exists).
df_joined = df_joined.drop_duplicates(subset=['Title'], keep='last')
print("After dropping duplicates by title: {}".format(df_joined.shape))

# Helper column with the lower-cased title, used for case-insensitive matching.
df_joined['Title_lower'] = df_joined['Title'].str.lower()

# Remove duplicates that differ only by letter case.
print("Before dropping duplicates by title_lower: {}".format(df_joined.shape))
# keep='last' for the same reason as above: with the NaN-first Url ordering,
# the default keep='first' would retain the row WITHOUT a URL.
df_joined = df_joined.drop_duplicates(subset=['Title_lower'], keep='last')
print("After dropping duplicates by title_lower: {}".format(df_joined.shape))

# De-duplication by DOI is currently disabled:

# print("Before dropping duplicates by DOI: {}".format(df_joined.shape))
# df_joined = df_joined.drop_duplicates(subset=['DOI'])
# print("After dropping duplicates by DOI: {}".format(df_joined.shape))

In [None]:
# ISSN and ISBN are not needed from here on; remove both columns.
df_joined = df_joined.drop(columns=['ISSN', 'ISBN'])
df_joined.head()

In [None]:
# Write the merged table to disk, ordered by title and then by author.
merged_sort_keys = ['Title', 'Author']
df_joined = df_joined.sort_values(by=merged_sort_keys)
df_joined.to_csv('data/merged.csv', index=False)

## Filtros


In [None]:
# Load the screening sheet. The path uses a forward slash like every other
# path in this notebook: the previous 'data\F...' form relied on an invalid
# escape sequence and only resolved on Windows.
df_fuentes = pd.read_csv('data/Formulario de extracción - Filtrado.csv')

# Every column from position 9 onwards is treated as a criterion flag;
# empty cells are replaced by 0 so the flags can be summed.
criterios = df_fuentes.columns[9:]
df_fuentes[criterios] = df_fuentes[criterios].replace(np.nan, 0)

In [None]:
# Per-row total of the criteria columns.
df_fuentes['Criterios'] = df_fuentes[criterios].sum(axis='columns')

# Rows whose total is 0 are the ones written to the filtered file.
no_criteria_hit = df_fuentes['Criterios'] == 0
df_fuentes[no_criteria_hit].to_csv('data/filtered.csv', index=False)

# Incluir ACL

## CSV inicial

In [None]:
# Parse the BibTeX file data/anthology+abstracts.bib into `library`.
import bibtexparser

acl_bib_path = 'data/anthology+abstracts.bib'
library = bibtexparser.parse_file(acl_bib_path)

In [None]:
# How many entries were parsed.
entry_count = len(library.entries)
print("Number of entries: {}".format(entry_count))

# Author's note: 93386 entries in ACL Anthology; we get 92942.

# Distinct field names that appear in at least one entry.
fields = {
    field_name
    for entry in library.entries
    for field_name in entry.fields_dict.keys()
}
print("Fields: {}".format(fields))

In [None]:
# Keep `fields` available as a list, as before.
fields = list(fields)

# One record per entry: its fields plus the BibTeX entry type under the key
# 'entry_type'. The frame is built once from the full list of records; the
# previously created empty frame, the unused alias and the manual row counter
# were dead code and are gone.
data = [
    {**entry.fields_dict, 'entry_type': entry.entry_type}
    for entry in library.entries
]

# Convert the list of records to a DataFrame.
df_bib = pd.DataFrame(data)

df_bib.tail()

In [None]:
def _unwrap_field(cell):
    """Return ``cell.value`` for bibtexparser Field objects, any other value unchanged."""
    return cell.value if isinstance(cell, bibtexparser.model.Field) else cell


# Replace every Field object by its plain value. This is done column by column
# with Series.map because DataFrame.applymap is deprecated in recent pandas,
# while this form works on both old and new versions.
df_bib = df_bib.apply(lambda column: column.map(_unwrap_field))

# BibTeX field name -> column name used by the other exports in this notebook.
mapping_dict = {
    'title': 'Title',
    'author': 'Author',
    'year': 'Publication Year',
    'booktitle': 'Publication Title',
    'doi': 'DOI',
    'url': 'Url',
    'abstract': 'Abstract Note',
    'entry_type': 'Item Type'
}

df_bib = df_bib.rename(columns=mapping_dict)

In [None]:
print("Before dropping before 2018: {}".format(df_bib.shape))

# Discard entries without a year, then keep only those from 2018 onwards.
year_column = 'Publication Year'
df_bib = df_bib.dropna(subset=[year_column])
is_2018_or_later = df_bib[year_column].astype(int) >= 2018
df_bib = df_bib[is_2018_or_later]

print("After dropping before 2018: {}".format(df_bib.shape))

In [None]:
# Entries without an abstract are removed.
abstract_columns = ['Abstract Note']
df_bib = df_bib.dropna(subset=abstract_columns, how='any')
df_bib.shape

In [None]:
# Save the filtered ACL entries as CSV.
# Note: commas inside abstracts and authors are not removed by this cell.
acl_csv_path = 'data/anthology+abstracts.csv'
df_bib.to_csv(acl_csv_path, index=False)

## Cadenas

In [None]:
# df_bib = pd.read_csv('data/anthology+abstracts.csv')

In [None]:
# Search this query in title or abstract:
#( "NLP" OR "natural language processing" ) AND ("Low resource language?" OR "Endangered language?" OR "Indigenous language?" OR "Native language?" OR "Threatened language?") AND ("Text" OR "Sentences" OR "Words") AND ("Generat*" OR "Creat*") AND ("Architecture" OR "models" OR "network")

import re

# Clauses shared by several queries.
# - "low resource" also accepts the hyphenated spelling "low-resource", which
#   the previous literal-space pattern never matched.
# - Words inside a phrase may be separated by any whitespace run.
# - Wildcards use \w* (rest of the word) instead of a greedy .*; the match /
#   no-match outcome is the same.
# The r'\b(nlp|natural language processing)\b' clause is currently disabled
# for every query.
_LANGUAGE_TERMS = (
    r'\b(low[\s-]+resource\s+languages?|endangered\s+languages?'
    r'|indigenous\s+languages?|native\s+languages?|threatened\s+languages?)\b'
)
_TEXT_TERMS = r'\b(texts?|sentences?|words?)\b'
_GENERATION_TERMS = r'\b(generat\w*|creat\w*)\b'

# Each query is a list of patterns that must ALL be found.
patternsP1 = [
    _LANGUAGE_TERMS,
    _TEXT_TERMS,
    _GENERATION_TERMS,
    r'\b(architecture|models?|networks?)\b'
]

patternsP2 = [
    _LANGUAGE_TERMS,
    _TEXT_TERMS,
    r'\bdata\b',
    r'\b(techniques?|methodolog\w*|solutions?)\b',
    _GENERATION_TERMS
]

patternsP3 = [
    _LANGUAGE_TERMS,
    _TEXT_TERMS,
    r'\baugmenta\w*\b'
]

patternsP4 = [
    _LANGUAGE_TERMS,
    _TEXT_TERMS,
    _GENERATION_TERMS,
    r'\b(evaluat\w*|assess\w*|test\w*)\b'
]


def _matches_all(text, patterns):
    """Return True when every regex in ``patterns`` is found somewhere in ``text``."""
    return all(re.search(pattern, text) for pattern in patterns)


# Lower-cased "title + abstract" text, built once and reused by all queries.
# A missing title or abstract is treated as an empty string instead of raising
# on `.lower()`.
search_text = (
    df_bib['Title'].fillna('').astype(str)
    + ' '
    + df_bib['Abstract Note'].fillna('').astype(str)
).str.lower()

# One boolean column per query.
for _flag_column, _query_patterns in (
    ('SearchP1', patternsP1),
    ('SearchP2', patternsP2),
    ('SearchP3', patternsP3),
    ('SearchP4', patternsP4),
):
    df_bib[_flag_column] = pd.Series(
        [_matches_all(text, _query_patterns) for text in search_text],
        index=search_text.index,
        dtype=bool,
    )

In [None]:
# One subset per query: the rows whose SearchP<k> flag is True.
df_bib_p1, df_bib_p2, df_bib_p3, df_bib_p4 = (
    df_bib.loc[df_bib[flag_column]]
    for flag_column in ('SearchP1', 'SearchP2', 'SearchP3', 'SearchP4')
)

# Report the size of each subset and write it to its own CSV.
dfs = [df_bib_p1, df_bib_p2, df_bib_p3, df_bib_p4]
for query_number, query_subset in enumerate(dfs, start=1):
    print("P{}: {}".format(query_number, query_subset.shape))
    query_subset.to_csv('data/anthology+abstracts_P{}.csv'.format(query_number), index=False)

# All query subsets stacked together (duplicates still included).
df_bib_concat = pd.concat(dfs)

In [None]:
# An entry matched by several queries is kept once (first occurrence per title).
df_bib_concat = df_bib_concat.drop_duplicates(subset=['Title'], keep='first')
df_bib_concat.shape



In [None]:
# Load the filtered sources saved as data/filtered.csv.
filtered_csv_path = 'data/filtered.csv'
df_filtered = pd.read_csv(filtered_csv_path)

In [None]:
# Keep, in the ACL results, the columns found at positions 1 through 8 of the
# filtered sheet.
toConserve = list(df_filtered.columns[1:9])
df_bib_concat = df_bib_concat.loc[:, toConserve]
df_bib_concat.head()

In [None]:
# Write the ACL selection to CSV without the index.
acl_filtered_path = 'data/aclFiltrado.csv'
df_bib_concat.to_csv(acl_filtered_path, index=False)

# Lista final

In [None]:
# Load the extraction form and preview its first rows.
final_form_path = 'data/Formulario de extracción - Formulario.csv'
df_final = pd.read_csv(final_form_path)
df_final.head()

In [None]:
# Rename the English column names to the Spanish headers of the form.
# The mapping is written directly in the direction it is applied
# (current name -> new name).
renaming_dict = {
    'Title': 'Título',
    'Author': 'Autor(es)',
    'Publication Year': 'Año',
    'Publication Title': 'Revista/Conferencia',
    'DOI': 'DOI',
    'Url': 'URL',
    'Abstract Note': 'Resumen',
    'Item Type': 'Tipo de publicación',
    'Key': 'ID',
}
df_final = df_final.rename(columns=renaming_dict)

df_final.head()

In [None]:
# Overwrite ID with sequential references REF1, REF2, ... in row order.
df_final['ID'] = [
    'REF' + str(position) for position in range(1, len(df_final) + 1)
]


In [None]:
# Distribution of publication types before translating. It is printed
# explicitly: a bare expression in the middle of a cell produces no output,
# so the "before" counts were previously never shown.
print(df_final['Tipo de publicación'].value_counts())

# Item-type identifier -> Spanish label, written in the direction it is applied.
renaming_dict = {
    'journalArticle': 'Artículo en revista',
    'conferencePaper': 'Artículo en conferencia',
    'book': 'Libro',
    'bookSection': 'Capítulo de libro',
    'thesis': 'Tesis',
    'other': 'Otro',
}
# Values that are not keys of the mapping are left unchanged by replace().
df_final['Tipo de publicación'] = df_final['Tipo de publicación'].replace(renaming_dict)

# Distribution after translating (shown as the cell output).
df_final['Tipo de publicación'].value_counts()

In [22]:
# Write the form back to disk. This is the same path the form was loaded
# from, so the original file is overwritten.
final_form_output_path = 'data/Formulario de extracción - Formulario.csv'
df_final.to_csv(final_form_output_path, index=False)