In [1]:
import pandas as pd
import ast
import spacy
nlp = spacy.load("en_core_web_lg")
from numpy import average, round

In [2]:
import torch

# Report the CUDA set-up this notebook runs on.
# The availability flag and the device count can be queried on any machine.
print(f"Available: {torch.cuda.is_available()}")
print(f"How many is available: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    # Per-device queries fail when no CUDA device is usable, so they are only
    # issued once availability has been confirmed.
    print(f"Which is available: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Device capability: {torch.cuda.get_device_capability(0)}")
    # total_memory is reported in bytes; 1024**3 converts it to GiB.
    print(f"Device memory: {round(torch.cuda.get_device_properties(0).total_memory/1024**3,2)} GB")
else:
    print("No CUDA device detected; running on CPU.")


Available: True
How many is available: 1
Which is available: 0
Device name: NVIDIA GeForce RTX 4060 Laptop GPU
Device capability: (8, 9)
Device memory: 7.75 GB


In [3]:
def create_results_list(data, true_labels, num_iterations):
    """
    Score each sublist of ``data`` against the label at the same position in ``true_labels``.

    Sublists and labels are paired positionally with ``zip``, so surplus entries
    on the longer side are ignored. A pair is a hit when the label occurs among
    the first ``num_iterations`` entries of its sublist.

    Parameters:
    - data: Iterable of sublists to search through
    - true_labels: Iterable of labels, one per sublist
    - num_iterations: Number of leading entries of each sublist to consider

    Returns a 3-tuple:
    - results_list: List of 1s (hit) and 0s (miss), one per pair
    - indices: Position of the label inside its sublist for a hit, None for a miss
    - accuracy: Mean of results_list rounded to 3 decimal places
      (``average`` and ``round`` are the NumPy functions imported at the top of the notebook)
    """
    results_list = []
    indices = []
    # Single pass: the pairs are consumed once, so iterator/generator inputs
    # feed both outputs, and membership is tested only once per pair.
    for sublist, true_label in zip(data, true_labels):
        hit = true_label in sublist[:num_iterations]
        results_list.append(1 if hit else 0)
        indices.append(sublist.index(true_label) if hit else None)
    return results_list, indices, round(average(results_list), 3)

def ranking(row, columns):
    """
    Merge the token candidates stored in several columns of one row into a single ranking.

    Parameters:
    - row: dict - Dict-like object indexed by column name (``ranking_dataframe``
      passes a dataframe row); each selected entry is an iterable of dicts
      holding the keys 'token' and 'score'.
    - columns: list - Names of the columns whose candidates are merged.

    Returns:
    - list - One {'token', 'score'} dict per distinct token, whose score is the
      sum over all of its occurrences, sorted by score in descending order.
    """
    totals = {}
    for column_name in columns:
        for candidate in row[column_name]:
            key, value = candidate['token'], candidate['score']
            if key not in totals:
                totals[key] = value
            else:
                totals[key] += value

    # sorted() is stable, so tokens with equal totals keep first-seen order.
    ordered = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return [{'token': key, 'score': total} for key, total in ordered]

def ranking_dataframe(df, columns):
    """
    Compute the merged token ranking of every row of a dataframe.

    Parameters:
    - df: DataFrame - Input dataframe holding the data.
    - columns: list - Columns handed to ``ranking`` to build each row's ranking.

    Returns:
    - tokens_finais: list - One ranking (the value returned by ``ranking``) per
      row, in the order produced by ``df.iterrows()``.
    """
    tokens_finais = []
    for _, linha in df.iterrows():
        tokens_finais.append(ranking(linha, columns))
    return tokens_finais


# Parse the text form of a Python literal (such as a list) back into an object
def str_to_list(x):
    """
    Turn the string form of a Python literal back into the object it describes.

    Parameters:
    x (str): Text to parse, e.g. "[{'token': 'a', 'score': 0.1}]".

    Returns:
    list: Whatever ``ast.literal_eval`` builds from ``x`` (a list for list text).
          If parsing raises SyntaxError or ValueError, ``x`` is returned unchanged.
    """
    try:
        parsed = ast.literal_eval(x)
    except (SyntaxError, ValueError):
        return x
    else:
        return parsed

# Collect the 'token' value of every candidate dict
def extrair_tokens(lista_dicts):
    """
    Pull the tokens out of a list of dictionaries.

    Parameters:
    - lista_dicts (list): List of dictionaries, each containing the key 'token'.

    Returns:
    - A list with the value stored under 'token' in each dictionary, in input order.

    Example:
    >>> lista = [{'token': 'Olá'}, {'token': 'mundo'}, {'token': '!'}]
    >>> extrair_tokens(lista)
    ['Olá', 'mundo', '!']
    """
    tokens = []
    for entrada in lista_dicts:
        tokens.append(entrada['token'])
    return tokens

In [4]:
def _filtrar_por_entidade(lista, rotulo):
    """Keep the items of ``lista`` whose token contains an entity labelled ``rotulo``.

    Each token is run through ``nlp``, the spaCy pipeline loaded at the top of
    the notebook. Matching items are returned as fresh {'token', 'score'} dicts
    in their input order.
    """
    selecionados = []
    for item in lista:
        token = item['token']
        score = item['score']
        doc = nlp(token)
        # One entity with the requested label is enough to keep the item.
        if any(ent.label_ == rotulo for ent in doc.ents):
            selecionados.append({'token': token, 'score': score})
    return selecionados


def filtrar_GPE(lista):
    """
    Keep only the candidates recognised as GPE entities (countries, cities, states).

    Parameters:
    - lista (list): Dicts holding the keys 'token' and 'score'.

    Returns:
    - list: New {'token', 'score'} dicts for the matching candidates, in input order.
    """
    return _filtrar_por_entidade(lista, 'GPE')


def filtrar_ORG(lista):
    """
    Keep only the candidates recognised as ORG entities (companies, agencies, institutions, etc.).

    Parameters:
    - lista (list): Dicts holding the keys 'token' and 'score'.

    Returns:
    - list: New {'token', 'score'} dicts for the matching candidates, in input order.
    """
    return _filtrar_por_entidade(lista, 'ORG')


def filtrar_NORP(lista):
    """
    Keep only the candidates recognised as NORP entities (nationalities or religious or political groups).

    Parameters:
    - lista (list): Dicts holding the keys 'token' and 'score'.

    Returns:
    - list: New {'token', 'score'} dicts for the matching candidates, in input order.
    """
    return _filtrar_por_entidade(lista, 'NORP')

def _zerar_sem_entidade(lista, rotulo):
    """Zero the score of every item whose token has no entity labelled ``rotulo``.

    Each token is run through ``nlp``, the spaCy pipeline loaded at the top of
    the notebook. All items are kept; the result is a new list of
    {'token', 'score'} dicts sorted by score in descending order.
    """
    reavaliados = []
    for item in lista:
        token = item['token']
        score = item['score']
        doc = nlp(token)
        token_score = score if any(ent.label_ == rotulo for ent in doc.ents) else 0.0
        reavaliados.append({'token': token, 'score': token_score})

    # list.sort is stable, so items with equal scores keep their input order.
    reavaliados.sort(key=lambda x: x['score'], reverse=True)
    return reavaliados


def zerando_nao_GPE(lista):
    """
    Set to 0.0 the score of candidates that are not GPE entities (countries, cities, states).

    Parameters:
    - lista (list): Dicts holding the keys 'token' and 'score'.

    Returns:
    - list: New {'token', 'score'} dicts for every candidate, sorted by score descending.
    """
    return _zerar_sem_entidade(lista, 'GPE')


def zerando_nao_ORG(lista):
    """
    Set to 0.0 the score of candidates that are not ORG entities (companies, agencies, institutions, etc.).

    Parameters:
    - lista (list): Dicts holding the keys 'token' and 'score'.

    Returns:
    - list: New {'token', 'score'} dicts for every candidate, sorted by score descending.
    """
    return _zerar_sem_entidade(lista, 'ORG')


def zerando_nao_NORP(lista):
    """
    Set to 0.0 the score of candidates that are not NORP entities (nationalities or religious or political groups).

    Parameters:
    - lista (list): Dicts holding the keys 'token' and 'score'.

    Returns:
    - list: New {'token', 'score'} dicts for every candidate, sorted by score descending.
    """
    return _zerar_sem_entidade(lista, 'NORP')

In [5]:
# Root folders of the output data. Every folder below is derived from one of
# these two roots, so relocating the data means editing a single line each.
_ONE_FEW_SHOT_ROOT = '/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/'
_DATA_V2_ROOT = '/home/rafael/tese/code/data_v2/outputs/'

# "v2" folders of the few-shot runs, one per model.
bert_pasta_few = _ONE_FEW_SHOT_ROOT + 'few/bert/v2/'
roberta_pasta_few = _ONE_FEW_SHOT_ROOT + 'few/roberta/v2/'
electra_pasta_few = _ONE_FEW_SHOT_ROOT + 'few/electra/v2/'

# "v2" folders of the one-shot runs, one per model.
bert_pasta_one = _ONE_FEW_SHOT_ROOT + 'one/bert/v2/'
roberta_pasta_one = _ONE_FEW_SHOT_ROOT + 'one/roberta/v2/'
electra_pasta_one = _ONE_FEW_SHOT_ROOT + 'one/electra/v2/'

# Per-model folders under data_v2 (no one/few-shot component in the path).
bert_pasta = _DATA_V2_ROOT + 'bert/'
roberta_pasta = _DATA_V2_ROOT + 'roberta/'
electra_pasta = _DATA_V2_ROOT + 'electra/'

In [6]:
# Placeholder prediction list: `top_k` independent copies of an empty candidate
# (token '___', score 0.0). The cells below substitute it for missing values.
top_k = 10
dados = {'token': '___', 'score': 0.0}
lista_de_dicionarios = [{**dados} for _ in range(top_k)]

## Bert

In [20]:
# Few-shot BERT outputs (Born In, DiedIn, Capital, Worksfor, Citizen).
# For each relation: load the raw CSV, drop the 'Unnamed: 0' column, replace
# every missing cell with the placeholder list `lista_de_dicionarios`, and save
# the result in the few-shot "v2" folder. `to_csv` keeps its default index.
_bert_few_src = '/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/few/bert/'
_bert_few = {}
for _relation in ('bornIn', 'diedIn', 'capital', 'worksfor', 'citizen'):
    _frame = pd.read_csv(f'{_bert_few_src}outputs_{_relation}_t5_bert_few_shoot.csv')
    _frame = _frame.drop(columns=['Unnamed: 0'])
    for _column in _frame.columns:
        _frame[_column] = _frame[_column].apply(lambda x: lista_de_dicionarios if pd.isna(x) else x)
    _frame.to_csv(f'{bert_pasta_few}outputs_{_relation}_t5_bert_few_shoot_v2.csv')
    _bert_few[_relation] = _frame

# Expose each frame under its per-relation name.
bornIn_bert = _bert_few['bornIn']
diedIn_bert = _bert_few['diedIn']
capital_bert = _bert_few['capital']
worksfor_bert = _bert_few['worksfor']
citizen_bert = _bert_few['citizen']

In [8]:
# Language relation, BERT: one-shot, few-shot, and the run stored without a
# one/few-shot suffix. Each (source, destination) pair is loaded, stripped of
# its 'Unnamed: 0' column, has every missing cell replaced with the placeholder
# list `lista_de_dicionarios`, and is then saved. `to_csv` keeps its default index.
_language_bert_jobs = [
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/one/bert/outputs_language_t5_bert_one_shoot.csv',
     bert_pasta_one + 'outputs_language_t5_bert_one_shoot_v2.csv'),
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/few/bert/outputs_language_t5_bert_few_shoot.csv',
     bert_pasta_few + 'outputs_language_t5_bert_few_shoot_v2.csv'),
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/bert/outputs_language_t5_bert.csv',
     bert_pasta + 'outputs_language_bert_v2.csv'),
]
_language_bert_frames = []
for _source, _destination in _language_bert_jobs:
    _frame = pd.read_csv(_source).drop(columns=['Unnamed: 0'])
    for _column in _frame.columns:
        _frame[_column] = _frame[_column].apply(lambda x: lista_de_dicionarios if pd.isna(x) else x)
    _frame.to_csv(_destination)
    _language_bert_frames.append(_frame)

# Expose the frames under their per-setting names (same order as the jobs).
language_bert_one, language_bert_few, language_bert = _language_bert_frames


## Roberta

In [21]:
# Few-shot RoBERTa outputs (Born In, DiedIn, Capital, Worksfor, Citizen).
# For each relation: load the raw CSV, drop the 'Unnamed: 0' column, replace
# every missing cell with the placeholder list `lista_de_dicionarios`, and save
# the result in the few-shot "v2" folder. `to_csv` keeps its default index.
_roberta_few_src = '/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/few/roberta/'
_roberta_few = {}
for _relation in ('bornIn', 'diedIn', 'capital', 'worksfor', 'citizen'):
    _frame = pd.read_csv(f'{_roberta_few_src}outputs_{_relation}_t5_roberta_few_shoot.csv')
    _frame = _frame.drop(columns=['Unnamed: 0'])
    for _column in _frame.columns:
        _frame[_column] = _frame[_column].apply(lambda x: lista_de_dicionarios if pd.isna(x) else x)
    _frame.to_csv(f'{roberta_pasta_few}outputs_{_relation}_t5_roberta_few_shoot_v2.csv')
    _roberta_few[_relation] = _frame

# Expose each frame under its per-relation name.
bornIn_roberta = _roberta_few['bornIn']
diedIn_roberta = _roberta_few['diedIn']
capital_roberta = _roberta_few['capital']
worksfor_roberta = _roberta_few['worksfor']
citizen_roberta = _roberta_few['citizen']

In [9]:
# Language relation, RoBERTa: one-shot, few-shot, and the run stored without a
# one/few-shot suffix. Each (source, destination) pair is loaded, stripped of
# its 'Unnamed: 0' column, has every missing cell replaced with the placeholder
# list `lista_de_dicionarios`, and is then saved. `to_csv` keeps its default index.
_language_roberta_jobs = [
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/one/roberta/outputs_language_t5_roberta_one_shoot.csv',
     roberta_pasta_one + 'outputs_language_t5_roberta_one_shoot_v2.csv'),
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/few/roberta/outputs_language_t5_roberta_few_shoot.csv',
     roberta_pasta_few + 'outputs_language_t5_roberta_few_shoot_v2.csv'),
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/roberta/outputs_language_t5_roberta.csv',
     roberta_pasta + 'outputs_language_roberta_v2.csv'),
]
_language_roberta_frames = []
for _source, _destination in _language_roberta_jobs:
    _frame = pd.read_csv(_source).drop(columns=['Unnamed: 0'])
    for _column in _frame.columns:
        _frame[_column] = _frame[_column].apply(lambda x: lista_de_dicionarios if pd.isna(x) else x)
    _frame.to_csv(_destination)
    _language_roberta_frames.append(_frame)

# Expose the frames under their per-setting names (same order as the jobs).
language_roberta_one, language_roberta_few, language_roberta = _language_roberta_frames

## Electra

In [22]:
# Few-shot ELECTRA outputs (Born In, DiedIn, Capital, Worksfor, Citizen).
# For each relation: load the raw CSV, drop the 'Unnamed: 0' column, replace
# every missing cell with the placeholder list `lista_de_dicionarios`, and save
# the result in the few-shot "v2" folder. `to_csv` keeps its default index.
_electra_few_src = '/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/few/electra/'
_electra_few = {}
for _relation in ('bornIn', 'diedIn', 'capital', 'worksfor', 'citizen'):
    _frame = pd.read_csv(f'{_electra_few_src}outputs_{_relation}_t5_electra_few_shoot.csv')
    _frame = _frame.drop(columns=['Unnamed: 0'])
    for _column in _frame.columns:
        _frame[_column] = _frame[_column].apply(lambda x: lista_de_dicionarios if pd.isna(x) else x)
    _frame.to_csv(f'{electra_pasta_few}outputs_{_relation}_t5_electra_few_shoot_v2.csv')
    _electra_few[_relation] = _frame

# Expose each frame under its per-relation name.
bornIn_electra = _electra_few['bornIn']
diedIn_electra = _electra_few['diedIn']
capital_electra = _electra_few['capital']
worksfor_electra = _electra_few['worksfor']
citizen_electra = _electra_few['citizen']

In [10]:
# Language relation, ELECTRA: one-shot, few-shot, and the run stored without a
# one/few-shot suffix. Each (source, destination) pair is loaded, stripped of
# its 'Unnamed: 0' column, has every missing cell replaced with the placeholder
# list `lista_de_dicionarios`, and is then saved. `to_csv` keeps its default index.
_language_electra_jobs = [
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/one/electra/outputs_language_t5_electra_one_shoot.csv',
     electra_pasta_one + 'outputs_language_t5_electra_one_shoot_v2.csv'),
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/One_Few-Shoot/few/electra/outputs_language_t5_electra_few_shoot.csv',
     electra_pasta_few + 'outputs_language_t5_electra_few_shoot_v2.csv'),
    ('/home/rafael/tese/code/data/novos_data/dados/outputs/electra/outputs_language_t5_electra.csv',
     electra_pasta + 'outputs_language_electra_v2.csv'),
]
_language_electra_frames = []
for _source, _destination in _language_electra_jobs:
    _frame = pd.read_csv(_source).drop(columns=['Unnamed: 0'])
    for _column in _frame.columns:
        _frame[_column] = _frame[_column].apply(lambda x: lista_de_dicionarios if pd.isna(x) else x)
    _frame.to_csv(_destination)
    _language_electra_frames.append(_frame)

# Expose the frames under their per-setting names (same order as the jobs).
language_electra_one, language_electra_few, language_electra = _language_electra_frames