In [1]:
import pandas as pd
import numpy as np

In [21]:
# Load the metadata table from a CSV given by a relative path, so the file must
# be reachable from the notebook's working directory.
df = pd.read_csv('metadados_completos_sol_sbc.csv')

In [12]:
# Quick structural check: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29678 entries, 0 to 29677
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      29678 non-null  object
 1   Category   29678 non-null  object
 2   URL_Title  29678 non-null  object
 3   Authors    29678 non-null  object
 4   Event      29678 non-null  object
 5   Date       29678 non-null  object
 6   Box        29678 non-null  object
 7   Abstract   29677 non-null  object
 8   Keywords   29678 non-null  object
 9   Publisher  29678 non-null  object
 10  URL_Paper  29678 non-null  object
dtypes: object(11)
memory usage: 2.5+ MB


## Pré-processamento

In [22]:
def normalizar_autores(campo):
    """Rewrite a ``;``-separated author list from "Surname, Given" to "Given Surname".

    Each entry is split on its first comma only; the text after the comma is
    moved in front of the text before it. Entries without a comma are kept in
    their original word order. Runs of whitespace are collapsed to one space in
    every entry, and entries that end up empty (for example from a trailing
    ``;`` or a doubled ``;;``) are dropped instead of leaving dangling
    separators in the result.

    Parameters
    ----------
    campo : str or missing value
        Raw author field, e.g. ``"Chacón, Lenin; Moraga, Kevin"``.

    Returns
    -------
    str or missing value
        Authors joined with ``"; "``. A missing input (anything ``pd.isna``
        reports as missing) is returned unchanged.
    """
    if pd.isna(campo):
        return campo

    autores_formatados = []

    for autor in campo.split(";"):
        if "," in autor:
            # Split on the first comma only, so any further commas stay with
            # the given-name part.
            sobrenome_final, restante = autor.split(",", 1)
            autor = f"{restante} {sobrenome_final}"

        # split()/join trims the ends and collapses inner whitespace in one go.
        nome_completo = " ".join(autor.split())
        if nome_completo:
            autores_formatados.append(nome_completo)

    return "; ".join(autores_formatados)

# Store the normalized author list in a new column; the raw "Authors" column is
# left as it was.
df["autores_tratados"] = df["Authors"].apply(normalizar_autores)

In [16]:
pip install gender-guesser

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import gender_guesser.detector as gender

# Module-level detector instance; analisar_genero_autores reads it as a global.
# NOTE(review): case_sensitive=False presumably makes name lookups ignore
# letter case — confirm against the gender-guesser documentation.
detector = gender.Detector(case_sensitive=False)

def analisar_genero_autores(campo):
    """Summarise the guessed gender of the authors in a ``;``-separated field.

    The first whitespace-separated token of each author entry is passed to the
    module-level ``detector.get_gender`` and the returned labels are
    aggregated.

    Parameters
    ----------
    campo : str or missing value
        Author names separated by ``;``, given name first.

    Returns
    -------
    pandas.Series
        Three positional values, always stored with ``dtype=object``:

        0. label returned for the first author, or ``None`` when there is no
           author;
        1. number of authors labelled ``"female"`` or ``"mostly_female"``;
        2. number of authors labelled ``"unknown"``.

    Notes
    -----
    ``dtype=object`` is set explicitly because ``pd.Series([None, 0, 0])`` is
    otherwise inferred as float64 (``NaN, 0.0, 0.0``). Once such a row is
    combined with the rows from the normal path, the two count columns end up
    as floats instead of integers.
    """
    sem_autores = [None, 0, 0]

    # Missing field (NaN / None).
    if pd.isna(campo):
        return pd.Series(sem_autores, dtype=object)

    autores = [a.strip() for a in campo.split(";") if a.strip()]

    # Empty string, or only separators/whitespace: no usable author entry.
    if not autores:
        return pd.Series(sem_autores, dtype=object)

    # Every entry is non-empty after strip(), so split()[0] always exists.
    primeiros_nomes = [autor.split()[0] for autor in autores]
    generos = [detector.get_gender(nome) for nome in primeiros_nomes]

    # Label of the first listed author.
    sexo_primeiro_autor = generos[0]

    # Number of female authors (female + mostly_female).
    qtd_feminino = sum(g in {"female", "mostly_female"} for g in generos)

    # Number of authors whose label is "unknown".
    qtd_unknown = generos.count("unknown")

    return pd.Series(
        [sexo_primeiro_autor, qtd_feminino, qtd_unknown],
        dtype=object,
    )

In [24]:
# analisar_genero_autores returns a 3-value Series per row, so apply() yields
# three columns; they are stored under the names listed below (first-author
# label, count of female authors, count of "unknown" authors).
df[
    [
        "Sexo Primeiro Autor",
        "Qtd Autores Feminino",
        "Qtd Autores Unknown"
    ]
] = df["autores_tratados"].apply(analisar_genero_autores)

In [25]:
# Preview the first rows to check the new author and gender columns.
df.head()

Unnamed: 0,Title,Category,URL_Title,Authors,Event,Date,Box,Abstract,Keywords,Publisher,URL_Paper,autores_tratados,Sexo Primeiro Autor,Qtd Autores Feminino,Qtd Autores Unknown
0,A DAG-Based Post-Quantum Ledger,ANAIS DE EVENTO,https://sol.sbc.org.br/index.php/ladc_estendid...,"Freitas, Allan Edgard Silva",COMPANION PROCEEDINGS OF THE LATIN-AMERICAN SY...,2025-10-27,Caixa 1,Quantum computing threatens foundational crypt...,['No keywords available'],PDF (English),https://sol.sbc.org.br/index.php/ladc_estendid...,Allan Edgard Silva Freitas,male,0.0,0.0
1,AI resources governance with OpenDID: Strategy...,ANAIS DE EVENTO,https://sol.sbc.org.br/index.php/ladc_estendid...,"Chacón, Lenin; Moraga, Kevin",COMPANION PROCEEDINGS OF THE LATIN-AMERICAN SY...,2025-10-27,Caixa 1,The accelerated adoption of artificial intelli...,"['Artificial Intelligence (AI)', 'Decentralize...",PDF (English),https://sol.sbc.org.br/index.php/ladc_estendid...,Lenin Chacón; Kevin Moraga,unknown,0.0,1.0
2,Digital Academic Certification with Blockchain...,ANAIS DE EVENTO,https://sol.sbc.org.br/index.php/ladc_estendid...,"Blanco, Pablo; Betarte, Gustavo; Luna, Carlos;...",COMPANION PROCEEDINGS OF THE LATIN-AMERICAN SY...,2025-10-27,Caixa 1,Academic certificates are essential credential...,"['Academic Certification', 'Blockchain', 'Hype...",PDF (English),https://sol.sbc.org.br/index.php/ladc_estendid...,Pablo Blanco; Gustavo Betarte; Carlos Luna; Ma...,male,1.0,0.0
3,An API-Driven Framework for Performance Testin...,ANAIS DE EVENTO,https://sol.sbc.org.br/index.php/ladc_estendid...,"Cardoso, Carlos; Silva, Caio; Veloso, Alan; So...",COMPANION PROCEEDINGS OF THE LATIN-AMERICAN SY...,2025-10-27,Caixa 1,As Distributed Ledger Technologies (DLTs) matu...,"['Performance Testing', 'Blockchain', 'Apache ...",PDF (English),https://sol.sbc.org.br/index.php/ladc_estendid...,Carlos Cardoso; Caio Silva; Alan Veloso; Jeffs...,male,0.0,2.0
4,Enhancing Data Provenance in mHealth: An Archi...,ANAIS DE EVENTO,https://sol.sbc.org.br/index.php/ladc_estendid...,"Velasco, Gislainy Crisostomo; Vaz, Noeli Antôn...",COMPANION PROCEEDINGS OF THE LATIN-AMERICAN SY...,2025-10-27,Caixa 1,The advancement of digitalization in healthcar...,"['Data Provenance', 'W3C PROV', 'mHealth', 'Bl...",PDF (English),https://sol.sbc.org.br/index.php/ladc_estendid...,Gislainy Crisostomo Velasco; Noeli Antônia Pim...,unknown,0.0,2.0


In [26]:
# Write the enriched table to a new CSV file (the input file is not
# overwritten); index=False leaves the row index out of the output.
df.to_csv('metadados_completos_sol_sbc_com_genero.csv', index=False)