In [32]:
import pandas as pd

# Load the raw-data DataFrame from the CSV export.
# NOTE(review): this is an absolute, machine-specific path — consider moving
# it to a configurable data directory so the notebook can run elsewhere.
input_path = r'C:\Users\Raissa Rabello\Desktop\revistas_brasileiras.csv'

try:
    # Read the previously saved data; only this call is guarded.
    df = pd.read_csv(input_path)
except FileNotFoundError:
    # Report which path was missing, then re-raise so the failure is not
    # silently swallowed.
    print(f'O arquivo {input_path} não foi encontrado.')
    raise
else:
    # Plain string literal: the message has no placeholders, so it does not
    # need an f-prefix.
    print('Arquivos carregados com sucesso.')

Arquivos carregados com sucesso.


In [33]:
# Columns to keep from the raw DataFrame.
colunas_desejadas = ['id', 'issn_l', 'display_name', 'host_organization_name', 'works_count', 'cited_by_count',
                     'country_code', 'type', 'updated_date', 'created_date']

# Split the wanted columns into present / missing, preserving the order above
# so the report below is deterministic (a set difference has no stable order).
colunas_existentes = [col for col in colunas_desejadas if col in df.columns]
colunas_ausentes = [col for col in colunas_desejadas if col not in df.columns]

if colunas_ausentes:
    print('As seguintes colunas não foram encontradas no DF:',
          colunas_ausentes)

# Keep only the columns that exist. The explicit .copy() makes df_filter an
# independent DataFrame, so adding or modifying columns on it later neither
# touches `df` nor triggers SettingWithCopyWarning.
df_filter = df[colunas_existentes].copy()


In [34]:
# Show the first five rows of the filtered DataFrame; as the cell's last
# expression, Jupyter renders it as a table.
df_filter.head(n=5)


Unnamed: 0,id,issn_l,display_name,host_organization_name,works_count,cited_by_count,country_code,type,updated_date,created_date
0,https://openalex.org/S2739064124,2525-3409,Research Society and Development,Grupo de Pesquisa Metodologias em Ensino e Apr...,28397,44752,BR,journal,2024-09-29T17:28:43.911814,2017-07-31
1,https://openalex.org/S4210234566,2525-8761,Brazilian Journal of Development,Brazilian Journal of Development,23599,16619,BR,journal,2024-09-27T08:36:50.978538,2022-02-03
2,https://openalex.org/S4210195735,0034-8007,Revista de Direito Administrativo,Fundacao Getulio Varagas,13375,5498,BR,journal,2024-09-27T15:30:40.343761,2022-02-03
3,https://openalex.org/S166523383,0004-282X,Arquivos de Neuro-Psiquiatria,Thieme Medical Publishers (Germany),9411,88346,BR,journal,2024-09-27T10:01:20.666471,2016-06-24
4,https://openalex.org/S150752939,0100-204X,Pesquisa Agropecuária Brasileira,Embrapa Informação Tecnológica,9274,101275,BR,journal,2024-09-27T11:00:44.075300,2016-06-24


In [35]:
# Inspect the data type pandas assigned to each column.
tipos_colunas = df_filter.dtypes
print(tipos_colunas)


id                        object
issn_l                    object
display_name              object
host_organization_name    object
works_count                int64
cited_by_count             int64
country_code              object
type                      object
updated_date              object
created_date              object
dtype: object


In [36]:
# Count how many null values each column contains.
nulos_por_coluna = df_filter.isna().sum()
print(nulos_por_coluna)

id                           0
issn_l                      91
display_name                 0
host_organization_name    1886
works_count                  0
cited_by_count               0
country_code                 0
type                         0
updated_date                 0
created_date                 0
dtype: int64


In [42]:
# Columns kept in the treated dataset. 'host_organization_name' is not in
# this list. NOTE(review): presumably dropped because of its many missing
# values (see the null-count cell) — confirm this is intentional.
colunas_tratadas = ['id', 'issn_l', 'display_name', 'works_count', 'cited_by_count',
                    'country_code', 'type', 'updated_date', 'created_date']

# Select first, then copy: only the kept columns are duplicated, instead of
# deep-copying the whole raw DataFrame and discarding part of it. The result
# is still independent of `df`, so the assignment below does not affect it.
df_filter = df[colunas_tratadas].copy()

# Add a 'date_today' column holding today's date (same value on every row).
df_filter['date_today'] = pd.Timestamp('today').date()

# Print the first rows to confirm the result.
print(df_filter.head())


                                 id     issn_l  \
0  https://openalex.org/S2739064124  2525-3409   
1  https://openalex.org/S4210234566  2525-8761   
2  https://openalex.org/S4210195735  0034-8007   
3   https://openalex.org/S166523383  0004-282X   
4   https://openalex.org/S150752939  0100-204X   

                        display_name  works_count  cited_by_count  \
0   Research Society and Development        28397           44752   
1   Brazilian Journal of Development        23599           16619   
2  Revista de Direito Administrativo        13375            5498   
3      Arquivos de Neuro-Psiquiatria         9411           88346   
4   Pesquisa Agropecuária Brasileira         9274          101275   

  country_code     type                updated_date created_date  date_today  
0           BR  journal  2024-09-29T17:28:43.911814   2017-07-31  2024-09-30  
1           BR  journal  2024-09-27T08:36:50.978538   2022-02-03  2024-09-30  
2           BR  journal  2024-09-27T15:30:40.34

In [43]:
# Check the first five rows to confirm the result.
df_filter.head(n=5)

Unnamed: 0,id,issn_l,display_name,works_count,cited_by_count,country_code,type,updated_date,created_date,date_today
0,https://openalex.org/S2739064124,2525-3409,Research Society and Development,28397,44752,BR,journal,2024-09-29T17:28:43.911814,2017-07-31,2024-09-30
1,https://openalex.org/S4210234566,2525-8761,Brazilian Journal of Development,23599,16619,BR,journal,2024-09-27T08:36:50.978538,2022-02-03,2024-09-30
2,https://openalex.org/S4210195735,0034-8007,Revista de Direito Administrativo,13375,5498,BR,journal,2024-09-27T15:30:40.343761,2022-02-03,2024-09-30
3,https://openalex.org/S166523383,0004-282X,Arquivos de Neuro-Psiquiatria,9411,88346,BR,journal,2024-09-27T10:01:20.666471,2016-06-24,2024-09-30
4,https://openalex.org/S150752939,0100-204X,Pesquisa Agropecuária Brasileira,9274,101275,BR,journal,2024-09-27T11:00:44.075300,2016-06-24,2024-09-30


In [44]:
# Destination file for the treated (filtered) DataFrame.
output_path_tratado = r'C:\Users\Raissa Rabello\Desktop\revistas_brasileiras_tratadas.csv'

# Write the CSV only when there is at least one row; otherwise just report
# that there was nothing to save.
if df_filter.empty:
    print("Nenhum dado para salvar após o tratamento.")
else:
    df_filter.to_csv(output_path_tratado, encoding='utf-8', index=False)
    print(f"Dados tratados salvos em '{output_path_tratado}'")

Dados tratados salvos em 'C:\Users\Raissa Rabello\Desktop\revistas_brasileiras_tratadas.csv'
