In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

## Carrega o csv

In [15]:
# Folder holding the pre-processed BRNews files; every path in this notebook
# is built from it.
data_dir = '../preprocessed_datasets/BRNews/'
# data_dir already ends with '/', so the joined path contains a harmless '//'.
file_path = data_dir + '/pre-processed.csv'
# Read the pre-processed CSV into a DataFrame.
df = pd.read_csv(file_path)

## Divide o dataset em treinamento, validação e teste

In [16]:
# First split: 40% of the rows are held out, the rest form the training set.
train_df, temp_df = train_test_split(
    df,
    random_state=42,
    test_size=0.4,
)
# Second split: the held-out rows are halved into validation and test sets.
# The same fixed seed keeps both splits reproducible.
val_df, test_df = train_test_split(
    temp_df,
    random_state=42,
    test_size=0.5,
)

### Adiciona a coluna 'partition' com as partições apropriadas

In [17]:
# Tag every row with the name of the split it belongs to.
# `assign` returns a new DataFrame instead of writing a column into the
# objects returned by the split, so the cell is safe to re-run and cannot
# trigger pandas' SettingWithCopyWarning.
train_df = train_df.assign(partition='train')
val_df = val_df.assign(partition='val')
test_df = test_df.assign(partition='test')

### Combina os dataframes

In [18]:
# Stack the three splits back into a single frame in train, val, test order;
# the original row labels are kept as the index.
combined_df = pd.concat((train_df, val_df, test_df), axis=0)

## Cria o arquivo .tsv

In [19]:
# Columns written to the corpus file, in this order: document text,
# partition name, label.
corpus_columns = ['preprocessed_news', 'partition', 'label']
corpus_df = combined_df[corpus_columns]
corpus_file_path = f'{data_dir}/corpus.tsv'
# Tab-separated output with neither the index column nor a header row.
corpus_df.to_csv(corpus_file_path, header=False, index=False, sep='\t')

## Extrai palavras únicas criando o vocabulário

In [20]:
# Collect every distinct whitespace-separated token found in the documents.
vocabulary = set()
# Missing documents are skipped: an empty text cell is read back from CSV as
# NaN, and updating the set with NaN would raise a TypeError.
# astype(str) keeps the .str accessor usable even if the column was not
# parsed as text.
for tokens in df['preprocessed_news'].dropna().astype(str).str.split():
    vocabulary.update(tokens)
vocabulary_file_path = f'{data_dir}/vocabulary.txt'
# One token per line, sorted so the file is deterministic across runs.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used; confirm it matches how the file is read back.
with open(vocabulary_file_path, 'w') as vocab_file:
    vocab_file.write('\n'.join(sorted(vocabulary)))

## Cria o arquivo metadata.json

In [21]:
# Partition sizes. The splits are shuffled, so the last index label of a
# split is an arbitrary original row number and says nothing about how many
# documents the split holds; the boundaries must come from the sizes.
n_train = len(train_df)
n_val = len(val_df)

metadata = {
    "total_documents": len(df),
    "vocabulary_length": len(vocabulary),
    "preprocessing-info": [],
    "labels": sorted(combined_df['label'].unique().tolist()),
    "total_labels": int(combined_df['label'].nunique()),
    # The corpus is assembled in train, val, test order, so these are the
    # 1-based positions of the last training / last validation document.
    "last-training-doc": n_train,
    "last-validation-doc": n_train + n_val,
}

metadata_file_path = f'{data_dir}/metadata.json'
# Pretty-printed JSON next to the corpus and vocabulary files.
with open(metadata_file_path, 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=4)

## Carrega conjunto de dados pré-processado

In [22]:
# OCTIS is imported in this cell rather than with the other imports at the top.
from octis.dataset.dataset import Dataset
# Create an empty OCTIS Dataset and fill it from the folder prepared above.
dataset = Dataset()
# NOTE(review): presumably reads the corpus.tsv, vocabulary.txt and metadata.json written to data_dir — confirm against the OCTIS docs.
dataset.load_custom_dataset_from_folder(data_dir)

In [23]:
# Sanity check that the object exists; the recorded output is only the default object repr, not the contents.
print(dataset)

<octis.dataset.dataset.Dataset object at 0x7bf31e9893a0>
