In [40]:
# instalar todas as libs do requirements.txt
%pip freeze -r requirements.txt

[0maiohttp==3.11.16
aiosignal==1.3.2
[0mannotated-types==0.7.0
anyio==4.9.0
[0masttokens==3.0.0
[0mattrs==25.3.0
[0mcertifi==2025.1.31
[0mcharset-normalizer==3.4.1
[0mcomm==0.2.2
[0mdatasets==2.15.0
debugpy==1.8.14
decorator==5.2.1
[0mdill==0.3.7
distro==1.9.0
[0mexecuting==2.2.0
[0mfilelock==3.18.0
[0mfrozenlist==1.5.0
fsspec==2023.10.0
[0mh11==0.14.0
[0mhttpcore==1.0.8
[0mhttpx==0.28.1
[0mhuggingface-hub==0.30.2
[0midna==3.10
[0mipykernel==6.29.5
ipython==9.1.0
[0mjedi==0.19.2
[0mjiter==0.9.0
[0mjupyter_client==8.6.3
jupyter_core==5.7.2
[0mmatplotlib-inline==0.1.7
[0mmultidict==6.4.3
multiprocess==0.70.15
[0mnest-asyncio==1.6.0
[0mnumpy==2.2.4
[0mopenai==1.74.0
[0mpackaging==24.2
pandas==2.2.3
[0mparso==0.8.4
[0mpexpect==4.9.0
[0mplatformdirs==4.3.7
[0mprompt_toolkit==3.0.51
[0mpsutil==7.0.0
ptyprocess==0.7.0
[0mpure_eval==0.2.3
[0mpyarrow==19.0.1
pyarrow-hotfix==0.6
[0mpydantic==2.11.3
pydantic_core==2.33.1
[0mPygments==2.19.1
[0mpython-dateutil=

In [41]:
# Install the Hugging Face `datasets` library, pinned to version 2.15.0
%pip install datasets==2.15.0

Note: you may need to restart the kernel to use updated packages.


In [42]:
import json
from datasets import load_dataset

In [43]:
# Load only the first 10% of the 'train' split of the Portuguese hate-speech dataset
datasets = load_dataset('hate-speech-portuguese/hate_speech_portuguese',split='train[:10%]')

In [30]:
# Print the dataset (shows its features and number of rows)
print(datasets)

Dataset({
    features: ['text', 'label', 'hatespeech_G1', 'annotator_G1', 'hatespeech_G2', 'annotator_G2', 'hatespeech_G3', 'annotator_G3'],
    num_rows: 567
})


In [44]:
# Drop the per-annotator columns so that only 'text' and 'label' remain
annotation_columns = [
    'hatespeech_G1', 'annotator_G1',
    'hatespeech_G2', 'annotator_G2',
    'hatespeech_G3', 'annotator_G3',
]
datasets = datasets.remove_columns(annotation_columns)

print(datasets)

Dataset({
    features: ['text', 'label'],
    num_rows: 567
})


In [45]:
# Hold out 20% of the examples as the 'test' split; the fixed seed makes the
# shuffle (and therefore the exported train/validation files) reproducible
datasets = datasets.train_test_split(test_size=0.2, seed=42)

In [46]:
# Remove the newline characters ('\n') from the messages

def removeN(example):
    """Delete every newline character from the example's ``text`` field.

    The example is modified in place and the same object is returned.
    """
    pieces = example['text'].split('\n')
    example['text'] = ''.join(pieces)
    return example

# Apply removeN to every example of the dataset

datasets = datasets.map(removeN)

datasets

Map:   0%|          | 0/453 [00:00<?, ? examples/s]

Map: 100%|██████████| 453/453 [00:00<00:00, 7419.60 examples/s]
Map: 100%|██████████| 114/114 [00:00<00:00, 6020.30 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 453
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 114
    })
})

In [47]:
# Label encoding:
#   0 -> No Hate Speech
#   1 -> Hate Speech

def labelChange(example):
    """Add a ``label_text`` field holding the readable name of ``label``.

    A label equal to 0 maps to 'No Hate Speech'; any other value maps to
    'Hate Speech'. The example is modified in place and returned.
    """
    if example['label'] == 0:
        readable = 'No Hate Speech'
    else:
        readable = 'Hate Speech'
    example['label_text'] = readable
    return example

In [48]:
# Apply labelChange to every example, then drop the numeric 'label' column
datasets = datasets.map(labelChange).remove_columns(['label'])

Map: 100%|██████████| 453/453 [00:00<00:00, 12201.83 examples/s]
Map: 100%|██████████| 114/114 [00:00<00:00, 7099.07 examples/s]


In [49]:
# Convert the dataset to the format expected by the model

def dataset_to_json(dataset, file_name):
    """Write ``dataset`` to ``file_name`` as JSON Lines, one conversation per line.

    Each line is a JSON object with a single ``messages`` list made of three
    entries: a fixed system instruction, the example's ``text`` as the user
    message, and the example's ``label_text`` as the assistant message.

    Args:
        dataset: iterable of mappings that provide ``'text'`` and ``'label_text'``.
        file_name: path of the output file; it is overwritten if it exists.
    """
    system_message = {"role": "system", "content":"Seu trabalho é classificar os comentários dos usuários em Hate Speech ou No Hate Speech"}
    with open(file_name, 'w', encoding='utf-8') as out:
        for row in dataset:
            conversation = [
                system_message,
                {"role": "user", "content": row['text']},
                {"role": "assistant", "content": row['label_text']},
            ]
            # ensure_ascii=False keeps accented characters readable in the file
            line = json.dumps({"messages": conversation}, ensure_ascii=False)
            out.write(line + "\n")

In [37]:
# Build the training JSONL file from the 'train' split
dataset_to_json(datasets['train'], 'train.jsonl')

In [50]:
# Build the validation JSONL file from the 'test' split
dataset_to_json(datasets['test'], 'validation.jsonl')

In [51]:
# fine-tuning job
from openai import OpenAI
import os

In [None]:
# Never store the real key in the notebook. Reuse a key already exported in the
# environment; otherwise ask for it interactively without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [None]:
# Create the OpenAI client. No key is passed explicitly — presumably the SDK reads
# the OPENAI_API_KEY environment variable set above (confirm against the SDK docs).
client = OpenAI()

In [None]:
# Upload the training file

# The context manager closes the file handle once the upload call returns
with open('train.jsonl', 'rb') as train_fh:
    trainingFile = client.files.create(
        file=train_fh,
        purpose='fine-tune'
    )

In [None]:
# Upload the validation file

# The context manager closes the file handle once the upload call returns
with open('validation.jsonl', 'rb') as validation_fh:
    validationFile = client.files.create(
        file=validation_fh,
        purpose='fine-tune'
    )

In [None]:
# Start the fine-tuning job with the uploaded training and validation files.
# The client exposes this as `fine_tuning.jobs` (not `fine_tunning`), and the
# objects returned by `client.files.create` expose their id as the `.id`
# attribute rather than as a dictionary key.
fineTuningJob = client.fine_tuning.jobs.create(
    training_file=trainingFile.id,
    validation_file=validationFile.id,
    model="gpt-3.5-turbo"
)

# Display the created job (id, status) as the cell output
fineTuningJob