# Data Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset

# Relax pandas' display limits so long and wide frames are shown more fully.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

## Load Data

In [2]:
# Load the OpenAssistant oasst1 dataset from the Hugging Face hub; no split is
# requested here, so the individual splits are selected by name afterwards.
ds = load_dataset(path="OpenAssistant/oasst1")

Found cached dataset parquet (C:/Users/timon/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Only the validation split is processed in this notebook; convert it to a
# pandas DataFrame for the cleaning steps that follow.
validation_split = ds["validation"]
df = validation_split.to_pandas()

## Preprocess Data

Remove deleted entries

In [4]:
# Keep only the rows whose `deleted` flag compares equal to False.
df = df.loc[df["deleted"] == False]

Remove entries without labels

In [5]:
# Keep only the rows that actually carry a labels record (non-null `labels`).
df = df[df["labels"].notna()]

Extract the humor label

In [6]:
def get_humor_value(x):
    """Return the value of the ``'humor'`` label from one labels record.

    Parameters
    ----------
    x : mapping
        A labels record with two parallel sequences: ``x['name']`` holds the
        label names and ``x['value']`` holds the value for each name. The
        sequences may be NumPy arrays or plain Python lists.

    Returns
    -------
    The entry of ``x['value']`` at the position of the first ``'humor'``
    entry in ``x['name']``, or ``0`` when the record has no ``'humor'`` label.
    """
    # Coerce the names to an object array so that the comparison below is
    # always element-wise. Comparing a plain list with a string yields a
    # single False, which would make every record look as if it had no
    # humor label.
    names = np.asarray(x['name'], dtype=object)
    humor_positions = np.flatnonzero(names == 'humor')

    if humor_positions.size > 0:
        return x['value'][humor_positions[0]]

    return 0

# Derive the `humor` column from each row's labels record. `assign` returns a
# new frame instead of writing into `df`, which at this point is the result of
# boolean filtering and could otherwise trigger pandas' SettingWithCopyWarning.
df = df.assign(humor=df['labels'].apply(get_humor_value))

Drop unnecessary columns

In [7]:
# Discard the columns that are not used after this point, then renumber the
# rows from 0 since the earlier filtering may have left gaps in the index.
unused_columns = [
    'parent_id',
    'user_id',
    'created_date',
    'rank',
    'synthetic',
    'model_name',
    'tree_state',
    'deleted',
    'detoxify',
    'message_tree_id',
    'review_result',
    'emojis',
    'labels',
]
df = df.drop(columns=unused_columns).reset_index(drop=True)

## Preview Data

In [8]:
# Show the first five rows of the cleaned frame as a sanity check.
df.head(n=5)

Unnamed: 0,message_id,text,role,lang,review_count,humor
0,68489e5c-978f-4ad7-a849-39a741fb5ae7,"Напиши функцию на языке swift, которая сортиру...",prompter,ru,3,0.0
1,a38b3d1c-256f-47c5-ab68-c9ab766bad84,"Вот функция, которая сортирует массив целых чи...",assistant,ru,3,0.0
2,9cf52b01-0582-46aa-9ad2-70827dce87ed,"Вот функция на языке Swift, которая сортирует ...",assistant,ru,3,0.0
3,9685d79e-16d7-4f1b-b68f-a1ff73e87bef,func sortAndPrintArray(array: [Int]) {\n le...,assistant,ru,3,0.0
4,afee0474-c84a-4ae2-a9c9-8d575256312e,"Inventa un monstruo altamente complejo, descrí...",prompter,es,3,0.25


In [9]:
# Dimensions of the cleaned frame as (number of rows, number of columns).
df.shape

(4333, 6)

## Export

In [10]:
# Write the cleaned frame (all languages) to CSV without the row index.
# NOTE(review): the rows come from the "validation" split, yet the file name
# says "test" - confirm that this naming is intended.
df.to_csv(
    path_or_buf='../data/oasst1_test_cleaned.csv',
    index=False,
)

In [11]:
# Write only the rows whose language code is 'en' to a separate CSV, again
# without the row index.
english_rows = df.loc[df['lang'] == 'en']
english_rows.to_csv('../data/oasst1_test_en_cleaned.csv', index=False)