# Data Preprocessing

## Imports

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset

# Relax pandas' display limits so printed frames are easier to read.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## Load Data

In [2]:
# Load the "OpenAssistant/oasst1" dataset through `datasets.load_dataset`.
ds = load_dataset("OpenAssistant/oasst1")

Found cached dataset parquet (C:/Users/timon/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Convert the "train" split into a pandas DataFrame for the steps below.
df = ds["train"].to_pandas()

## Preprocess Data

Remove deleted entries

In [4]:
# Keep only the rows whose `deleted` flag compares equal to False.
df = df.loc[df["deleted"].eq(False)]

Remove entries without labels

In [5]:
# Discard the rows that have no `labels` entry.
df = df.loc[df["labels"].notna()]

Extract the humor label

In [6]:
def get_humor_value(x):
    """Return the score stored under the 'humor' label of one labels entry.

    Parameters
    ----------
    x : mapping
        Must provide ``x['name']`` (a sequence of label names) and
        ``x['value']`` (a sequence indexable by the position of a name).
        Both NumPy arrays and plain lists/tuples are accepted.

    Returns
    -------
    The element of ``x['value']`` at the first position where the name
    equals ``'humor'``, or ``0`` when no such name is present.
    """
    # Coerce to an ndarray so `==` compares element-wise. On a plain list,
    # `names == 'humor'` is a single False and the label is never found.
    names = np.asarray(x['name'])
    humor_positions = np.where(names == 'humor')[0]
    if len(humor_positions) > 0:
        return x['value'][humor_positions[0]]

    # No humor label on this entry: fall back to 0.
    return 0

# Derive the `humor` column by running get_humor_value on every labels entry.
df['humor'] = df['labels'].map(get_humor_value)

Drop unnecessary columns

In [7]:
# Remove the columns that are not kept in the cleaned export, then renumber
# the rows from 0 so the index no longer reflects the filtered-out rows.
df = (
    df
    .drop(
        labels=[
            'parent_id',
            'user_id',
            'created_date',
            'rank',
            'synthetic',
            'model_name',
            'tree_state',
            'deleted',
            'detoxify',
            'review_count',
            'message_tree_id',
            'review_result',
            'emojis',
            'labels',
        ],
        axis="columns",
    )
    .reset_index(drop=True)
)

## Preview Data

In [8]:
# Show the first rows of the cleaned frame as a quick sanity check.
df.head()

Unnamed: 0,message_id,text,role,lang,humor
0,6ab24d72-0181-4594-a9cd-deaf170242fb,Can you write a short introduction about the r...,prompter,en,0.333333
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,"""Monopsony"" refers to a market structure where...",assistant,en,0.375
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,Now explain it to a dog,prompter,en,0.625
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,Monopsony is a market structure in which there...,assistant,en,0.166667
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,How can one fight back when a monospony had be...,prompter,en,0.0


In [9]:
# Report the (rows, columns) size of the cleaned frame.
df.shape

(82933, 5)

## Export

In [14]:
# Write the cleaned frame to CSV without the positional index.
# The target directory is created first because `to_csv` raises OSError when
# the parent directory does not exist (e.g. on a fresh checkout).
output_path = Path('./data/oasst1_train_cleaned.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

In [15]:
# Read the exported CSV back in to confirm the file was written and parses.
pd.read_csv('./data/oasst1_train_cleaned.csv')

Unnamed: 0,message_id,text,role,lang,humor
0,6ab24d72-0181-4594-a9cd-deaf170242fb,Can you write a short introduction about the r...,prompter,en,0.333333
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,"""Monopsony"" refers to a market structure where...",assistant,en,0.375000
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,Now explain it to a dog,prompter,en,0.625000
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,Monopsony is a market structure in which there...,assistant,en,0.166667
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,How can one fight back when a monospony had be...,prompter,en,0.000000
...,...,...,...,...,...
82928,47ba67f2-f711-4feb-91ff-0a0ffe556fe5,James Potter eta Lily Potter ziren Harry Potte...,assistant,eu,0.083333
82929,d3946880-4402-42a8-8a9e-6dab5efa787f,Nork jarri zion ahotsa Dragoi Bolako Vegetari ...,prompter,eu,0.333333
82930,4e8a349d-7781-42ae-8868-1ecf5df4c2cb,Dragoi bolaren aurreneko denboraldian ez da Ve...,assistant,eu,0.500000
82931,ca45d4bd-2da7-4d56-beec-bf9f772e8426,Dragoi Bolako Vegetari euskarazko ahotsa jarri...,assistant,eu,0.000000
