# Data Preprocessing

## Imports

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset

# Raise the pandas display limits so printed frames are more readable.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## Load Data

In [2]:
# Load the "OpenAssistant/oasst1" dataset through the `datasets` library.
ds = load_dataset("OpenAssistant/oasst1")

Found cached dataset parquet (C:/Users/timon/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Convert the "train" split into a pandas DataFrame for the steps below.
df = ds["train"].to_pandas()

## Preprocess Data

Remove deleted entries

In [4]:
# Count the rows flagged as deleted before they are filtered out.
int(df["deleted"].eq(True).sum())

1485

In [5]:
# Keep only the rows that are not flagged as deleted.
df = df.loc[df["deleted"].eq(False)]

Remove entries without labels

In [6]:
# Keep only the rows whose `labels` entry is present (not null).
df = df.loc[df["labels"].notna()]

Extract the humor label

In [7]:
def get_humor_value(x):
    """Return the 'humor' score stored in one label record.

    Parameters
    ----------
    x : mapping
        Record with two parallel sequences: ``x['name']`` (label names) and
        ``x['value']`` (label values). Either may be a NumPy array or a
        plain Python list.

    Returns
    -------
    The element of ``x['value']`` at the position of the first ``'humor'``
    entry in ``x['name']``, or ``0`` if no such entry exists.
    """
    # Coerce to an array before comparing: a plain list compared with a
    # string evaluates to a single False, which would hide an existing
    # 'humor' label and silently yield 0.
    names = np.asarray(x['name'])
    humor_positions = np.flatnonzero(names == 'humor')
    if humor_positions.size > 0:
        return x['value'][humor_positions[0]]

    # No 'humor' label in this record.
    return 0

# Add a 'humor' column holding the score extracted from each labels record.
df = df.assign(humor=df['labels'].apply(get_humor_value))

Drop unnecessary columns

In [8]:
# Discard the columns that are not needed afterwards, then renumber the
# rows from 0 so the index no longer reflects the filtered-out entries.
df = (
    df
    .drop(
        [
            'parent_id',
            'user_id',
            'created_date',
            'rank',
            'synthetic',
            'model_name',
            'tree_state',
            'deleted',
            'detoxify',
            'message_tree_id',
            'review_result',
            'emojis',
            'labels',
        ],
        axis="columns",
    )
    .reset_index(drop=True)
)

## Preview Data

In [9]:
# Display the first rows of the cleaned frame as a quick sanity check.
df.head()

Unnamed: 0,message_id,text,role,lang,review_count,humor
0,6ab24d72-0181-4594-a9cd-deaf170242fb,Can you write a short introduction about the r...,prompter,en,3,0.333333
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,"""Monopsony"" refers to a market structure where...",assistant,en,3,0.375
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,Now explain it to a dog,prompter,en,3,0.625
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,Monopsony is a market structure in which there...,assistant,en,3,0.166667
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,How can one fight back when a monospony had be...,prompter,en,3,0.0


In [10]:
# (rows, columns) of the cleaned frame.
df.shape

(82933, 6)

## Export

In [11]:
# Export the cleaned data (all languages) to CSV without the index column.
# The output directory is created first so the export also succeeds when
# ../data does not exist yet (e.g. on a fresh checkout).
Path('../data').mkdir(parents=True, exist_ok=True)
df.to_csv('../data/oasst1_train_cleaned.csv', index=False)

In [12]:
# Export only the rows whose `lang` is 'en' to a separate CSV, again without
# the index column. The output directory is created first so the export
# also succeeds when ../data does not exist yet (e.g. on a fresh checkout).
Path('../data').mkdir(parents=True, exist_ok=True)
df[df.lang == 'en'].to_csv('../data/oasst1_train_en_cleaned.csv', index=False)