# Data Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from tqdm import tqdm

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
ds = load_dataset("OpenAssistant/oasst1", split='train')

Found cached dataset parquet (/home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [3]:
df = ds.to_pandas()

In [4]:
df.head()

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
0,6ab24d72-0181-4594-a9cd-deaf170242fb,,c3fe8c76-fc30-4fa7-b7f8-c492f5967d18,2023-02-05T14:23:50.983374+00:00,Can you write a short introduction about the r...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00044308538781479, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
1,c8e83833-ecbc-44fe-b6db-735228c25a1c,6ab24d72-0181-4594-a9cd-deaf170242fb,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T13:50:44.657083+00:00,"""Monopsony"" refers to a market structure where...",assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.00026396565954200923, 'severe_t...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [3...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
2,6708c47f-05c9-4346-b3d2-40b2bd24fde4,c8e83833-ecbc-44fe-b6db-735228c25a1c,2c96e467-66f0-4be7-9693-bda51356a424,2023-02-06T18:48:49.391686+00:00,Now explain it to a dog,prompter,en,3,True,False,,False,,"{'toxicity': 0.03648477792739868, 'severe_toxi...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
3,343ee2d4-87ae-41fd-a768-bdd65959dc4a,6ab24d72-0181-4594-a9cd-deaf170242fb,49ddcb0d-6588-43bd-858d-19c577f12e7b,2023-02-06T13:37:56.044680+00:00,Monopsony is a market structure in which there...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0008866374846547842, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_labeling...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
4,18145bf4-37fd-4ac0-80f5-6108b5f2b365,343ee2d4-87ae-41fd-a768-bdd65959dc4a,e10e99a0-38ac-4b07-bf5d-4427696e4e0d,2023-02-06T18:52:51.428543+00:00,How can one fight back when a monospony had be...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0009362137061543763, 'severe_to...",6ab24d72-0181-4594-a9cd-deaf170242fb,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."


## Preprocess Data

Remove deleted entries

In [5]:
df = df[df.deleted == False]

Remove entries without labels

In [6]:
df = df[~df.labels.isnull()]

Extract the humor label

In [7]:
def get_humor_value(x):
    index_of_humor = np.where(x['name'] == 'humor')[0]
    if len(index_of_humor) > 0:
        return x['value'][index_of_humor[0]]
    
    return 0

df['humor'] = df['labels'].apply(get_humor_value)

Drop unncessary columns

In [8]:
df = df.drop(columns=[
    'user_id',
    'created_date',
    'rank',
    'lang',
    'synthetic',
    'model_name',
    'tree_state',
    'deleted',
    'detoxify',
    'review_count',
    'review_result',
    'emojis',
    'labels'
]).reset_index(drop=True)

## Assistant

In [9]:
df_assistant = df[df.role == 'assistant']

In [10]:
df_assistant.shape

(51802, 6)

Get context

In [11]:
def find_context(value):
    global count_no_parents
    try:
        parent = df[df['message_id'] == value].iloc[0]
        if parent['parent_id'] == None:
            return " " + parent['text']
        else:
            return find_context(parent['parent_id']) + " " + parent['text']
    except:
        # Parent id available, but no parent -> end of tree
        return ' '

In [12]:
df_assistant['prompt'] = df_assistant['parent_id'].map(find_context)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_assistant['prompt'] = df_assistant['parent_id'].map(find_context)


In [13]:
df_assistant.rename(columns={"prompt": "context", "text": "target"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_assistant.rename(columns={"prompt": "context", "text": "target"}, inplace=True)


In [14]:
df_assistant = df_assistant.drop(columns=[
    'message_id',
    'parent_id',
    'role'
]).reset_index(drop=True)

## Preview Data

In [15]:
df_assistant.head()

Unnamed: 0,target,message_tree_id,humor,context
0,"""Monopsony"" refers to a market structure where...",6ab24d72-0181-4594-a9cd-deaf170242fb,0.375,Can you write a short introduction about the ...
1,Monopsony is a market structure in which there...,6ab24d72-0181-4594-a9cd-deaf170242fb,0.166667,Can you write a short introduction about the ...
2,Monopsony refers to a market structure where t...,6ab24d72-0181-4594-a9cd-deaf170242fb,0.0,Can you write a short introduction about the ...
3,Here are some potential regulatory options to ...,6ab24d72-0181-4594-a9cd-deaf170242fb,0.0,Can you write a short introduction about the ...
4,Regulatory intervention can be used to address...,6ab24d72-0181-4594-a9cd-deaf170242fb,0.0,Can you write a short introduction about the ...


In [16]:
df_assistant.shape

(51802, 4)

## Export

In [17]:
df_assistant.to_csv('data/cleaned_with_context.csv', index=False)