# Data Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

## Load Data

In [2]:
ds = load_dataset("OpenAssistant/oasst1", split='train')

Found cached dataset parquet (C:/Users/micha/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [3]:
df = ds.to_pandas()

## Preprocess Data

Remove deleted entries

In [4]:
df = df[df.deleted == False]

Remove entries without labels

In [5]:
df = df[~df.labels.isnull()]

Keep only english data

In [6]:
df = df[df.lang == 'en']

Extract the humor label

In [7]:
def get_humor_value(x):
    index_of_humor = np.where(x['name'] == 'humor')[0]
    if len(index_of_humor) > 0:
        return x['value'][index_of_humor[0]]
    
    return 0

df['humor'] = df['labels'].apply(get_humor_value)

Drop unncessary columns

In [9]:
df = df.drop(columns=[
    'user_id',
    'created_date',
    'rank',
    'lang',
    'synthetic',
    'model_name',
    'tree_state',
    'deleted',
    'detoxify',
    'review_count',
    'message_tree_id',
    'review_result',
    'emojis',
    'labels'
]).reset_index(drop=True)

## Assistant

In [10]:
df_assistant = df[df.role == 'assistant']

In [11]:
df_assistant.shape

(23499, 5)

Get context

In [12]:
def find_context(value):
    try:
        parent = df[df['message_id'] == value].iloc[0]
        return find_context(parent['parent_id']) + " " + parent['text']
    except:
        return ''

In [13]:
df_assistant['prompt'] = df_assistant['parent_id'].map(find_context)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_assistant['prompt'] = df_assistant['parent_id'].map(find_context)


In [14]:
df_assistant.rename(columns={"prompt": "context", "text": "target"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_assistant.rename(columns={"prompt": "context", "text": "target"}, inplace=True)


In [15]:
df_assistant = df_assistant.drop(columns=[
    'message_id',
    'parent_id',
    'role'
]).reset_index(drop=True)

In [16]:
df_assistant[df_assistant.humor < 0.3]

Unnamed: 0,target,humor,context
1,Monopsony is a market structure in which there...,0.166667,Can you write a short introduction about the ...
2,Monopsony refers to a market structure where t...,0.000000,Can you write a short introduction about the ...
3,Here are some potential regulatory options to ...,0.000000,Can you write a short introduction about the ...
4,Regulatory intervention can be used to address...,0.000000,Can you write a short introduction about the ...
5,"Yes, that's correct. Keeping the code for the ...",0.125000,Can you write a short introduction about the ...
...,...,...,...
23493,The most straightforward way to increase the v...,0.000000,What are the main factors that contribute to ...
23494,As a single person your influence on things li...,0.000000,What are the main factors that contribute to ...
23496,Not executing an ankle-brachial index (ABI) ex...,0.000000,What are the risks when you do not carry out ...
23497,There are several risks associated with not pe...,0.000000,What are the risks when you do not carry out ...


## Preview Data

In [17]:
df_assistant.head()

Unnamed: 0,target,humor,context
0,"""Monopsony"" refers to a market structure where...",0.375,Can you write a short introduction about the ...
1,Monopsony is a market structure in which there...,0.166667,Can you write a short introduction about the ...
2,Monopsony refers to a market structure where t...,0.0,Can you write a short introduction about the ...
3,Here are some potential regulatory options to ...,0.0,Can you write a short introduction about the ...
4,Regulatory intervention can be used to address...,0.0,Can you write a short introduction about the ...


In [18]:
df_assistant.shape

(23499, 3)

## Export

In [19]:
df_assistant.to_csv('data/cleaned_with_context.csv', index=False)