# Data Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [3]:
ds = load_dataset("OpenAssistant/oasst1", split='validation')

Found cached dataset parquet (C:/Users/oliko/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [4]:
df = ds.to_pandas()

## Preprocess Data

Remove deleted entries

In [5]:
df = df[df.deleted == False]

Remove entries without labels

In [6]:
df = df[~df.labels.isnull()]

Keep only english data

In [7]:
df = df[df.lang == 'en']

Extract the humor label

In [8]:
def get_humor_value(x):
    index_of_humor = np.where(x['name'] == 'humor')[0]
    if len(index_of_humor) > 0:
        return x['value'][index_of_humor[0]]
    
    return 0

df['humor'] = df['labels'].apply(get_humor_value)

Drop unncessary columns

In [10]:
df = df.drop(columns=[
    'user_id',
    'created_date',
    'rank',
    'lang',
    'synthetic',
    'model_name',
    'tree_state',
    'deleted',
    'detoxify',
    'review_count',
    'message_tree_id',
    'review_result',
    'emojis',
    'labels'
]).reset_index(drop=True)

## Assistant

In [11]:
df_assistant = df[df.role == 'assistant']

In [12]:
df_assistant.shape

(1215, 5)

Get context

In [13]:
def find_context(value):
    try:
        parent = df[df['message_id'] == value].iloc[0]
        return find_context(parent['parent_id']) + parent['text']
    except:
        return ''

In [14]:
df_assistant['prompt'] = df_assistant['parent_id'].map(find_context)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_assistant['prompt'] = df_assistant['parent_id'].map(find_context)


In [15]:
df_assistant.rename(columns={"prompt": "context", "text": "target"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_assistant.rename(columns={"prompt": "context", "text": "target"}, inplace=True)


In [16]:
df_assistant = df_assistant.drop(columns=[
    'message_id',
    'parent_id',
    'role'
]).reset_index(drop=True)

In [17]:
df_assistant[df_assistant.humor < 0.3]

Unnamed: 0,target,humor,context
0,"As an open source alternative to ChatGPT, I do...",0.125000,What do you think about ChatGPT?
1,As a large language model trained on text from...,0.000000,What do you think about ChatGPT?As an open sou...
2,I think that AI should be free and kept away f...,0.000000,What do you think about ChatGPT?As an open sou...
5,Here are some differences between me and ChatG...,0.000000,What do you think about ChatGPT?ChatGPT was th...
6,"I, OpenAssistant, will be different because I ...",0.083333,What do you think about ChatGPT?ChatGPT was th...
...,...,...,...
1207,'Perfect information' in this context means th...,0.000000,I've recently started playing the turn-based s...
1208,Into the Breach is a game with almost perfect ...,0.250000,I've recently started playing the turn-based s...
1209,"In the context of the game ""Into the Breach"", ...",0.000000,I've recently started playing the turn-based s...
1210,"In the context of Into the Breach, ""perfect in...",0.000000,I've recently started playing the turn-based s...


## Preview Data

In [18]:
df_assistant.head()

Unnamed: 0,target,humor,context
0,"As an open source alternative to ChatGPT, I do...",0.125,What do you think about ChatGPT?
1,As a large language model trained on text from...,0.0,What do you think about ChatGPT?As an open sou...
2,I think that AI should be free and kept away f...,0.0,What do you think about ChatGPT?As an open sou...
3,ChatGPT is an artificial intelligence chatbot ...,0.416667,What do you think about ChatGPT?
4,"ChatGPT was the inspiration for my creation, b...",0.666667,What do you think about ChatGPT?


In [19]:
df_assistant.shape

(1215, 3)

## Export

In [20]:
df_assistant.to_csv('data/cleaned_with_context_test.csv', index=False)

In [None]:
df_assistant.to_csv('data/cleaned_with_context_test.csv', index=False)