# Data Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from tqdm import tqdm

# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
ds = load_dataset("OpenAssistant/oasst1", split='validation')

Found cached dataset parquet (/home/jovyan/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [3]:
df = ds.to_pandas()

In [4]:
df.head()

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels
0,68489e5c-978f-4ad7-a849-39a741fb5ae7,,845cba79-9d16-4eb8-ab63-982fe8403c62,2023-02-08T11:55:51.852473+00:00,"Напиши функцию на языке swift, которая сортиру...",prompter,ru,3,True,False,,False,,"{'toxicity': 0.0025964330416172743, 'severe_to...",68489e5c-978f-4ad7-a849-39a741fb5ae7,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."
1,a38b3d1c-256f-47c5-ab68-c9ab766bad84,68489e5c-978f-4ad7-a849-39a741fb5ae7,abbc8a37-2db7-4af5-9df1-b0582d2e3d93,2023-03-16T22:29:28.829265+00:00,"Вот функция, которая сортирует массив целых чи...",assistant,ru,3,True,False,0.0,False,,"{'toxicity': 0.0003634887107182294, 'severe_to...",68489e5c-978f-4ad7-a849-39a741fb5ae7,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
2,9cf52b01-0582-46aa-9ad2-70827dce87ed,68489e5c-978f-4ad7-a849-39a741fb5ae7,a3e19dd4-34cd-48f0-8912-a2003b22c334,2023-02-11T09:57:39.207727+00:00,"Вот функция на языке Swift, которая сортирует ...",assistant,ru,3,True,False,1.0,False,,"{'toxicity': 0.0004027303075417876, 'severe_to...",68489e5c-978f-4ad7-a849-39a741fb5ae7,ready_for_export,"{'name': ['+1'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch..."
3,9685d79e-16d7-4f1b-b68f-a1ff73e87bef,68489e5c-978f-4ad7-a849-39a741fb5ae7,97894afc-2204-4c7b-9b91-f1c71c10f032,2023-02-09T21:25:39.116134+00:00,func sortAndPrintArray(array: [Int]) {\n le...,assistant,ru,3,True,False,2.0,False,,"{'toxicity': 0.0003712645557243377, 'severe_to...",68489e5c-978f-4ad7-a849-39a741fb5ae7,ready_for_export,"{'name': ['-1', '_skip_labeling'], 'count': [1...","{'name': ['spam', 'fails_task', 'lang_mismatch..."
4,afee0474-c84a-4ae2-a9c9-8d575256312e,,0a8f6864-6cda-4cf5-bc07-346470d6d493,2023-02-07T00:54:42.098108+00:00,"Inventa un monstruo altamente complejo, descrí...",prompter,es,3,True,False,,False,,"{'toxicity': 0.029511522501707077, 'severe_tox...",afee0474-c84a-4ae2-a9c9-8d575256312e,ready_for_export,"{'name': ['+1', '-1', '_skip_reply'], 'count':...","{'name': ['spam', 'lang_mismatch', 'pii', 'not..."


## Preprocess Data

Remove deleted entries

In [5]:
df = df[df.deleted == False]

Remove entries without labels

In [6]:
df = df[~df.labels.isnull()]

Extract the humor label

In [7]:
def get_humor_value(x):
    index_of_humor = np.where(x['name'] == 'humor')[0]
    if len(index_of_humor) > 0:
        return x['value'][index_of_humor[0]]
    
    return 0

df['humor'] = df['labels'].apply(get_humor_value)

Drop unncessary columns

In [8]:
df = df.drop(columns=[
    'user_id',
    'created_date',
    'rank',
    'lang',
    'synthetic',
    'model_name',
    'tree_state',
    'deleted',
    'detoxify',
    'review_count',
    'review_result',
    'emojis',
    'labels'
]).reset_index(drop=True)

## Assistant

In [9]:
df_assistant = df[df.role == 'assistant']

In [10]:
df_assistant.shape

(2704, 6)

Get context

In [11]:
def find_context(value):
    global count_no_parents
    try:
        parent = df[df['message_id'] == value].iloc[0]
        if parent['parent_id'] == None:
            return " " + parent['text']
        else:
            return find_context(parent['parent_id']) + " " + parent['text']
    except:
        # Parent id available, but no parent -> end of tree
        return ' '

In [12]:
df_assistant['prompt'] = df_assistant['parent_id'].map(find_context)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_assistant['prompt'] = df_assistant['parent_id'].map(find_context)


In [13]:
df_assistant.rename(columns={"prompt": "context", "text": "target"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_assistant.rename(columns={"prompt": "context", "text": "target"}, inplace=True)


In [14]:
df_assistant = df_assistant.drop(columns=[
    'message_id',
    'parent_id',
    'role'
]).reset_index(drop=True)

## Preview Data

In [15]:
df_assistant.head()

Unnamed: 0,target,message_tree_id,humor,context
0,"Вот функция, которая сортирует массив целых чи...",68489e5c-978f-4ad7-a849-39a741fb5ae7,0.0,"Напиши функцию на языке swift, которая сортир..."
1,"Вот функция на языке Swift, которая сортирует ...",68489e5c-978f-4ad7-a849-39a741fb5ae7,0.0,"Напиши функцию на языке swift, которая сортир..."
2,func sortAndPrintArray(array: [Int]) {\n le...,68489e5c-978f-4ad7-a849-39a741fb5ae7,0.0,"Напиши функцию на языке swift, которая сортир..."
3,Aquí te presento mi creación de un monstruo al...,afee0474-c84a-4ae2-a9c9-8d575256312e,0.125,"Inventa un monstruo altamente complejo, descr..."
4,"El monstruo que inventaré se llama ""Cyclopsaur...",afee0474-c84a-4ae2-a9c9-8d575256312e,0.25,"Inventa un monstruo altamente complejo, descr..."


In [16]:
df_assistant.shape

(2704, 4)

## Export

In [17]:
df_assistant.to_csv('data/cleaned_with_context_test.csv', index=False)