# Getting started with OpenAssistant OASST1 data

- https://huggingface.co/datasets/OpenAssistant/oasst1

This notebook is based on the Open-Assistant project's getting-started notebook: https://github.com/LAION-AI/Open-Assistant/blob/main/notebooks/openassistant-oasst1/getting-started.ipynb

## Imports

In [1]:
import pandas as pd
from datasets import load_dataset
from treelib import Tree

# widen the pandas display limits so long / wide frames are easier to read
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)


def add_tree_level(df):
    """Return ``df`` with a ``tree_level`` column holding each message's depth.

    A message without a parent (``parent_id`` is ``None``/NaN) is level 0, a
    message whose parent is the tree root (``parent_id == message_tree_id``) is
    level 1, and any other message is one level below its parent.

    The levels are resolved by walking up the parent chain, so the rows may be
    in any order (parents do not have to appear before their children).

    Args:
        df: DataFrame with ``message_id``, ``parent_id`` and ``message_tree_id``
            columns.

    Returns:
        ``df`` itself when it already has a ``tree_level`` column; otherwise a
        new DataFrame, the result of merging ``df`` with the computed levels on
        ``message_id`` (the merge produces a fresh default index).

    Raises:
        KeyError: if a message refers to a parent that is not present in ``df``
            (and that parent is not the tree root).
        ValueError: if the parent links form a cycle.
    """
    # if tree level already exists, return df unchanged
    if "tree_level" in df.columns:
        return df

    # message_id -> parent_id / message_tree_id lookups, so that a level can be
    # resolved no matter where the parent row sits in the frame
    parent_of = dict(zip(df["message_id"], df["parent_id"]))
    tree_of = dict(zip(df["message_id"], df["message_tree_id"]))

    tree_level_map = {}

    for message_id in df["message_id"]:
        # messages on the way up whose level is not known yet
        pending = []
        current = message_id

        # climb towards the root until we hit a message with a known level
        while current not in tree_level_map:
            parent_id = parent_of[current]

            # no parent (None or NaN) -> root message
            if pd.isna(parent_id):
                tree_level_map[current] = 0
            # parent is the tree root -> direct reply to the root message
            elif parent_id == tree_of[current]:
                tree_level_map[current] = 1
            # otherwise the level depends on the parent, resolve that first
            else:
                pending.append(current)
                # more pending hops than messages means we are going in circles
                if len(pending) > len(parent_of):
                    raise ValueError(f"cycle detected in parent links of message {message_id!r}")
                current = parent_id

        # walk back down, each pending message is one level below its parent
        level = tree_level_map[current]
        for pending_id in reversed(pending):
            level += 1
            tree_level_map[pending_id] = level

    # create a df from the tree_level_map and merge it with the original df
    df_tree_level_map = (
        pd.DataFrame.from_dict(tree_level_map, orient="index", columns=["tree_level"])
        .reset_index()
        .rename(columns={"index": "message_id"})
    )

    return df.merge(df_tree_level_map, on="message_id")

## Load Data

In [2]:
# load the OpenAssistant OASST1 dataset via huggingface datasets
ds = load_dataset("OpenAssistant/oasst1")
# printing the returned object lists the available splits with their features and row counts
print(ds)

Found cached dataset parquet (C:/Users/timon/.cache/huggingface/datasets/OpenAssistant___parquet/OpenAssistant--oasst1-2960c57d7e52ab15/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
})


## Create Pandas Dataframe

In [3]:
# let's convert the train split to a pandas DataFrame so we can explore it with pandas
df = ds["train"].to_pandas()

In [4]:
# look at the df info: every column with its non-null count and dtype, plus memory usage
df.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84437 entries, 0 to 84436
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       84437 non-null  object 
 1   parent_id        74591 non-null  object 
 2   user_id          84437 non-null  object 
 3   created_date     84437 non-null  object 
 4   text             84437 non-null  object 
 5   role             84437 non-null  object 
 6   lang             84437 non-null  object 
 7   review_count     84437 non-null  int32  
 8   review_result    83732 non-null  object 
 9   deleted          84437 non-null  bool   
 10  rank             48730 non-null  float64
 11  synthetic        84437 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         72297 non-null  object 
 14  message_tree_id  84437 non-null  object 
 15  tree_state       84437 non-null  object 
 16  emojis           71496 non-null  object 
 17  labels      

In [5]:
# show one randomly picked row as a {row_index: {column: value}} dict, which is
# easier to read than a single very wide table row
df.sample(1).T.to_dict()

{39976: {'message_id': 'b7475e01-bafb-464d-9ad4-b8815ca12bfe',
  'parent_id': '336c70b3-1a4b-4947-84b4-ff498401ffa8',
  'user_id': 'c95e790f-31f0-484e-8e2b-e4dada729949',
  'created_date': '2023-02-08T09:02:25.842465+00:00',
  'text': 'write code by python for password generator',
  'role': 'prompter',
  'lang': 'en',
  'review_count': 3,
  'review_result': True,
  'deleted': False,
  'rank': nan,
  'synthetic': False,
  'model_name': None,
  'detoxify': {'toxicity': 0.0002032520860666409,
   'severe_toxicity': 0.00017065198335330933,
   'obscene': 0.0008090694900602102,
   'identity_attack': 0.00030782169778831303,
   'insult': 0.0006429299246519804,
   'threat': 0.0001306796766584739,
   'sexual_explicit': 8.379059727303684e-05},
  'message_tree_id': 'fc2cdb27-8ae7-476d-ac34-e0a46e7e5fdc',
  'tree_state': 'ready_for_export',
  'emojis': None,
  'labels': {'name': array(['spam', 'lang_mismatch', 'pii', 'not_appropriate', 'hate_speech',
          'sexual_content', 'quality', 'toxicity'

## Random Message Tree

In [6]:
# let's grab a random message tree; the fixed random_state makes the pick (and
# every cell below that depends on it) reproducible on a fresh run.
# Change or drop random_state to explore a different tree.
message_tree_id = df["message_tree_id"].sample(1, random_state=42).values[0]
print(message_tree_id)

d3675c3c-d39e-4928-8d21-222d75963b4d


In [7]:
# collect every message that belongs to the selected tree, oldest first
df_message_tree = df.loc[df["message_tree_id"] == message_tree_id].sort_values("created_date")

# annotate each message with its depth in the conversation tree
df_message_tree = add_tree_level(df_message_tree)

df_message_tree

Unnamed: 0,message_id,parent_id,user_id,created_date,text,role,lang,review_count,review_result,deleted,rank,synthetic,model_name,detoxify,message_tree_id,tree_state,emojis,labels,tree_level
0,d3675c3c-d39e-4928-8d21-222d75963b4d,,4b4928ba-870f-4fe5-825a-4317ddebe4fe,2023-02-17T08:34:51.125013+00:00,make this statement better : Easy accessibilit...,prompter,en,3,True,False,,False,,"{'toxicity': 0.00034451059764251113, 'severe_t...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,"{'name': ['+1', '_skip_reply'], 'count': [4, 2]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",0
1,f6837ead-c18a-44f0-aeaf-68caa7112842,d3675c3c-d39e-4928-8d21-222d75963b4d,d3536142-bdfa-4de4-a443-b63c310e9613,2023-04-07T23:24:03.005771+00:00,"To improve public health, the government shoul...",assistant,en,3,True,False,2.0,False,,"{'toxicity': 0.0003368183970451355, 'severe_to...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,"{'name': ['+1', '_skip_reply'], 'count': [1, 1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
2,1de96d1b-9dff-45db-89a2-e01ebe3f0572,d3675c3c-d39e-4928-8d21-222d75963b4d,3c46c1f7-50f1-4f07-8886-a54b1f841cb1,2023-04-07T23:41:58.765428+00:00,Here is a revised version of the sentence that...,assistant,en,4,True,False,1.0,False,,"{'toxicity': 0.00034181828959845006, 'severe_t...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [2...","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
3,1e97f980-9e31-4cd7-b89c-d689b79f62a7,d3675c3c-d39e-4928-8d21-222d75963b4d,c2c7eef0-ae98-4e6d-9af7-8b64dd548f5e,2023-04-08T00:04:22.429362+00:00,Here's a few variations on that statement that...,assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.0002850019955076277, 'severe_to...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,"{'name': ['+1', '_skip_reply'], 'count': [3, 3]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",1
4,99896fca-cf92-4717-b466-ec1dd0f33f49,1e97f980-9e31-4cd7-b89c-d689b79f62a7,85bac698-17f0-4f11-ad4a-32d0b0b1bfcb,2023-04-08T02:48:33.188951+00:00,What is the logic behind easy accessibility to...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0003718062653206289, 'severe_to...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,"{'name': ['+1', '_skip_reply', '_skip_ranking'...","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
5,92757fff-9748-4d39-9060-0e4d234f85bb,1de96d1b-9dff-45db-89a2-e01ebe3f0572,d421dda2-66dc-46e3-8837-bf3e743287dd,2023-04-08T08:13:23.464493+00:00,What are some ways I might rephrase the statem...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0003339180548209697, 'severe_to...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,,"{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
6,b23e8afd-6fbd-47e3-ab4e-c5b13014287c,f6837ead-c18a-44f0-aeaf-68caa7112842,85d3a95c-018d-4970-a410-d90f90d3b908,2023-04-08T08:53:53.327880+00:00,Now write it as a short paragraph of around 50...,prompter,en,3,True,False,,False,,"{'toxicity': 0.0006392703508026898, 'severe_to...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,"{'name': ['_skip_labeling'], 'count': [2]}","{'name': ['spam', 'lang_mismatch', 'pii', 'not...",2
7,4c9e7636-9bd3-4bd4-b7bd-ddcaace0cb0c,99896fca-cf92-4717-b466-ec1dd0f33f49,4dcf588c-b26e-45fe-90dd-862ec0fadda0,2023-04-08T13:53:34.739595+00:00,The logic behind easy accessibility to healthc...,assistant,en,3,True,False,0.0,False,,"{'toxicity': 0.0013847266091033816, 'severe_to...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,"{'name': ['+1', '_skip_labeling'], 'count': [3...","{'name': ['spam', 'fails_task', 'lang_mismatch...",3
8,842ce521-0bcc-48db-bd9a-dc36fa30e884,99896fca-cf92-4717-b466-ec1dd0f33f49,46fe4ac9-b71d-451e-9b94-25606ce979fc,2023-04-08T14:04:13.987169+00:00,Easy accessibility to healthcare can lead to i...,assistant,en,3,True,False,1.0,False,,"{'toxicity': 0.0002885847061406821, 'severe_to...",d3675c3c-d39e-4928-8d21-222d75963b4d,ready_for_export,"{'name': ['_skip_labeling'], 'count': [1]}","{'name': ['spam', 'fails_task', 'lang_mismatch...",3


## Create Message Tree

In [8]:
# let's create a tree of message ids
id_tree = Tree()
# and a second tree with the same shape that displays the (shortened) message texts
text_tree = Tree()
# max number of characters of each message shown in the text tree
max_char_len = 100

# iterate over rows in df_message_tree
# NOTE: treelib needs a parent node to exist before its children are added; this
# relies on df_message_tree being sorted by created_date (see the cell above)
for _, row in df_message_tree.iterrows():
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    # slicing already copes with texts shorter than max_char_len;
    # newlines are flattened so every node prints on a single line
    text_short = row["text"][:max_char_len].replace("\n", " ")

    # create a node in the id_tree, add row as data in case want it later
    id_tree.create_node(message_id, message_id, parent=parent_id, data=row.to_dict())

    # the text tree shows the short text (tag) but is keyed by message_id as well:
    # truncated texts are not guaranteed to be unique, so using them as node
    # identifiers could fail on duplicates or attach a reply to the wrong parent
    text_tree.create_node(text_short, message_id, parent=parent_id)


print("id_tree:")
id_tree.show()

print("text_tree:")
text_tree.show()

id_tree:
d3675c3c-d39e-4928-8d21-222d75963b4d
├── 1de96d1b-9dff-45db-89a2-e01ebe3f0572
│   └── 92757fff-9748-4d39-9060-0e4d234f85bb
├── 1e97f980-9e31-4cd7-b89c-d689b79f62a7
│   └── 99896fca-cf92-4717-b466-ec1dd0f33f49
│       ├── 4c9e7636-9bd3-4bd4-b7bd-ddcaace0cb0c
│       └── 842ce521-0bcc-48db-bd9a-dc36fa30e884
└── f6837ead-c18a-44f0-aeaf-68caa7112842
    └── b23e8afd-6fbd-47e3-ab4e-c5b13014287c

text_tree:
make this statement better : Easy accessibility to healthcare should be a priority for the governmen
├── Here is a revised version of the sentence that is easier to read: Governments should provide easy ac
│   └── What are some ways I might rephrase the statement in order to make it a persuasive argument and a me
├── Here's a few variations on that statement that might be considered better:  - Improving overall publ
│   └── What is the logic behind easy accessibility to healthcare leading to improvement on overall public h
│       ├── Easy accessibility to healthcare can lead to 

In [10]:
# which values does the synthetic flag take?
df["synthetic"].unique()

array([False])

In [11]:
# how many messages are flagged as deleted?
df["deleted"].value_counts()

False    82952
True      1485
Name: deleted, dtype: int64

In [12]:
# distribution of the rank values (bracket access is required here: df.rank is a DataFrame method)
df["rank"].value_counts()

0.0     17972
1.0     17971
2.0     11463
3.0       963
4.0       234
5.0        72
6.0        27
7.0        13
8.0         6
9.0         3
10.0        1
11.0        1
12.0        1
13.0        1
14.0        1
15.0        1
Name: rank, dtype: int64

In [13]:
# number of messages per language code
df["lang"].value_counts()

en       39283
es       22763
ru        7242
zh        3314
de        3050
fr        2474
th        1460
pt-BR     1165
ca        1158
uk-UA      587
it         554
ja         363
pl         304
eu         250
vi         191
hu          75
ar          56
da          44
tr          37
ko          24
fi          18
id          12
cs          12
sv           1
Name: lang, dtype: int64