# Preparing the dataset

Load the manually classified comment data and split it into train, test, and validate datasets.

In [13]:
import pandas as pd

# Read the comments workbook into a DataFrame and preview its first rows.
df = pd.read_excel(io='data/comments_rlhf.xlsx')
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,id,createdAt,userId,userCreatedAt,classYear,universityId,country,universityName,parentId,...,Puzzled,emotion,educational,giving feedback,asking a question,insulting,supporting,Humour,Frustration,theme
0,0,2,2021-03-25 11:22:56.133000+00:00,1195,2019-03-18 18:41:10.891000+00:00,Year 4,2620,United Kingdom,University College London (UCL),,...,0.0,joy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,supporting
1,1,4,2021-03-25 12:14:48.834000+00:00,1542,2019-06-01 20:00:49.151000+00:00,Year 5,2620,United Kingdom,University College London (UCL),,...,0.0,fear,0.0,0.0,1.0,0.0,0.0,0.0,0.0,asking a question
2,2,5,2021-03-25 12:17:29.904000+00:00,1542,2019-06-01 20:00:49.151000+00:00,Year 5,2620,United Kingdom,University College London (UCL),,...,1.0,?puzzled,0.0,0.0,1.0,0.0,0.0,0.0,0.0,asking a question


In [14]:
# Keep only rows whose 'Puzzled' value is present (presumably the rows that
# were manually labelled - confirm), and copy so later edits leave `df` intact.
has_puzzled_value = df['Puzzled'].notna()
df_rlhf = df[has_puzzled_value].copy()

In [15]:
import numpy as np
# Merge the 'Puzzled' annotation into 'surprise': the two flags are summed and
# the result is capped at 1 (the flags appear to be 0/1 indicators).
df_rlhf['surprise'] = np.minimum(df_rlhf['surprise'] + df_rlhf['Puzzled'], 1)
# The categorical 'emotion' column spells the same annotation as '?puzzled';
# map it onto 'surprise' so it agrees with the merged indicator column.
df_rlhf['emotion'] = df_rlhf['emotion'].replace({'?puzzled': 'surprise'})

In [16]:
# Columns removed before export: 'Unnamed: 0' (presumably a leftover index
# column from the spreadsheet - confirm), the 'Puzzled' flag, and the
# various '*_update' columns.
drop_cols = [
    'Unnamed: 0',
    'Puzzled',
    'chapter_explanation_update',
    'question_update',
    'question_explanation_update',
    'qc_explanation_update',
]
df_rlhf = df_rlhf.drop(columns=drop_cols)

In [17]:
# Normalise casing: lower-case the two capitalised label columns and the
# values of the categorical 'theme' column.
rename_cols = ['Frustration', 'Humour']

df_rlhf = (
    df_rlhf
    .rename(columns={name: name.lower() for name in rename_cols})
    .assign(theme=lambda frame: frame['theme'].str.lower())
)

In [18]:
# Preview the cleaned frame after the drops and renames.
df_rlhf.head(n=3)

Unnamed: 0,id,createdAt,userId,userCreatedAt,classYear,universityId,country,universityName,parentId,questionId,...,surprise,emotion,educational,giving feedback,asking a question,insulting,supporting,humour,frustration,theme
0,2,2021-03-25 11:22:56.133000+00:00,1195,2019-03-18 18:41:10.891000+00:00,Year 4,2620,United Kingdom,University College London (UCL),,1170,...,0.0,joy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,supporting
1,4,2021-03-25 12:14:48.834000+00:00,1542,2019-06-01 20:00:49.151000+00:00,Year 5,2620,United Kingdom,University College London (UCL),,1090,...,0.0,fear,0.0,0.0,1.0,0.0,0.0,0.0,0.0,asking a question
2,5,2021-03-25 12:17:29.904000+00:00,1542,2019-06-01 20:00:49.151000+00:00,Year 5,2620,United Kingdom,University College London (UCL),,2603,...,1.0,surprise,0.0,0.0,1.0,0.0,0.0,0.0,0.0,asking a question


In [19]:
# Indicator columns, grouped by the categorical column they accompany.
tone_labels = ['positive', 'negative', 'neutral']
emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
theme_labels = ['educational', 'giving feedback', 'asking a question', 'insulting', 'supporting', 'humour', 'frustration']

# Cast every indicator column to int in one pass, then show the resulting dtypes.
label_cols = [*tone_labels, *emotion_labels, *theme_labels]
df_rlhf = df_rlhf.astype(dict.fromkeys(label_cols, int))
df_rlhf.dtypes

id                     int64
createdAt             object
userId                 int64
userCreatedAt         object
classYear             object
universityId           int64
country               object
universityName        object
parentId             float64
questionId             int64
comment               object
review               float64
negative               int64
neutral                int64
positive               int64
tone                  object
sadness                int64
joy                    int64
love                   int64
anger                  int64
fear                   int64
surprise               int64
emotion               object
educational            int64
giving feedback        int64
asking a question      int64
insulting              int64
supporting             int64
humour                 int64
frustration            int64
theme                 object
dtype: object

In [20]:
# Summarise the three categorical label columns (count, unique, top, freq).
categorical_cols = ['tone', 'emotion', 'theme']
df_rlhf[categorical_cols].describe()

Unnamed: 0,tone,emotion,theme
count,150,150,150
unique,3,6,7
top,neutral,anger,asking a question
freq,99,55,55


In [21]:
# Export the cleaned frame as JSON Lines (one record per line); the path is
# reused when the file is loaded back as a dataset.
rlhf_jsonl = 'data/comments_rlhf.jsonl'
df_rlhf.to_json(path_or_buf=rlhf_jsonl, lines=True, orient="records")

In [22]:
from datasets import load_dataset

# Load the exported JSON-lines file as a single 'train' split, then carve it
# into train / test / validate partitions.
data_files = {
    'train': rlhf_jsonl
}

ds = load_dataset("json", data_files=data_files)

# Fixed seed so that Restart & Run All reproduces the same partitions;
# without it every run shuffles differently and pushes a different split.
SPLIT_SEED = 42

# First split: 80% train, 20% held out.
ds2 = ds['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Second split: halve the held-out 20% into test and validate (10% each).
ds3 = ds2['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

ds['train'] = ds2['train']
# The 'train'/'test' keys of ds3 are just the two halves of the held-out set.
ds['test'] = ds3['train']
ds['validate'] = ds3['test']
ds

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'createdAt', 'userId', 'userCreatedAt', 'classYear', 'universityId', 'country', 'universityName', 'parentId', 'questionId', 'comment', 'review', 'negative', 'neutral', 'positive', 'tone', 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise', 'emotion', 'educational', 'giving feedback', 'asking a question', 'insulting', 'supporting', 'humour', 'frustration', 'theme'],
        num_rows: 120
    })
    test: Dataset({
        features: ['id', 'createdAt', 'userId', 'userCreatedAt', 'classYear', 'universityId', 'country', 'universityName', 'parentId', 'questionId', 'comment', 'review', 'negative', 'neutral', 'positive', 'tone', 'sadness', 'joy', 'love', 'anger', 'fear', 'surprise', 'emotion', 'educational', 'giving feedback', 'asking a question', 'insulting', 'supporting', 'humour', 'frustration', 'theme'],
        num_rows: 15
    })
    validate: Dataset({
        features: ['id', 'createdAt', 'userId', 'userCreatedAt', 'classY

In [23]:
# Upload all splits to the Hugging Face Hub as a private dataset repository.
ds.push_to_hub(repo_id='quesmed/comment_sentiment', private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/1.85k [00:00<?, ?B/s]