In [1]:
import argparse
import time
import os
import json
import logging

import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import AutoTokenizer

from ldc_data import load_ldc_data
from charm.model.args import parse_args

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_ldc_data(False, True)

In [3]:
df = pd.DataFrame.from_dict(data, orient='index')

In [4]:
# filter out any unprocessed data
df = df[df['processed'] == True]

## Explode labeled change point data

In [5]:
change_point_df = df[['file_id', 'changepoints']]
change_point_df = change_point_df.explode(column='changepoints')
change_point_df = change_point_df.reset_index(drop=True)
change_point_df = pd.concat((change_point_df, pd.json_normalize(change_point_df['changepoints'])), axis=1)
change_point_df = change_point_df.drop(columns=['changepoints'])

In [6]:
change_point_df.head()

Unnamed: 0,file_id,timestamp,impact_scalar,comment,annotator
0,M0100053I,399.0,2.0,Pre-change: Speakers were exchanging informati...,212.0
1,M0100053J,,,,
2,M0100053L,,,,
3,M0100053R,,,,
4,M01000545,,,,


## Explode utterance data

In [7]:
utterance_df = df.drop(columns=['changepoints'])
utterance_df = utterance_df.explode('utterances')
utterance_df = utterance_df.reset_index(drop=True)
utterance_df = pd.concat((utterance_df, pd.json_normalize(utterance_df['utterances'])), axis=1)
utterance_df = utterance_df.drop(columns=['utterances'])

In [8]:
utterance_df.columns = ['file_id', 'split', 'anno_start', 'anno_end', 'url', 'status_in_corpora',
       'data_type', 'release', 'processed', 'start', 'end', 'text',
       'avg_logprob', 'no_speech_prob', 'audio_files', 'video_frames']

In [9]:
# for each file id, filter out utterances whose end time is less than anno_start and whose start time is greater than anno_end
utterance_df = utterance_df[utterance_df['end'] >= utterance_df['anno_start']]
utterance_df = utterance_df[utterance_df['start'] <= utterance_df['anno_end']]

In [10]:
# merge change_point_df and utterance_df on file_id and then filter down to rows where the start time of the utterance is less than the change point and the end time of the utterance is greater than the change point
merged_df = pd.merge(change_point_df, utterance_df, on='file_id')
merged_df = merged_df[merged_df['start'] <= merged_df['timestamp']]
merged_df = merged_df[merged_df['end'] >= merged_df['timestamp']]

In [11]:
# this data frame contains all the valid change points
# left join back into utterance_df on file_id, start, and end
merged_df = merged_df[['file_id', 'start', 'end', 'timestamp', 'impact_scalar', 'comment', 'annotator']]
label_df = pd.merge(utterance_df, merged_df, on=['file_id', 'start', 'end'], how='left')

In [12]:
# confirm we have all the expected change points
assert label_df['timestamp'].notna().sum() == len(merged_df)

In [13]:
# create a labels column where 1 is a change point and 0 is not
label_df['labels'] = label_df['timestamp'].notna().astype(int)

In [14]:
# sort values by file_id and start to be safe
label_df = label_df.sort_values(by=['file_id', 'start'], ascending=True)
label_df = label_df.reset_index(drop=True)
# give ourselves an index to work with
label_df = label_df.reset_index()
# try to put an index on both the index and the file_id to speed up the slicing
label_df = label_df.set_index(['index', 'file_id'])

In [18]:
# alternatively, we can just retrieve the data point, determine which file_id it belongs to, slice the data frame, and then look at the [-k:idx+1] slice of the df
idx = 4
k = 3
file_id = label_df.xs(idx, level=0, drop_level=True).index[0]
start_idx = max(0, idx - k)
# .loc[start_idx:idx] is inclusive
utterances = label_df.xs(file_id, level=1, drop_level=False).loc[start_idx:idx]

In [19]:
model_name_or_file_path = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_file_path)

In [20]:
expected = tokenizer(
            [label_df['text'].values.tolist()[:2]],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            return_attention_mask=False)['input_ids']

In [21]:
# the pattern the model is using when concatenating multiple utterances is to
# start the sequence with the cls_token_id, then finish an utterance with the eos_token_id,
# then include a sep_token_id between utterances
# and finally include an eos_token_id at the end of the sequence
tokenizer.cls_token_id
tokenizer.eos_token_id
tokenizer.sep_token_id

2

In [22]:
input_ids = tokenizer(
            label_df['text'].values.tolist()[:2],
            add_special_tokens=False,
            max_length=512,
            truncation=True,
            return_attention_mask=False)['input_ids']

In [23]:
tokens = [tokenizer.cls_token_id]
for idx, utterance in enumerate(input_ids):
    # add a sep token between utterances
    if idx > 0:
        tokens.append(tokenizer.sep_token_id)
    tokens.extend(utterance)
    tokens.append(tokenizer.eos_token_id)

In [24]:
assert tokens == expected[0]

In [25]:
class ChangePointDataset(Dataset):
    """Pretokenizes the text and combines window size utterances into one
    sample, adding special tokens, as needed, when generating the example.
    """

    def __init__(self, df, tokenizer, window_size=3, stride=1):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = tokenizer.model_max_length
        self.window_size = window_size
        self.stride = stride

        # pretokenize the text
        # TODO: move over to an apache beam pipeline
        # though there's not really an easy way to do this without replicating
        # the data many times
        self.df['input_ids'] = self.tokenizer(
            self.df['text'].values.tolist(),
            add_special_tokens=False,
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=False)['input_ids']

    def _get_tokens(self, input_id_list):
        tokens = [self.tokenizer.cls_token_id]
        for idx, utterance in enumerate(input_id_list):
            # add a sep token between utterances
            if idx > 0:
                tokens.append(self.tokenizer.sep_token_id)
            tokens.extend(utterance)
            tokens.append(self.tokenizer.eos_token_id)
        
        # if the sequence is too long, truncate it starting from the beginning
        # TODO: with this, you get the occasial sequence that starts with a sep token
        if len(tokens) > self.max_len:
            tokens = tokens[-(self.max_len - 1):]
            # add the cls token back
            tokens = [self.tokenizer.cls_token_id] + tokens
        return tokens
    
    def __len__(self):
        # length is the number of examples that can be generated per filename
        # times the number of filenames
        return len(self.df)

    def __getitem__(self, idx):
        # TODO: speed this up somehow
        file_id = self.df.xs(idx, level=0, drop_level=True).index[0]
        start_idx = max(0, idx - self.window_size)
        # .loc[start_idx:idx] is inclusive
        utterances = self.df.xs(file_id, level=1, drop_level=False).loc[start_idx:idx]

        input_id_list = utterances['input_ids'].values.tolist()
        tokens = self._get_tokens(input_id_list)

        # label should be the max label in the window (i.e. greedily label change points)
        # i.e. if any of the utterances in the window are change points, then the window is a change point
        label = utterances['labels'].max()
        return {'input_ids': tokens, 'label': label}

In [26]:
change_point_dataset = ChangePointDataset(label_df, tokenizer, window_size=3, stride=1)

In [73]:
# iterate over the dataset and confirm that the tokens/labels are what we expect
start = time.perf_counter()
data_points = []
for i in range(len(change_point_dataset)):
    data_points.append(change_point_dataset[i])
    # if i > 100:
    #     break
end = time.perf_counter()
print(f'elapsed time: {end - start: .2f} seconds')

elapsed time:  482.88 seconds


In [28]:
# approx time (minutes) to iterate over the dataset
(446) / 60

7.433333333333334

## Descriptive statistics about the dataset
- what's the min, max, median, mean length of the utterance
- what truncation strategy should we employ (random sampling of the longest utterance? cut off the first couple utterances?)

In [74]:
assert len(data_points) == len(label_df)
label_df['encoded_data'] = data_points
temp_df = pd.json_normalize(label_df['encoded_data'])
temp_df.index = label_df.index
label_df[['input_ids_final', 'label_final']] = temp_df[['input_ids', 'label']]
check_labels_df = label_df[label_df['labels'] == 1][['labels', 'label_final']]
check_labels_df['label_match'] = check_labels_df['labels'] == check_labels_df['label_final']
check_labels_df['label_match'].value_counts()

True    2341
Name: label_match, dtype: int64

In [75]:
# get descriptive stats on the lengths of the input_ids
label_df['input_ids_len'] = label_df['input_ids_final'].apply(lambda x: len(x))
label_df['input_ids_len'].describe()

count    430897.000000
mean         71.769010
std          14.832004
min          13.000000
25%          66.000000
50%          70.000000
75%          75.000000
max         512.000000
Name: input_ids_len, dtype: float64

In [76]:
# take length of the utterance, add 2 for the cls and eos tokens, then sum windows within each file_id
label_df['input_ids_len'] = label_df['input_ids'].apply(lambda x: len(x) + 2)
label_df.groupby('file_id')['input_ids_len'].rolling(3, min_periods=1).sum().reset_index(drop=True).describe()

count    430897.000000
mean         54.067165
std          11.625714
min          13.000000
25%          49.000000
50%          53.000000
75%          57.000000
max         722.000000
Name: input_ids_len, dtype: float64

In [77]:
label_df['text'].apply(len).describe()

count    430897.000000
mean          8.909350
std           8.307804
min           1.000000
25%           5.000000
50%           8.000000
75%          11.000000
max        1061.000000
Name: text, dtype: float64

In [117]:
# determine the class weights on the fly
k = 4
class_counts = [0, 0]
for file_id in label_df[label_df['split'] == 'train'].index.get_level_values(1):
    temp_df = label_df.xs(file_id, level=1)    
    
    for i in range(len(temp_df)):
        start_idx = max(0, i - (k-1))
        utterances = temp_df.loc[start_idx:i]
        label = utterances['labels'].max()
        # if nan
        if np.isnan(label):
            label = 0
        class_counts[int(label)] += 1

KeyboardInterrupt: 

In [116]:
class_counts

nan

In [84]:
# class_counts = label_df[label_df['split'] == 'train']['label_final'].value_counts().values

In [85]:
# [0, 1]
class_counts

array([301598,   6158])

In [87]:
# given this imbalance, we should probably use a weighted loss function
# applying the following function: n_samples / (n_classes * np.bincount(y))
loss_weights = class_counts.sum() / (len(class_counts) * class_counts)

In [88]:
loss_weights

array([ 0.51020895, 24.98830789])

## Pull in social orientation tags

In [31]:
class Predictor(object):
    def __init__(self, model_dir):
        self.model_dir = model_dir
        # self.model = model # just a blueprint
        # self.tokenizer = tokenizer
        # self.device = args.device
    
    def _get_latest_checkpoint(self):
        # get the last checkpoint
        checkpoints = [
            f for f in os.listdir(self.args.model_dir) if 'checkpoint' in f
            and os.path.isdir(os.path.join(self.args.model_dir, f))
        ]
        checkpoints = sorted(checkpoints, key=lambda x: int(x.split('-')[1]))
        checkpoint = None
        if len(checkpoints) > 0:
            checkpoint = checkpoints[-1]
        return checkpoint
    
    def load_config(self, checkpoint=None):
        """Loads the model from disk."""
        # if checkpoint is None, load best based on best_checkpoint.txt
        if checkpoint is None:
            with open(os.path.join(self.model_dir, 'best_checkpoint.txt'),
                        'r') as f:
                checkpoint = f.read()
        elif checkpoint == 'last':
            checkpoint = self._get_latest_checkpoint()
            checkpoint = os.path.join(self.model_dir, checkpoint)
        else:
            checkpoint = os.path.join(self.model_dir, checkpoint)
        
        self.checkpoint = checkpoint

        # load trainer state
        with open(os.path.join(checkpoint, 'trainer_state.json'), 'r') as f:
            trainer_state = json.load(f)
            self.global_step = trainer_state['global_step']
            self.epoch = trainer_state['epoch']
            self.metrics = trainer_state['metrics']
            self.wandb_run_id = trainer_state['wandb_run_id']
            self.args = argparse.Namespace(**trainer_state['args'])
        
    def load_model(self, model):
        self.model = model
        # load model
        # define device map so we load on rank 0 and broadcast to other ranks
        # https://discuss.pytorch.org/t/checkpoint-in-multi-gpu/97852/11
        map_location = None
        # TODO: will need to adjust args to support this properly
        if self.args.distributed:
            map_location = f'cuda:{self.args.local_rank}'
            self.model.load_state_dict(
                torch.load(os.path.join(self.checkpoint, 'model.pt'),
                            map_location=map_location))
            self.model.to(self.args.device)
            logging.info(
                f'Model device {self.model.device} on rank {self.args.local_rank}'
            )
            self.model = DDP(
                self.model,
                device_ids=[self.args.device],
                output_device=self.args.device,
            )
            # dist.barrier()
        else:
            self.model.load_state_dict(
                torch.load(os.path.join(self.checkpoint, 'model.pt'),
                            map_location=map_location))
            self.model.to(self.args.device)
        
        # put the model in eval mode
        self.model.eval()
        logging.info(f'Loaded model on {self.args.device}...')
        # self.optimizer.load_state_dict(
        #     torch.load(os.path.join(save_dir, 'optimizer.pt'),
        #                 map_location=map_location))
        # logging.info(f'Loaded optimizer on {self.args.device}...')
        # self.lr_scheduler.load_state_dict(
        #     torch.load(os.path.join(save_dir, 'lr_scheduler.pt'),
        #                 map_location=map_location))

In [32]:
# arg_list = [
#         '--batch-size', '64',
#         '--wandb-project', 'social-orientation',
#         '--log-level', 'INFO',
#         '--seed', '10',
#         '--data-dir', '/mnt/swordfish-pool2/ccu/transformed/circumplex',
#         '--model-dir', '/mnt/swordfish-pool2/ccu/models/xlm-roberta-base-pt']
# args = parse_args(arg_list)

In [118]:
predictor = Predictor(model_dir='/mnt/swordfish-pool2/ccu/models/xlm-roberta-base-pt')

In [34]:
predictor.load_config(checkpoint=None)

In [35]:
model = AutoModelForSequenceClassification.from_pretrained(
            predictor.args.model_name_or_path,
            num_labels=len(predictor.args.label2id),
            id2label=predictor.args.id2label,
            label2id=predictor.args.label2id)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [119]:
predictor.args.device = 'cuda:0'

AttributeError: 'Predictor' object has no attribute 'args'

In [37]:
predictor.load_model(model=model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [38]:
# create a dataloader from the label_df
from torch.utils.data import Dataset, DataLoader

class LabelDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        return self.df[['input_ids']].iloc[idx].to_dict()

label_dataset = LabelDataset(label_df)
collate = DataCollatorWithPadding(tokenizer=tokenizer)
label_dataloader = DataLoader(label_dataset, batch_size=1024, num_workers=32, prefetch_factor=2, shuffle=False, collate_fn=collate)

In [39]:
# batch = next(iter(label_dataloader))

In [40]:
# make model predictions
from tqdm import tqdm

with torch.no_grad():
    predictions = []
    for batch in tqdm(label_dataloader):
        # move data to device
        batch = {k: v.to(predictor.args.device) for k, v in batch.items()}
        outputs = predictor.model(**batch)
        logits = outputs[1]
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.tolist())

  0%|          | 0/421 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with 

In [41]:
# add predictions to label_df
label_df['social_orientation_preds'] = predictions

In [42]:
# convert ids to ints
predictor.args.id2label = {int(k): v for k, v in predictor.args.id2label.items()}

In [43]:
label_df['social_orientation_preds'] = label_df['social_orientation_preds'].map(predictor.args.id2label)

In [44]:
# load labeled circumplex data to sanity check these results
# load the dataset
circumplex_df = pd.read_csv(os.path.join(predictor.args.data_dir, 'gpt_labels_r1_mini_eval_text.csv'))
circumplex_df.head()

Unnamed: 0,filename,@id,@start_char,@end_char,ORIGINAL_TEXT,@type,@begin_offset,@char_length,id,time,...,impact_scalar,comment,Utterance ID,Complete Line,Complete Line Length,line_len_cumsum,social_orientation,utterance_id,speaker_id,label_str
0,M01000EY0.ltf.xml,segment-0,0,11,你手机充300不够是吧？,message,0,14,m0000,2012-07-09 08:54:39 UTC,...,,,1,Speaker 1 (1): 你手机充300不够是吧？,20,20,Unassured-Submissive,1.0,1.0,Speaker 1 (1): Unassured-Submissive - The spea...
1,M01000EY0.ltf.xml,segment-1,14,20,115.51？,message,14,9,m0001,2012-07-09 08:56:50 UTC,...,,,2,Speaker 2 (2): 115.51？,12,32,Unassuming-Ingenuous,2.0,2.0,Speaker 2 (2): Unassuming-Ingenuous - The spea...
2,M01000EY0.ltf.xml,segment-2,23,27,我都糊涂了,message,23,7,m0002,2012-07-09 08:56:53 UTC,...,,,3,Speaker 2 (3): 我都糊涂了,16,48,Aloof-Introverted,3.0,2.0,Speaker 2 (3): Aloof-Introverted - The speaker...
3,M01000EY0.ltf.xml,segment-3,30,36,我打电话问问吧,message,30,9,m0003,2012-07-09 08:56:59 UTC,...,,,4,Speaker 2 (4): 我打电话问问吧,15,63,Unassuming-Ingenuous,4.0,2.0,Speaker 2 (4): Unassuming-Ingenuous - The spea...
4,M01000EY0.ltf.xml,segment-4,39,39,昂,message,39,3,m0004,2012-07-09 08:57:56 UTC,...,,,5,Speaker 1 (5): 昂,9,72,Unassured-Submissive,5.0,1.0,Speaker 1 (5): Unassured-Submissive - The spea...


In [45]:
circumplex_df['start'] = circumplex_df['@start_char'].astype(float)

In [46]:
circumplex_df.head()

Unnamed: 0,filename,@id,@start_char,@end_char,ORIGINAL_TEXT,@type,@begin_offset,@char_length,id,time,...,comment,Utterance ID,Complete Line,Complete Line Length,line_len_cumsum,social_orientation,utterance_id,speaker_id,label_str,start
0,M01000EY0.ltf.xml,segment-0,0,11,你手机充300不够是吧？,message,0,14,m0000,2012-07-09 08:54:39 UTC,...,,1,Speaker 1 (1): 你手机充300不够是吧？,20,20,Unassured-Submissive,1.0,1.0,Speaker 1 (1): Unassured-Submissive - The spea...,0.0
1,M01000EY0.ltf.xml,segment-1,14,20,115.51？,message,14,9,m0001,2012-07-09 08:56:50 UTC,...,,2,Speaker 2 (2): 115.51？,12,32,Unassuming-Ingenuous,2.0,2.0,Speaker 2 (2): Unassuming-Ingenuous - The spea...,14.0
2,M01000EY0.ltf.xml,segment-2,23,27,我都糊涂了,message,23,7,m0002,2012-07-09 08:56:53 UTC,...,,3,Speaker 2 (3): 我都糊涂了,16,48,Aloof-Introverted,3.0,2.0,Speaker 2 (3): Aloof-Introverted - The speaker...,23.0
3,M01000EY0.ltf.xml,segment-3,30,36,我打电话问问吧,message,30,9,m0003,2012-07-09 08:56:59 UTC,...,,4,Speaker 2 (4): 我打电话问问吧,15,63,Unassuming-Ingenuous,4.0,2.0,Speaker 2 (4): Unassuming-Ingenuous - The spea...,30.0
4,M01000EY0.ltf.xml,segment-4,39,39,昂,message,39,3,m0004,2012-07-09 08:57:56 UTC,...,,5,Speaker 1 (5): 昂,9,72,Unassured-Submissive,5.0,1.0,Speaker 1 (5): Unassured-Submissive - The spea...,39.0


In [47]:
# merge with the label_df on file_id and start
temp_df = pd.merge(label_df, circumplex_df[['file_id', 'start', 'social_orientation']], on=['file_id', 'start'], how='left')

In [48]:
assert len(temp_df) == len(label_df)

In [49]:
# reassign the label_df so we retain the ground truth social orientation labels
label_df = temp_df

In [50]:
# check the accuracy of the predictions, which is as expected
temp_df = temp_df.dropna(subset=['social_orientation'])
temp_df['social_orientation_match'] = temp_df['social_orientation_preds'] == temp_df['social_orientation']
temp_df['social_orientation_match'].sum() / len(temp_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['social_orientation_match'] = temp_df['social_orientation_preds'] == temp_df['social_orientation']


0.3907206455203116

In [51]:
# save label_df to disk
data_dir = '/mnt/swordfish-pool2/ccu/transformed/change-point'
os.makedirs(data_dir, exist_ok=True)
label_df.to_csv(os.path.join(data_dir, 'change_point_social_orientation_train_val_test.csv'), index=True)

In [63]:
# lost index when setting label_df = temp_df
label_df = label_df.reset_index(drop=False)
label_df = label_df.set_index(['index', 'file_id'])

In [64]:
label_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,split,anno_start,anno_end,url,status_in_corpora,data_type,release,processed,start,end,...,video_frames,timestamp,impact_scalar,comment,annotator,labels,input_ids,social_orientation_preds,social_orientation,text_final
index,file_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,M01000538,train,402.8,721.7,na,[(LDC2022E11_CCU_TA1_Mandarin_Chinese_Developm...,audio,LDC2022E18,True,402.0,404.0,...,[],,,,,0,"[6, 5169, 155474, 133334, 9940, 378, 7614, 330...",Arrogant-Calculating,,还是挣那么多钱 [Arrogant-Calculating]
1,M01000538,train,402.8,721.7,na,[(LDC2022E11_CCU_TA1_Mandarin_Chinese_Developm...,audio,LDC2022E18,True,404.0,406.0,...,[],,,,,0,"[6, 1036, 4, 631, 176923, 31183, 18350, 378, 7...",Warm-Agreeable,,"对,我这边挺好的 [Warm-Agreeable]"
2,M01000538,train,402.8,721.7,na,[(LDC2022E11_CCU_TA1_Mandarin_Chinese_Developm...,audio,LDC2022E18,True,406.0,407.0,...,[],,,,,0,"[6, 35168, 378, 106396, 66596, 214, 9, 116836,...",Unassuming-Ingenuous,,哦 [Unassuming-Ingenuous]
3,M01000538,train,402.8,721.7,na,[(LDC2022E11_CCU_TA1_Mandarin_Chinese_Developm...,audio,LDC2022E18,True,407.0,409.0,...,[],,,,,0,"[73675, 11973, 2391, 6147, 378, 106396, 66596,...",Unassuming-Ingenuous,,你注意点啊 [Unassuming-Ingenuous]
4,M01000538,train,402.8,721.7,na,[(LDC2022E11_CCU_TA1_Mandarin_Chinese_Developm...,audio,LDC2022E18,True,409.0,410.0,...,[],,,,,0,"[6, 7064, 2183, 9449, 22191, 5070, 4502, 378, ...",Unassuming-Ingenuous,,没事写封信吧 [Unassuming-Ingenuous]


In [52]:
# load the label_df to understand what the index looks like
reload_label_df = pd.read_csv(os.path.join(data_dir, 'change_point_social_orientation_train_val_test.csv'), index_col=[0, 1])

  reload_label_df = pd.read_csv(os.path.join(data_dir, 'change_point_social_orientation_train_val_test.csv'), index_col=[0, 1])


In [60]:
reload_label_df.iloc[-1:]

Unnamed: 0_level_0,Unnamed: 1_level_0,split,anno_start,anno_end,url,status_in_corpora,data_type,release,processed,start,end,...,audio_files,video_frames,timestamp,impact_scalar,comment,annotator,labels,input_ids,social_orientation_preds,social_orientation
Unnamed: 0_level_1,file_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
430896,M01005FAG,train,34.6,334.6,https://www.bilibili.com/video/BV1uU4y187TL,[('LDC2023E03_CCU_TA1_Mandarin_Chinese_Develop...,video,LDC2022E18,True,333.68,335.32,...,[],[],,,,,0,"[6, 3933, 57105, 243973, 107249]",Unassuming-Ingenuous,


## Revise the dataset class 
- optionally generate impact scalars
- optionally include social orientation information in utterances
- define the window to be forward and backward looking

In [65]:
label_df['text_final'] = label_df['text'] + ' ' + label_df['social_orientation_preds'].apply(lambda x: f'[{x}]')

In [66]:
label_df.loc[390:395]['impact_scalar'].min()

2.0

In [67]:
class ChangePointDataset(Dataset):
    """Pretokenizes the text and combines window size utterances into one
    sample, adding special tokens, as needed, when generating the example.
    """

    def __init__(self, df, tokenizer, window_size=3, impact_scalar=False, social_orientation=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = tokenizer.model_max_length
        self.window_size = window_size
        self.impact_scalar = impact_scalar
        self.social_orientation = social_orientation

        # pretokenize the text
        # TODO: move over to an apache beam pipeline
        # though there's not really an easy way to do this without replicating
        # the data many times
        # TODO: add special tokens to the text
        # TODO: use ground truth social orientation labels
        if social_orientation:
            # e.g. 还是挣那么多钱 [Arrogant-Calculating]
            self.df['text_final'] = self.df['text'] + ' ' + self.df['social_orientation_preds'].apply(lambda x: f'[{x}]')
        else:
            self.df['text_final'] = self.df['text']
        
        self.df['input_ids'] = self.tokenizer(
            self.df['text_final'].values.tolist(),
            add_special_tokens=False,
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=False)['input_ids']

    def _get_tokens(self, input_id_list):
        tokens = [self.tokenizer.cls_token_id]
        for idx, utterance in enumerate(input_id_list):
            # add a sep token between utterances
            if idx > 0:
                tokens.append(self.tokenizer.sep_token_id)
            tokens.extend(utterance)
            tokens.append(self.tokenizer.eos_token_id)
        
        # if the sequence is too long, truncate half from the beginning and half from the end
        # TODO: with this, you get the occasial sequence that starts with a sep token
        if len(tokens) > self.max_len:
            overage = len(tokens) - self.max_len
            tokens = tokens[((overage//2) + 2):-((overage//2) + 2)]
            # add the cls and eos tokens back
            tokens = [self.tokenizer.cls_token_id] + tokens + [self.tokenizer.eos_token_id]
        return tokens
    
    def __len__(self):
        # length is the number of examples that can be generated per filename
        # times the number of filenames
        return len(self.df)

    def __getitem__(self, idx):
        # TODO: speed this up somehow
        file_id = self.df.xs(idx, level=0, drop_level=True).index[0]
        start_idx = max(0, idx - self.window_size)
        end_idx = min(len(self.df.xs(file_id, level=1, drop_level=False)), idx + (self.window_size - 1))
        # .loc[start_idx:end_idx] is inclusive
        utterances = self.df.xs(file_id, level=1, drop_level=False).loc[start_idx:end_idx]

        input_id_list = utterances['input_ids'].values.tolist()
        tokens = self._get_tokens(input_id_list)

        # label should be the max label in the window (i.e. greedily label change points)
        # i.e. if any of the utterances in the window are change points, then the window is a change point
        label = utterances['labels'].max()
        # if nan
        if np.isnan(label):
            label = 0
        label = int(label)

        # add the impact scalar if needed
        if self.impact_scalar:
            # get the min impact scalar in the window among the impact scalars that are not 0
            # if impact scalar is not set it will NaN. Min will ignore NaNs if there is a non-NaN value
            impact_scalar = utterances['impact_scalar'].min()
            if np.isnan(impact_scalar):
                impact_scalar = 0.0
            return {'input_ids': tokens, 'label': label, 'impact_scalar': impact_scalar}
        
        return {'input_ids': tokens, 'label': label}

In [68]:
change_point_dataset = ChangePointDataset(label_df, tokenizer, window_size=3, impact_scalar=True, social_orientation=True)

In [69]:
tokenizer.decode(change_point_dataset[0]['input_ids'])

'<s> 还是挣那么多钱 [Arrogant-Calculating]</s>'