In [1]:
# reload on change
%load_ext autoreload
%autoreload 2

import os
import json
import logging
import argparse
import time
from functools import cache
import statistics

import numpy as np
import pandas as pd
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset
from transformers import AutoTokenizer

from charm.model.model import XLMRClassificationPlusTripletLoss
from charm.model.utils import get_data
from charm.loaders.ldc_data import load_ldc_data
from charm.model.average_precision import calculate_average_precision, filter_system_preds

  from .autonotebook import tqdm as notebook_tqdm


In [58]:
# Find the best-scoring model among the saved validation results.
# Every <predictions_dir>/<model>/val_results.json that exists is loaded; after
# stacking and json_normalize the frame has one row per (model, filtering) pair
# and one column per key of the nested result dicts (the recorded output shows
# 'text', 'video' and 'audio').
predictions_dir = '/mnt/swordfish-pool2/ccu/predictions'
modality = 'text'  # column used to rank filtering methods and models
model_dirs = os.listdir(predictions_dir)

# load val_results.json if present
val_results = {}
for model_dir in model_dirs:
    val_results_filepath = os.path.join(predictions_dir, model_dir, 'val_results.json')
    if os.path.exists(val_results_filepath):
        with open(val_results_filepath, 'r') as f:
            val_results[model_dir] = json.load(f)

# rows = models, columns = filtering methods, cells = nested result dicts
df = pd.DataFrame.from_dict(val_results, orient='index')
# long format: one row per (model, filtering) with the dict in 'val_results'
df = (df.stack()
        .to_frame()
        .rename(columns={0: 'val_results'})
        .reset_index()
        .rename(columns={'level_0': 'model', 'level_1': 'filtering'}))
# expand the nested dicts into one column per key
df = pd.concat([df.drop(columns=['val_results']), pd.json_normalize(df['val_results'])], axis=1)

# group by model and select the filtering method with the highest `modality` score.
# idxmax returns row labels, so the rows are looked up with .loc; this also
# avoids groupby.apply, whose handling of the grouping column differs between
# pandas versions.
best_df = df.loc[df.groupby('model')[modality].idxmax()].reset_index(drop=True)

# NOTE (original author): go with change-point-medium-reweight for now
# Rank by the same `modality` used above (previously hard-coded to 'text') and
# use label-based .loc, since idxmax returns a label rather than a position.
best_row = best_df.loc[best_df[modality].idxmax()]
print(best_row)

model        change-point-social-orientation
filtering                             lowest
text                                0.242385
video                               0.040131
audio                               0.020504
Name: 9, dtype: object


In [2]:
# Load the tokenizer for the 'xlm-roberta-base' checkpoint (the same name that
# is passed to XLMRClassificationPlusTripletLoss.from_pretrained further down).
# In the visible cells `tokenizer` is only referenced by commented-out code.
model_name_or_file_path = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_file_path)

In [3]:
# Read the combined train/val/test table; the first CSV column is used as the index.
df = pd.read_csv(
    os.path.join('/mnt/swordfish-pool2/ccu/transformed/change-point',
                 'change_point_social_orientation_train_val_test.csv'),
    index_col=0)

# Build one frame per split. Within each split the rows are renumbered from 0
# (first reset_index), that running number is exposed as an 'index' column
# (second reset_index), and ('index', 'file_id') becomes the MultiIndex.
train_df, val_df, test_df = (
    df[df['split'] == split_name]
    .reset_index(drop=True)
    .reset_index()
    .set_index(['index', 'file_id'])
    for split_name in ('train', 'val', 'test')
)

  df = pd.read_csv(os.path.join(


In [6]:
# start = time.perf_counter()
# idx_file_id_map = {idx: file_id for idx, file_id in train_df.index}
# end = time.perf_counter()
# print(f'idx_file_id_map took {end - start} seconds to create')

In [7]:
# # for each file, get start and end indices
# start = time.perf_counter()
# file_id_start_end_df = {}
# for file_id in train_df.index.get_level_values('file_id').unique():
#     file_df = train_df.xs(file_id, level=1, drop_level=False)
#     first_idx = file_df.iloc[0].name[0]
#     last_idx = file_df.iloc[-1].name[0]
#     file_id_start_end_df[file_id] = {'first_idx': first_idx, 'last_idx': last_idx, 'df': file_df}
# end = time.perf_counter()
# print(f'file_id_start_end took {end - start} seconds to create')

In [8]:
# class ChangePointDataset(Dataset):
#     """Pretokenizes the text and combines window size utterances into one
#     sample, adding special tokens, as needed, when generating the example.
#     """
#     def __init__(self,
#                  df,
#                  tokenizer,
#                  window_size=4,
#                  impact_scalar=False,
#                  social_orientation=False,
#                  profile=False):
#         self.df = df
#         self.tokenizer = tokenizer
#         self.max_len = tokenizer.model_max_length
#         self.window_size = window_size
#         self.impact_scalar = impact_scalar
#         self.social_orientation = social_orientation
#         self.profile = profile
        
#         # pretokenize the text
#         # TODO: move over to an apache beam pipeline
#         # though there's not really an easy way to do this without replicating
#         # the data many times
#         # TODO: add special tokens to the text
#         # TODO: use ground truth social orientation labels
#         if social_orientation:
#             # e.g. 还是挣那么多钱 [Arrogant-Calculating]
#             self.df['text_final'] = self.df['text'] + ' ' + self.df[
#                 'social_orientation_preds'].apply(lambda x: f'[{x}]')
#         else:
#             self.df['text_final'] = self.df['text']

#         self.df['input_ids'] = self.tokenizer(
#             self.df['text_final'].values.tolist(),
#             add_special_tokens=False,
#             max_length=self.max_len,
#             truncation=True,
#             return_attention_mask=False)['input_ids']

#         # build a map from index to file_id
#         self.idx_file_id_map = {idx: file_id for idx, file_id in self.df.index}
#         self.file_id_start_end_df = {}
#         for file_id in self.df.index.get_level_values('file_id').unique():
#             file_df = self.df.xs(file_id, level=1, drop_level=False)
#             first_idx = file_df.iloc[0].name[0]
#             last_idx = file_df.iloc[-1].name[0]
#             self.file_id_start_end_df[file_id] = (first_idx, last_idx, file_df)
        

#     def _get_tokens(self, input_id_list):
#         tokens = [self.tokenizer.cls_token_id]
#         for idx, utterance in enumerate(input_id_list):
#             # add a sep token between utterances
#             if idx > 0:
#                 tokens.append(self.tokenizer.sep_token_id)
#             tokens.extend(utterance)
#             tokens.append(self.tokenizer.eos_token_id)

#         # if the sequence is too long, truncate half from the beginning and half from the end
#         # TODO: with this, you get the occasional sequence that starts with a sep token
#         if len(tokens) > self.max_len:
#             overage = len(tokens) - self.max_len
#             tokens = tokens[((overage // 2) + 2):-((overage // 2) + 2)]
#             # add the cls and eos tokens back
#             tokens = [self.tokenizer.cls_token_id
#                       ] + tokens + [self.tokenizer.eos_token_id]
#         return tokens

#     def __len__(self):
#         # length is the number of examples that can be generated per filename
#         # times the number of filenames
#         return len(self.df)

#     @cache
#     def __getitem__(self, idx):
#         # TODO: speed this up somehow
#         get_data_start = time.perf_counter()
#         file_id = self.idx_file_id_map[idx]
#         first_idx, last_idx, file_df = self.file_id_start_end_df[file_id]
#         # turns out that using iloc is faster than using loc
#         # translate the idx to a row number
#         idx_row_num = idx - first_idx
#         # get the start and end indices for the window
#         start_idx = max(0, idx_row_num - self.window_size)
#         end_idx = min(last_idx - first_idx, idx_row_num + (self.window_size - 1))
#         get_data_end = time.perf_counter()
#         get_data_duration = get_data_end - get_data_start

#         get_slice_start = time.perf_counter()
#         # .loc[start_idx:end_idx] is inclusive
#         utterances = file_df.iloc[start_idx:end_idx+1]
#         get_slice_end = time.perf_counter()
#         get_slice_duration = get_slice_end - get_slice_start

#         # print(f'utterances are {utterances}')
#         convert_start = time.perf_counter()
#         input_id_list = utterances['input_ids'].values.tolist()
#         convert_end = time.perf_counter()
#         convert_duration = convert_end - convert_start

#         get_tokens_start = time.perf_counter()
#         tokens = self._get_tokens(input_id_list)
#         get_tokens_end = time.perf_counter()
#         get_tokens_duration = get_tokens_end - get_tokens_start
#         # print(f'tokens is {tokens}')

#         remainder_start = time.perf_counter()
#         # label should be the max label in the window (i.e. greedily label change points)
#         # i.e. if any of the utterances in the window are change points, then the window is a change point
#         label = utterances['labels'].max()
#         # print(f'label is {label}')
#         # if nan, then set to 0
#         if np.isnan(label):
#             label = 0
#         label = int(label)

#         # add the impact scalar if needed
#         if self.impact_scalar:
#             # get the min impact scalar in the window among the impact scalars that are not 0
#             # if impact scalar is not set it will NaN. Min will ignore NaNs if there is a non-NaN value
#             impact_scalar = utterances['impact_scalar'].min()
#             if np.isnan(impact_scalar):
#                 impact_scalar = 0.0
#             return {
#                 'input_ids': tokens,
#                 'label': label,
#                 'impact_scalar': impact_scalar
#             }
#         remainder_end = time.perf_counter()
#         remainder_duration = remainder_end - remainder_start
#         if self.profile:
#             time_dict = {'get_data_duration': get_data_duration, 'get_slice_duration': get_slice_duration, 'convert_duration': convert_duration, 'get_tokens_duration': get_tokens_duration, 'remainder_duration': remainder_duration}
#             return {'input_ids': tokens, 'label': label}, time_dict
#         return {'input_ids': tokens, 'label': label}

In [9]:
# train_dataset = ChangePointDataset(train_df, tokenizer, window_size=10, social_orientation=False, impact_scalar=False, profile=True)

In [10]:
# start = time.perf_counter()
# time_dict = {}
# for i in range(10000):
#     item, times = train_dataset[i]
#     for k, v in times.items():
#         if k not in time_dict:
#             time_dict[k] = 0
#         time_dict[k] += v
# end = time.perf_counter()
# print(f'elapsed time is {end - start}')
# print(f'time_dict is {time_dict}')

In [11]:
# train_dataset[395][0]['label']

In [12]:
class Predictor(object):
    """Restores a saved training checkpoint and prepares a model for inference.

    Typical usage::

        predictor = Predictor(model_dir)
        predictor.load_config()       # sets self.checkpoint, self.args, ...
        predictor.load_model(model)   # loads weights, moves to device, eval()

    ``load_config`` must be called before ``load_model`` because the latter
    reads ``self.checkpoint`` and ``self.args``.
    """

    def __init__(self, model_dir):
        """Remember the directory that holds the checkpoints.

        Args:
            model_dir: directory containing ``checkpoint-<step>`` sub-directories
                and, optionally, ``best_checkpoint.txt``.
        """
        self.model_dir = model_dir

    def _get_latest_checkpoint(self):
        """Return the name of the ``checkpoint-<step>`` directory with the
        highest step inside ``self.model_dir``, or ``None`` if there is none.
        """
        # Use self.model_dir, not self.args.model_dir: self.args is only set at
        # the end of load_config, which calls this method first when
        # checkpoint == 'last'.
        candidates = []
        for name in os.listdir(self.model_dir):
            if 'checkpoint' not in name:
                continue
            if not os.path.isdir(os.path.join(self.model_dir, name)):
                continue
            parts = name.split('-')
            # skip directories that do not follow the checkpoint-<step> pattern
            # (e.g. 'checkpoint_backup'); int() on them would raise
            if len(parts) < 2 or not parts[1].isdecimal():
                continue
            candidates.append(name)
        if not candidates:
            return None
        return sorted(candidates, key=lambda x: int(x.split('-')[1]))[-1]

    def load_config(self, checkpoint=None):
        """Resolve the checkpoint directory and load its trainer state.

        Args:
            checkpoint: which checkpoint to use.
                * ``None``: the path is read from
                  ``<model_dir>/best_checkpoint.txt`` and used as-is (after
                  stripping surrounding whitespace).
                * ``'last'``: the ``checkpoint-<step>`` directory with the
                  highest step inside ``model_dir``.
                * anything else: a directory name relative to ``model_dir``.

        Sets ``self.checkpoint``, ``self.global_step``, ``self.epoch``,
        ``self.metrics``, ``self.wandb_run_id`` and ``self.args`` (an
        ``argparse.Namespace`` built from the saved ``args`` dict).

        Raises:
            FileNotFoundError: if ``checkpoint == 'last'`` and ``model_dir``
                contains no ``checkpoint-<step>`` directory, or if a required
                file is missing.
        """
        if checkpoint is None:
            with open(os.path.join(self.model_dir, 'best_checkpoint.txt'),
                      'r') as f:
                # a trailing newline would otherwise become part of the path
                checkpoint = f.read().strip()
        elif checkpoint == 'last':
            latest = self._get_latest_checkpoint()
            if latest is None:
                raise FileNotFoundError(
                    f'No checkpoint-<step> directory found in {self.model_dir}')
            checkpoint = os.path.join(self.model_dir, latest)
        else:
            checkpoint = os.path.join(self.model_dir, checkpoint)

        self.checkpoint = checkpoint

        # load trainer state
        with open(os.path.join(checkpoint, 'trainer_state.json'), 'r') as f:
            trainer_state = json.load(f)
        self.global_step = trainer_state['global_step']
        self.epoch = trainer_state['epoch']
        self.metrics = trainer_state['metrics']
        self.wandb_run_id = trainer_state['wandb_run_id']
        self.args = argparse.Namespace(**trainer_state['args'])

    def load_model(self, model):
        """Load ``<checkpoint>/model.pt`` into ``model`` and switch it to eval mode.

        Args:
            model: an instantiated model whose ``state_dict`` layout matches the
                saved weights. It is stored as ``self.model``; in distributed
                mode ``self.model`` is replaced by a DDP wrapper around it.

        Reads ``self.args.distributed``, ``self.args.device`` and, in
        distributed mode, ``self.args.local_rank``. Optimizer and LR-scheduler
        state are not restored here.
        """
        self.model = model
        # https://discuss.pytorch.org/t/checkpoint-in-multi-gpu/97852/11
        # TODO: will need to adjust args to support this properly
        if self.args.distributed:
            # deserialize straight onto this rank's GPU
            map_location = f'cuda:{self.args.local_rank}'
        else:
            # deserialize on CPU so loading does not require the device the
            # checkpoint was saved from; the model is moved to args.device below
            map_location = 'cpu'

        state_dict = torch.load(os.path.join(self.checkpoint, 'model.pt'),
                                map_location=map_location)
        self.model.load_state_dict(state_dict)
        self.model.to(self.args.device)

        if self.args.distributed:
            logging.info(
                f'Model device {self.model.device} on rank {self.args.local_rank}'
            )
            self.model = DDP(
                self.model,
                device_ids=[self.args.device],
                output_device=self.args.device,
            )

        # put the model in eval mode
        self.model.eval()
        logging.info(f'Loaded model on {self.args.device}...')

In [13]:
# Point the predictor at the 'change-point-class-reweight' model directory.
# checkpoint=None makes load_config read the checkpoint path from
# <model_dir>/best_checkpoint.txt and populate predictor.args from that
# checkpoint's trainer_state.json.
predictor = Predictor(model_dir='/mnt/swordfish-pool2/ccu/models/change-point-class-reweight')
predictor.load_config(checkpoint=None)

In [14]:
# Instantiate the architecture from the base 'xlm-roberta-base' weights, using
# the label maps stored in the checkpoint's saved args. The recorded output
# reports the classifier weights as newly initialized at this point; the saved
# weights from model.pt are loaded afterwards by predictor.load_model(model).
model = XLMRClassificationPlusTripletLoss.from_pretrained(
            'xlm-roberta-base',
            num_labels=len(predictor.args.id2label),
            id2label=predictor.args.id2label,
            label2id=predictor.args.label2id)
# this will enable things like triplet loss, impact scalar, social orientation, and class weighting
model.add_args(predictor.args)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRClassificationPlusTripletLoss: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRClassificationPlusTripletLoss from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRClassificationPlusTripletLoss from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRClassificationPlusTripletLoss were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weig

In [15]:
# Override the device stored in the checkpoint's saved args; load_model moves
# the model to predictor.args.device and the prediction loop moves batches there.
# NOTE(review): hard-coded GPU index - adjust for the machine this runs on.
predictor.args.device = 'cuda:3'

In [16]:
# Load <checkpoint>/model.pt into `model`, move it to predictor.args.device and
# switch it to eval mode; the result is available as predictor.model.
predictor.load_model(model)

In [17]:
# Get the train/val/test dataloaders for the configuration saved with the checkpoint.
# get_data also returns an `args` object; it is not referenced again in the
# visible cells (predictor.args is used instead).
train_loader, val_loader, test_loader, args = get_data(predictor.args)

  df = pd.read_csv(os.path.join(


In [18]:
# Pull a single batch from the validation loader (presumably a smoke test /
# for inspection). `batch` is rebound by the prediction loop below.
batch = next(iter(val_loader))

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# make model predictions
from tqdm import tqdm

# Accumulators: one entry per validation example, in loader order.
predictions = []  # argmax class id
llr = []          # logit(class 1) - logit(class 0), i.e. the log-odds of class 1
labels = []       # gold labels as plain Python ints

with torch.no_grad():
    for batch in tqdm(val_loader):
        # .tolist() stores Python ints; extending with the tensor itself would
        # append 0-d tensors, unlike `predictions` and `llr`
        labels.extend(batch['labels'].tolist())
        # move data to device
        batch = {k: v.to(predictor.args.device) for k, v in batch.items()}

        outputs = predictor.model(**batch)
        # element 1 is taken as the logits - presumably outputs is (loss, logits)
        # because labels are passed; TODO confirm against the model's forward()
        logits = outputs[1]
        llr.extend((logits[:, 1] - logits[:, 0]).tolist())
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.tolist())

  0%|          | 0/484 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 484/484 [02:51<00:00,  2.82it/s]


In [20]:
# One row per validation example, in the order the loader produced them:
# 'prediction' is the argmax class id, 'llr' the logit difference (class 1 - class 0).
preds_df = pd.DataFrame({'prediction': predictions, 'llr': llr})

In [21]:
# prep predictions, join back to val_df
# The join is purely positional (both frames get a 0..n-1 index), so the row
# counts must match; concat(axis=1) would otherwise silently pad with NaN.
# NOTE(review): this also assumes val_loader yields examples in val_df row
# order - confirm against get_data.
assert len(preds_df) == len(val_df), (
    f'got {len(preds_df)} predictions for {len(val_df)} validation rows')
# NOTE: this rebinds val_df, so run the cell only once per kernel session
# (a second reset_index would try to insert an 'index' column that already exists).
val_df = pd.concat((val_df.reset_index(), preds_df), axis=1)

In [22]:
# Predicted change points = utterances whose predicted class is 1.
# Open question from the original notes: use the utterance midpoint, start or
# end as the timestamp? The cell that builds `hyps` uses the 'start' column.
change_point_df = val_df.loc[val_df['prediction'].eq(1)]

In [23]:
# Inspect the available columns after the predictions were joined in
# ('prediction' and 'llr' are the last two).
val_df.columns

Index(['index', 'file_id', 'split', 'anno_start', 'anno_end', 'url',
       'status_in_corpora', 'data_type', 'release', 'processed', 'start',
       'end', 'text', 'avg_logprob', 'no_speech_prob', 'audio_files',
       'video_frames', 'timestamp', 'impact_scalar', 'comment', 'annotator',
       'labels', 'input_ids', 'social_orientation_preds', 'social_orientation',
       'prediction', 'llr'],
      dtype='object')

In [24]:
# The scoring code expects predictions as a list of records shaped like:
#       [
#           {'file_id': 'M010015BY', 'timestamp': 1160.2, 'type': 'audio', 'llr': 1.5},
#           {'file_id': 'M010015BY', 'timestamp': 1287.67, 'type': 'audio', 'llr': 1.5},
#           {'file_id': 'M010029SP', 'timestamp': 288, 'type': 'text', 'llr': 1.5},
#           {'file_id': 'M010005QD', 'timestamp': 90.2, 'llr': 1.5, 'type': 'video'},
#           {'file_id': 'M010019QD', 'timestamp': 190, 'llr': 1.5, 'type': 'text'}
#       ]
# Keep only the needed columns and rename them to the expected keys: the
# utterance 'start' becomes 'timestamp' and 'data_type' becomes 'type'.
change_point_df = (
    change_point_df
    .loc[:, ['file_id', 'start', 'data_type', 'llr']]
    .rename(columns={'start': 'timestamp', 'data_type': 'type'})
)
hyps = change_point_df.to_dict(orient='records')

In [25]:
# Reference (gold) change points, as a list of records shaped like:
#       [
#           {'file_id': 'M010015BY', 'timestamp': 1160.2, 'type': 'audio', 'impact_scalar': 4},
#           {'file_id': 'M010015BY', 'timestamp': 1287.6, 'type': 'audio', 'impact_scalar': 2},
#           {'file_id': 'M010029SP', 'timestamp': 288.0, 'type': 'text', 'impact_scalar': 1},
#           {'file_id': 'M010005QD', 'timestamp': 90.2, 'type': 'video', 'impact_scalar': 5},
#           {'file_id': 'M010019QD', 'timestamp': 90, 'type': 'text', 'impact_scalar': 5}
#       ]

# Load the reference data directly so that every annotated change point is included.
# NOTE(review): the meaning of the two positional flags is defined by load_ldc_data.
data = load_ldc_data(False, True)
# keep only the files that belong to the validation split
val_data = {file_id: record for file_id, record in data.items() if record['split'] == 'val'}
# flatten to one record per annotated change point
refs = [
    {
        'file_id': file_id,
        'timestamp': changepoint['timestamp'],
        'type': record['data_type'],
        'impact_scalar': changepoint['impact_scalar'],
    }
    for file_id, record in val_data.items()
    for changepoint in record['changepoints']
]

In [27]:
# Score the hypotheses under each filtering option
# ('none', 'highest', 'lowest', 'most_similar') and time the whole sweep.
start = time.perf_counter()
results, mean_scores = {}, []
for filtering in ('none', 'highest', 'lowest', 'most_similar'):
    print(f'filtering with {filtering}')
    scores = calculate_average_precision(
        refs,
        hyps,
        text_char_threshold=100,
        time_sec_threshold=10,
        filtering=filtering,
    )
    results[filtering] = scores
    # summarize this option's scores with a single number: their harmonic mean
    mean_scores.append((filtering, statistics.harmonic_mean(scores.values())))
end = time.perf_counter()
print(f'elapsed time is {end - start}')

elapsed time is 193.59772488474846


In [32]:
# Score every filtering procedure by the harmonic mean of the values in its
# results entry (presumably one average precision per modality), yielding
# (filtering, score) pairs from which the best procedure is picked.
mean_scores = [
    (filtering_approach, statistics.harmonic_mean(results[filtering_approach].values()))
    for filtering_approach in results
]

In [36]:
# Get the best filtering approach: mean_scores holds (filtering, score) tuples,
# so take the tuple with the largest score (element 1) and keep its name
# (element 0). On ties max() returns the first such tuple.
filtering = max(mean_scores, key=lambda x: x[1])[0]

In [37]:
# Filter the predictions with the best-scoring procedure selected above, using
# the same thresholds (100 chars / 10 s) that were passed to
# calculate_average_precision. The result is written to CSV with the columns
# file_id, timestamp, llr.
final_preds = filter_system_preds(hyps, text_char_threshold=100, time_sec_threshold=10, filtering=filtering)

In [40]:
# Write the filtered predictions to <predictions root>/<model name>/val.csv.
PRED_COLUMNS = ['file_id', 'timestamp', 'llr']
final_preds_df = pd.DataFrame.from_dict(final_preds)
if final_preds_df.empty:
    # Nothing survived filtering: the frame then has no columns, so the
    # selection below would raise KeyError. Write a header-only CSV instead.
    final_preds_df = pd.DataFrame(columns=PRED_COLUMNS)
final_preds_df = final_preds_df[PRED_COLUMNS]

# save to predictions folder with the same name as the model_dir
# normpath drops a trailing slash; without it basename() would return '' and
# val.csv would land in the predictions root.
model_name = os.path.basename(os.path.normpath(predictor.model_dir))
preds_dir = f'/mnt/swordfish-pool2/ccu/predictions/{model_name}'
os.makedirs(preds_dir, exist_ok=True)
final_preds_df.to_csv(os.path.join(preds_dir, 'val.csv'), index=False)

In [10]:
import numpy as np
from sklearn.metrics import classification_report

In [11]:
# Show the class-id -> name mapping stored with the checkpoint, to confirm the
# label order used in the classification report (0 = 'No Change Point', 1 = 'Change Point').
predictor.args.id2label

{1: 'Change Point', 0: 'No Change Point'}

In [12]:
# Per-class precision / recall / F1 of the window-level predictions on the
# validation set; zero_division=0 reports 0 instead of warning when a class has
# no predicted samples.
# NOTE: the recorded output shows 0.00 precision and recall for 'Change Point'.
print(classification_report(labels, predictions, target_names=['No Change Point', 'Change Point'], labels=[0, 1], zero_division=0))

                 precision    recall  f1-score   support

No Change Point       0.91      1.00      0.95     56499
   Change Point       0.00      0.00      0.00      5433

       accuracy                           0.91     61932
      macro avg       0.46      0.50      0.48     61932
   weighted avg       0.83      0.91      0.87     61932

