In [None]:
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from tokenizers import Tokenizer
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# Put the loaders directory on the import path so that the CRD3Dataset module
# imported on the last line of this cell can be found.
dataset_path = '../src/loaders'
sys.path.append(dataset_path)
from CRD3Dataset import CRD3Dataset

In [None]:
# Build the dataset object from its YAML config file. Later cells read the
# tokenizer and the chunk iterators from `dataset`.
cfg_path = '../src/loaders/CRD3Dataset_all.yaml'
dataset = CRD3Dataset(cfg_path)

In [None]:
# Single pass over the dataset chunks, collecting:
#   - per-chunk token counts for the joined utterances and for the summary,
#   - per-chunk turn counts,
#   - per-file totals (chunks, turns, utterance tokens),
#   - the text of the shortest / longest document and summary seen.
parsing_tokenizer = dataset.tokenizer
n_chunks = 0
min_utt = ''
max_utt = ''
utt_lens = []
min_summary = ''
max_summary = ''
summ_lens = []
turns_per_sum = []
current_fn = ''
chunks_per_file = []
turns_per_file = []
tokens_per_file = []

# Running extremes are kept as scalars so every chunk costs O(1) instead of
# rescanning the full length lists with min()/max().
min_utt_len = max_utt_len = None
min_summ_len = max_summ_len = None

for fn, _, utts, summ in dataset.iter_chunk_w_filename():
    # Chunks with an empty summary are left out of every statistic.
    if len(summ) < 1:
        continue

    # Open a new per-file bucket whenever the filename changes, and always for
    # the very first kept chunk (even if its filename equals the '' placeholder).
    # NOTE(review): assumes the iterator yields each file's chunks contiguously.
    if not chunks_per_file or fn != current_fn:
        current_fn = fn
        chunks_per_file.append(0)
        turns_per_file.append(0)
        tokens_per_file.append(0)

    n_chunks += 1
    chunks_per_file[-1] += 1
    turns_per_file[-1] += len(utts)
    turns_per_sum.append(len(utts))

    utt_text = ' '.join(utts)
    n_utt_tokens = len(parsing_tokenizer.encode(utt_text).ids)
    # Two independent checks: the first chunk is both the shortest and the
    # longest seen so far, so an if/elif would leave the maximum unset.
    if min_utt_len is None or n_utt_tokens < min_utt_len:
        min_utt, min_utt_len = utt_text, n_utt_tokens
    if max_utt_len is None or n_utt_tokens > max_utt_len:
        max_utt, max_utt_len = utt_text, n_utt_tokens
    utt_lens.append(n_utt_tokens)
    tokens_per_file[-1] += n_utt_tokens

    n_summ_tokens = len(parsing_tokenizer.encode(summ).ids)
    if min_summ_len is None or n_summ_tokens < min_summ_len:
        min_summary, min_summ_len = summ, n_summ_tokens
    if max_summ_len is None or n_summ_tokens > max_summ_len:
        max_summary, max_summ_len = summ, n_summ_tokens
    summ_lens.append(n_summ_tokens)

print('Chunks:', n_chunks)
if n_chunks == 0:
    # Nothing was kept, so there are no extremes to report.
    print('No chunks with a non-empty summary were found.')
else:
    print('Min utterances token length:', min_utt_len)
    print('Min utterances:')
    print(min_utt)
    print('Max utterances token length:', max_utt_len)
    print('Max utterances clip:')
    print(max_utt[:200] + '...')
    print('Min summary token length:', min_summ_len)
    print('Min summary:')
    print(min_summary)
    print('Max summary token length:', max_summ_len)
    print('Max summary clip:')
    print(max_summary[:200] + '...')

In [None]:
# Token-count distributions of documents (joined utterances) and summaries,
# drawn on shared log-spaced bins.
fig, ax = plt.subplots(figsize=(10, 6))
# The bin range covers BOTH series; values outside the bin range are not
# counted by hist. The floor of 1 keeps log10 finite if a length is 0.
lo = max(1, min(min(utt_lens), min(summ_lens)))
hi = max(max(utt_lens), max(summ_lens))
bins = np.logspace(np.log10(lo), np.log10(hi))
ax.hist(utt_lens, label='Utt', color='dodgerblue', bins=bins)
ax.hist(summ_lens, label='Summary', color='darkorange', bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Tokens')
ax.set_ylabel('Chunks')
ax.legend()
ax.set_title('Summary and document tokens')
plt.show()

# Compression ratio: document tokens divided by summary tokens, per chunk.
fig, ax = plt.subplots(figsize=(10, 6))
# Pairs with a zero on either side are skipped: a zero summary length would
# divide by zero and a zero ratio has no position on a log axis.
ratios = [float(u) / float(s) for u, s in zip(utt_lens, summ_lens) if s != 0 and u != 0]
bins = np.logspace(np.log10(min(ratios)), np.log10(max(ratios)))
ax.hist(ratios, color='forestgreen', bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Document tokens / summary tokens')
ax.set_ylabel('Chunks')
ax.set_title('Document / Summary compression ratio')
plt.show()

In [None]:
# 94th percentile of the per-chunk token counts: (documents, summaries).
np.percentile(utt_lens, 94), np.percentile(summ_lens, 94)

In [None]:
# Number of recorded document lengths and summary lengths.
len(utt_lens), len(summ_lens)

In [None]:
# Same two plots as for the full data, restricted to the 10% of chunks with
# the longest documents.
order = np.argsort(utt_lens)
# Keep at least one chunk: with fewer than 10 chunks the tenth rounds down to
# 0, and a `[-0:]` slice would select every chunk instead of the top ones.
n_upper = max(1, len(order) // 10)
upper_10_idx = order[len(order) - n_upper:]
upper_10_utt_lens = np.array(utt_lens)[upper_10_idx]
upper_10_summ_lens = np.array(summ_lens)[upper_10_idx]

fig, ax = plt.subplots(figsize=(10, 6))
# Shared log-spaced bins spanning both series; the floor of 1 keeps log10
# finite if a length is 0.
lo = max(1, min(upper_10_utt_lens.min(), upper_10_summ_lens.min()))
hi = max(upper_10_utt_lens.max(), upper_10_summ_lens.max())
bins = np.logspace(np.log10(lo), np.log10(hi))
ax.hist(upper_10_utt_lens, label='Utt', color='dodgerblue', bins=bins)
ax.hist(upper_10_summ_lens, label='Summary', color='darkorange', bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Tokens')
ax.set_ylabel('Chunks')
ax.legend()
ax.set_title('Upper 10% document length summary and document tokens')
plt.show()

# Document / summary token ratio for the same subset; zero-length summaries
# are skipped to avoid dividing by zero.
fig, ax = plt.subplots(figsize=(10, 6))
ratios = [float(u) / float(s) for u, s in zip(upper_10_utt_lens, upper_10_summ_lens) if s != 0]
bins = np.logspace(np.log10(min(ratios)), np.log10(max(ratios)))
ax.hist(ratios, color='forestgreen', bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Document tokens / summary tokens')
ax.set_ylabel('Chunks')
ax.set_title('Upper 10% document length compression ratio')
plt.show()

In [None]:
# Same two plots as for the full data, restricted to the 90% of chunks with
# the shortest documents.
order = np.argsort(utt_lens)
# Explicit split point: with fewer than 10 chunks the tenth rounds down to 0,
# and the previous `[:-0]` slice would have selected nothing at all.
n_upper = len(order) // 10
lower_90_idx = order[:len(order) - n_upper]
lower_90_utt_lens = np.array(utt_lens)[lower_90_idx]
lower_90_summ_lens = np.array(summ_lens)[lower_90_idx]

fig, ax = plt.subplots(figsize=(10, 6))
# Shared log-spaced bins spanning both series; the floor of 1 keeps log10
# finite if a length is 0.
lo = max(1, min(lower_90_utt_lens.min(), lower_90_summ_lens.min()))
hi = max(lower_90_utt_lens.max(), lower_90_summ_lens.max())
bins = np.logspace(np.log10(lo), np.log10(hi))
ax.hist(lower_90_utt_lens, label='Utt', color='dodgerblue', bins=bins)
ax.hist(lower_90_summ_lens, label='Summary', color='darkorange', bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Tokens')
ax.set_ylabel('Chunks')
ax.legend()
ax.set_title('Lower 90% document length summary and document tokens')
plt.show()

# Document / summary token ratio for the same subset; zero-length summaries
# are skipped to avoid dividing by zero.
fig, ax = plt.subplots(figsize=(10, 6))
ratios = [float(u) / float(s) for u, s in zip(lower_90_utt_lens, lower_90_summ_lens) if s != 0]
bins = np.logspace(np.log10(min(ratios)), np.log10(max(ratios)))
ax.hist(ratios, color='forestgreen', bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Document tokens / summary tokens')
ax.set_ylabel('Chunks')
ax.set_title('Lower 90% document length compression ratio')
plt.show()

In [None]:
# Distribution of the number of turns covered by each summary (log-spaced bins).
fig, ax = plt.subplots(figsize=(10, 6))
bins = np.logspace(np.log10(min(turns_per_sum)), np.log10(max(turns_per_sum)))
ax.hist(turns_per_sum, color='purple', bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Turns')
ax.set_ylabel('Chunks')
# No legend: there is a single unlabelled series, so a legend would be empty.
ax.set_title('Turns per summary')
plt.show()

In [None]:
# (median, mean) of the per-chunk turn counts.
np.median(turns_per_sum), np.mean(turns_per_sum)

In [None]:
# Distribution of the number of chunks per file (linear bins, 50 edges).
fig, ax = plt.subplots(figsize=(10, 6))
bins = np.linspace(min(chunks_per_file), max(chunks_per_file), 50)
ax.hist(chunks_per_file, color='red', bins=bins)
ax.set_xlabel('Chunks')
ax.set_ylabel('Files')
# No legend: there is a single unlabelled series, so a legend would be empty.
ax.set_title('Chunks per file')
plt.show()

In [None]:
# (median, mean) of the per-file chunk counts.
np.median(chunks_per_file), np.mean(chunks_per_file)

In [None]:
# Distribution of the number of turns per file (linear bins, 50 edges).
fig, ax = plt.subplots(figsize=(10, 6))
bins = np.linspace(min(turns_per_file), max(turns_per_file), 50)
ax.hist(turns_per_file, color='gold', bins=bins)
ax.set_xlabel('Turns')
ax.set_ylabel('Files')
# No legend: there is a single unlabelled series, so a legend would be empty.
ax.set_title('Turns per file')
plt.show()

In [None]:
# (median, mean) of the per-file turn counts.
np.median(turns_per_file), np.mean(turns_per_file)

In [None]:
# Distribution of the number of utterance tokens per file (linear bins, 50 edges).
fig, ax = plt.subplots(figsize=(10, 6))
bins = np.linspace(min(tokens_per_file), max(tokens_per_file), 50)
ax.hist(tokens_per_file, color='aqua', bins=bins)
ax.set_xlabel('Tokens')
ax.set_ylabel('Files')
# No legend: there is a single unlabelled series, so a legend would be empty.
ax.set_title('Tokens per file')
plt.show()

In [None]:
# (median, mean) of the per-file utterance token totals.
np.median(tokens_per_file), np.mean(tokens_per_file)

In [None]:
# Estimates for non-annotated data (i.e. C3): how many turns to take per
# summary line. Each printed pair is (turns per summary, tokens per file
# scaled by turns-per-summary over turns-per-file), computed once with the
# median and once with the mean.
for stat in (np.median, np.mean):
    turns_stat = stat(turns_per_sum)
    tokens_estimate = stat(tokens_per_file) * turns_stat / stat(turns_per_file)
    print((turns_stat, tokens_estimate))
# Displayed for comparison: (median, mean) tokens per chunk measured directly.
np.median(utt_lens), np.mean(utt_lens)

In [None]:
def get_tokens():
    """Collect the distinct token strings found in all the text sources.

    Three sources are read: the chunks yielded by ``dataset.iter_chunk()``,
    the Campaign 3 episode 1 transcript (``../data/C3E001.txt``, one
    ``speaker: utterance`` line per turn) and the episode blurbs
    (``../data/CR_blurbs.tsv``, column ``summary``). All text is lower-cased
    before it is tokenized. Tokens from speaker names are collected separately
    from tokens of utterances, summaries and blurbs.

    Returns:
        tuple[list, list]: ``(speaker_tokens, tokens)`` -- the distinct token
        texts of each group, in first-seen order.

    Raises:
        ValueError: if a transcript line has no ``':'`` separating the
            speaker from the utterance.
    """
    import re  # local import: only the speaker clean-up below needs it

    tokenizer = dataset.get_tokenizer()
    # Occurrence counts are accumulated, but only the keys are returned.
    tokens = defaultdict(int)
    speaker_tokens = defaultdict(int)

    # CRD3 Data
    # NOTE(review): `n_chunks` is set by the statistics cell and counts only
    # chunks with a non-empty summary; if iter_chunk() yields more than that,
    # the progress-bar total is only an estimate.
    for speaker_strings, utt_strings, summary_string in tqdm(dataset.iter_chunk(), total=n_chunks):
        for token in tokenizer(summary_string.lower()):
            tokens[token.text] += 1
        for speaker in speaker_strings:
            for token in tokenizer(speaker.lower()):
                speaker_tokens[token.text] += 1
        for utt in utt_strings:
            for token in tokenizer(utt.lower()):
                tokens[token.text] += 1

    # Campaign 3 ep. 1
    with open('../data/C3E001.txt', 'r') as f:
        for line_no, line in enumerate(f, start=1):
            # Split at the first ':' -- speaker on the left, utterance on the right.
            speaker_part, sep, utt_part = line.partition(':')
            if not sep:
                raise ValueError(
                    "../data/C3E001.txt line {}: no ':' between speaker and utterance: {!r}".format(line_no, line)
                )
            # Remove the joining word 'and' from multi-speaker lines. The word
            # boundaries keep names that merely contain "and" intact.
            speaker_part = re.sub(r'\band\b', '', speaker_part.lower())

            for token in tokenizer(speaker_part):
                speaker_tokens[token.text] += 1
            for token in tokenizer(utt_part.lower()):
                tokens[token.text] += 1

    # Episode blurbs from the fandom
    blurb_df = pd.read_csv('../data/CR_blurbs.tsv', sep='\t')
    # dropna: a missing summary is read as NaN (a float), which has no .lower().
    for blurb in blurb_df['summary'].dropna():
        for token in tokenizer(blurb.lower()):
            tokens[token.text] += 1

    return list(speaker_tokens.keys()), list(tokens.keys())

In [None]:
# Run the extraction: distinct speaker tokens and distinct text tokens.
spkr_strings, strings = get_tokens()

In [None]:
# Number of distinct speaker tokens.
len(spkr_strings)

In [None]:
# Display every speaker token for manual inspection.
spkr_strings

In [None]:
# Number of distinct text tokens.
len(strings)

In [None]:
# Text tokens that contain a colon.
[s for s in strings if ':' in s]

In [None]:
# Text tokens that contain a square bracket (opening or closing).
[s for s in strings if ']' in s or '[' in s]

In [None]:
# Text tokens that contain a period.
[s for s in strings if '.' in s]

In [None]:
# Text tokens that contain a hyphen.
[s for s in strings if '-' in s]

In [None]:
# Build one vocabulary from the text tokens and one from the speaker tokens.
# NOTE(review): `Vocab` is not imported in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel. The usage (`strings=` argument,
# `.strings[...]`, `.to_disk(...)`) looks like spaCy's `spacy.vocab.Vocab` --
# confirm and add the import to the imports cell.
CRD3_vocab = Vocab(strings=strings)
CRD3_spkr_vocab = Vocab(strings=spkr_strings)

In [None]:
# Sizes of the text vocabulary and the speaker vocabulary.
len(CRD3_vocab), len(CRD3_spkr_vocab)

In [None]:
# Spot check: look up the text vocabulary's string table by an integer key.
# NOTE(review): the key is presumably the hash of a known token -- confirm which one.
CRD3_vocab.strings[11113032409865315573]

In [None]:
# Spot check: look up the speaker vocabulary's string table by the string 'matt'.
CRD3_spkr_vocab.strings['matt']

In [None]:
# Map every token string to the key its vocabulary's string table returns for it.
# The dtype is fixed to uint64 on purpose: the keys are presumably unsigned 64-bit
# hashes (the spot-check key used earlier in this notebook is larger than 2**63),
# and without an explicit dtype NumPy promotes a mix of values below and above
# 2**63 to float64, which silently rounds the keys.
# NOTE(review): confirm that `.strings[s]` always returns an int in [0, 2**64).
hash_idxs = np.array([CRD3_vocab.strings[s] for s in strings], dtype=np.uint64)
spkr_hash_idxs = np.array([CRD3_spkr_vocab.strings[s] for s in spkr_strings], dtype=np.uint64)
hash_idxs.shape, spkr_hash_idxs.shape

In [None]:
# Write both key arrays to .npy files under ../data.
for npy_path, idx_array in (
    ('../data/CRD3_vocab_hash_idxs.npy', hash_idxs),
    ('../data/CRD3_vocab_spkr_hash_idxs.npy', spkr_hash_idxs),
):
    np.save(npy_path, idx_array)

In [None]:
# Serialise both vocabularies to their directories under ../data.
for vocab_obj, vocab_path in (
    (CRD3_vocab, '../data/CRD3_vocab'),
    (CRD3_spkr_vocab, '../data/CRD3_spkr_vocab'),
):
    vocab_obj.to_disk(vocab_path)