In [4]:
import json
from pathlib import Path
import os
import numpy as np

In [13]:
data_path = Path('../src/dataset/all_training_data/train/')

In [14]:
def get_file_list(path, exts):
    file_names = []
    for maindir, subdir, file_name_list in os.walk(path):
        for filename in file_name_list:
            file_path = os.path.join(maindir, filename)
            ext = os.path.splitext(file_path)[1]
            if ext not in exts:
                file_names.append(file_path)
    return file_names

In [15]:
def load_json(path):
    with open(path, "r") as file:
        return json.load(file)

#### Training statistics 

In [16]:
train_jsons = get_file_list(data_path, ['.txt'])
print(len(train_jsons))

78


In [17]:
nbr_utterances = []
text_length = []

for json_path in train_jsons:
    json_dict = load_json(json_path)
    nbr_utterances.append(len(json_dict))
    text_length.extend([len(utt['text'].split(' ')) for utt in json_dict])

print('Number of utterances', nbr_utterances)
print('Mean utterances by dialogue', np.mean(nbr_utterances))
print('Mean text length', np.mean(text_length))

Number of utterances [738, 1283, 336, 557, 415, 755, 1364, 901, 1215, 593, 985, 249, 126, 648, 649, 1143, 934, 915, 358, 856, 1207, 1082, 542, 563, 960, 524, 196, 450, 357, 1487, 672, 718, 396, 647, 973, 689, 585, 1275, 860, 839, 357, 379, 838, 782, 897, 1095, 806, 717, 924, 754, 339, 1318, 669, 722, 212, 672, 765, 497, 1207, 667, 641, 772, 1377, 1057, 572, 471, 791, 486, 614, 815, 229, 515, 886, 345, 1047, 1148, 422, 547]
Mean utterances by dialogue 735.8205128205128
Mean text length 7.855559814614768


###  Train / val split  0.8 vs 0.2 

In [19]:
rng = np.random.default_rng(12345)

val_indices = rng.choice(len(train_jsons), 
                         size=int(0.2 * len(train_jsons)),
                         replace=False, shuffle=False)
print(val_indices)
print(len(val_indices))

train_indices = [idx for idx in range(len(train_jsons)) \
                    if idx not in val_indices]
print(train_indices)
print(len(train_indices))

[55 18 63 25 16 66 54 58 85 34 74 29 51 91 19 17 21 64 59]
19
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 22, 23, 24, 26, 27, 28, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 56, 57, 60, 61, 62, 65, 67, 68, 69, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96]
78


In [20]:
val_nbr_utterances = []
val_text_length = []

for idx in val_indices:
    json_dict = load_json(train_jsons[idx])
    val_nbr_utterances.append(len(json_dict))
    val_text_length.extend([len(utt['text']) for utt in json_dict])
    
print('Mean nbr of utts', np.mean(val_nbr_utterances))
print('Mean text length', np.mean(val_text_length))

Mean nbr of utts 801.5263157894736
Mean text length 34.34565631361219


In [33]:
val_files = [train_jsons[idx].split('/')[-1].split('.')[0] 
             for idx in val_indices]
train_files = [train_jsons[idx].split('/')[-1].split('.')[0]
               for idx in train_indices]

In [43]:
path = Path('../data/train/')
path / 'cust'

PosixPath('../data/train/cust')

In [37]:
bin_labels_dict = load_json('../dataset/training_labels.json')

val_mean = []
train_mean = []

for k, v in bin_labels_dict.items():
    if k in val_files:
        val_mean.append(np.mean(v))
    else:
        train_mean.append(np.mean(v))
        
print('Val mean', np.mean(val_mean))
print('Train mean', np.mean(train_mean))

Val mean 0.20060656738816532
Train mean 0.19130426494393424


In [73]:
'I WlaKed in CAAR'.lower()

'i wlaked in caar'

In [3]:
!pip list

Package               Version
--------------------- ----------
certifi               2023.11.17
charset-normalizer    3.3.2
click                 8.1.7
filelock              3.13.1
fsspec                2023.12.0
huggingface-hub       0.19.4
idna                  3.6
Jinja2                3.1.2
joblib                1.3.2
jsonargparse          4.27.1
MarkupSafe            2.1.3
mpmath                1.3.0
networkx              3.2.1
nltk                  3.8.1
numpy                 1.26.2
packaging             23.2
Pillow                10.1.0
pip                   22.0.4
PyYAML                6.0.1
regex                 2023.10.3
requests              2.31.0
safetensors           0.4.1
scikit-learn          1.3.2
scipy                 1.11.4
sentence-transformers 2.2.2
sentencepiece         0.1.99
setuptools            58.1.0
sympy                 1.12
threadpoolctl         3.2.0
tokenizers            0.15.0
torch                 2.1.1
torchsummary    

In [8]:
!which python

/Users/petrshulzhenko/3-year-ecole/ml-intro/inf554-extractive-summarization-2023/.venv/bin/python


In [6]:
!python -m pip install transformers

You should consider upgrading via the '/Users/petrshulzhenko/3-year-ecole/ml-intro/inf554-extractive-summarization-2023/.venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [3]:
from transformers import AutoTokenizer, BertModel
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])