<a href="https://colab.research.google.com/github/VincentZuo/fin-models/blob/main/%5BFinal%5D_v5_HF_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's third-party dependencies into the Colab runtime.
# NOTE(review): versions are not pinned (and -U upgrades to the latest release),
# so a re-run at a later date may pick up different package versions.
!pip install -U sentence-transformers
!pip install tensorflow-datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Load tfds wiki dataset

In [None]:
import tensorflow_datasets as tfds

# Loading the wikipedia dataset.
# The name selects the English dump dated 2020-03-01 from the TFDS catalog.
DATASET_NAME = 'wikipedia/20200301.en'

# `with_info=True` makes tfds.load return a (dataset, info) pair; only the train
# split is requested. `data_dir='tmp'` is a relative path, so the download and the
# prepared files are written under ./tmp of the current working directory.
dataset, dataset_info = tfds.load(
    name=DATASET_NAME,
    data_dir='tmp',
    with_info=True,
    split=tfds.Split.TRAIN,
)

2022-05-30 22:48:18 - Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: wikipedia/20200301.en/1.0.0
2022-05-30 22:48:18 - Load dataset info from /tmp/tmpcon0n3eqtfds
2022-05-30 22:48:18 - Field info.config_name from disk and from code do not match. Keeping the one from code.
2022-05-30 22:48:18 - Field info.config_description from disk and from code do not match. Keeping the one from code.
2022-05-30 22:48:18 - Generating dataset wikipedia (tmp/wikipedia/20200301.en/1.0.0)
[1mDownloading and preparing dataset wikipedia/20200301.en/1.0.0 (download: 16.73 GiB, generated: 17.05 GiB, total: 33.77 GiB) to tmp/wikipedia/20200301.en/1.0.0...[0m
2022-05-30 22:48:19 - Dataset wikipedia is hosted on GCS. It will automatically be downloaded to your
local data directory. If you'd instead prefer to read directly from our public
GCS bucket (recommended if you're running on GCP), you can instead pass
`try_gcs=True` to `tfds.load` or set `data_dir=gs://tfds-data/datasets`.



Dl Completed...:   0%|          | 0/258 [00:00<?, ? file/s]


2022-05-30 22:49:36 - Load dataset info from tmp/wikipedia/20200301.en/1.0.0.incompleteEKLQMA
2022-05-30 22:49:36 - Field info.config_name from disk and from code do not match. Keeping the one from code.
2022-05-30 22:49:36 - Field info.config_description from disk and from code do not match. Keeping the one from code.
[1mDataset wikipedia downloaded and prepared to tmp/wikipedia/20200301.en/1.0.0. Subsequent calls will reuse this data.[0m
2022-05-30 22:49:37 - Constructing tf.data.Dataset for split train, from tmp/wikipedia/20200301.en/1.0.0


In [None]:
# Number of articles placed in each split.
NUM_EXAMPLES_PER_SPLIT = 100000

# Train keeps the first NUM_EXAMPLES_PER_SPLIT articles (unchanged from before).
raw_train_dataset = dataset.take(NUM_EXAMPLES_PER_SPLIT)
# Test must not reuse the training articles: skip past the train slice first.
# Previously both names were `dataset.take(100000)`, i.e. the exact same examples.
raw_test_dataset = dataset.skip(NUM_EXAMPLES_PER_SPLIT).take(NUM_EXAMPLES_PER_SPLIT)

In [None]:
import re
import string

# Fixed number of sentences per document when break_text_to_paragraphs is called
# with padding=True: longer documents are truncated, shorter ones are padded.
MAX_SENT_LEN = 128

def RemovePuct(input_sent):
  """Return `input_sent` with every character of `string.punctuation` deleted."""
  # A translation table whose values are None deletes the mapped characters.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return input_sent.translate(deletion_table)

def break_text_to_paragraphs(input_text, padding=False):
    """Split an article into punctuation-free sentences with paragraph-boundary labels.

    Processing steps:
      1. Cut the text at the first "See also", "Bibliography" or "References"
         heading line so that trailing sections are dropped.
      2. Delete bracketed spans such as "(...)" and "[...]". Only spans that sit
         on a single line are removed because "." does not match a newline.
      3. Keep lines whose stripped length exceeds 50 characters as paragraphs and
         split each on ".", "?" or "!"; fragments whose stripped length is 20
         characters or fewer are discarded, the rest go through RemovePuct.

    Args:
      input_text: full article text as a str.
      padding: when True, the result is truncated or right-padded to exactly
        MAX_SENT_LEN entries (padding uses '' sentences with label 0).

    Returns:
      (sentences, labels): two parallel lists. A label of 1 marks the last kept
      sentence of a paragraph, every other sentence is 0. The very last sentence
      of the text is reset to 0 before any truncation/padding is applied.
    """
    # Drop trailing sections. The first marker used to be the corrupted literal
    # "B also\n", which never matched the "See also" heading.
    for section_heading in ("See also\n", "Bibliography\n", "References\n"):
        input_text = input_text.split(section_heading)[0]

    # Raw string: "\(" and "\[" are invalid escapes in a normal string literal.
    input_text = re.sub(r"[\(\[].*?[\)\]]", "", input_text)
    paragraphs = [para.strip() for para in input_text.split("\n") if len(para.strip()) > 50]

    all_sents = []
    all_labels = []
    for para in paragraphs:
        sents = [
            RemovePuct(sent).strip()
            for sent in re.split(r'[.?!]\s*', para)
            if len(sent.strip()) > 20
        ]
        if not sents:
            continue
        all_sents += sents
        # Only the final sentence of the paragraph is a boundary.
        all_labels += [0] * (len(sents) - 1) + [1]

    # The end of the document is not counted as a paragraph boundary.
    if all_labels:
        all_labels[-1] = 0

    if not padding:
        return all_sents, all_labels

    if len(all_sents) > MAX_SENT_LEN:
        all_sents = all_sents[:MAX_SENT_LEN]
        all_labels = all_labels[:MAX_SENT_LEN]
    else:
        missing = MAX_SENT_LEN - len(all_sents)
        all_sents += [''] * missing
        all_labels += [0] * missing
    return all_sents, all_labels

def load_wiki_examples(train_dataset_in, create_df_fn, model, num_flush=5000):
    """Convert dataset examples into one DataFrame, processing them in batches.

    Each example's 'text' is split with break_text_to_paragraphs and buffered
    together with its 'title'. Whenever the buffer holds `num_flush` examples it
    is handed to `create_df_fn` and cleared, so only one batch of raw sentences
    is buffered at a time.

    Args:
      train_dataset_in: iterable of examples; each example is indexed with
        'text' and 'title', and both values must support `.numpy().decode('utf-8')`.
      create_df_fn: callable `(titles, documents, labels, model)` returning a
        pandas DataFrame for one batch.
      model: passed through unchanged to `create_df_fn`.
      num_flush: number of examples per batch.

    Returns:
      The batch DataFrames concatenated in order, or an empty DataFrame when
      `train_dataset_in` yields no examples. Batch indexes are kept as returned
      by `create_df_fn` (no re-indexing is performed).
    """
    documents = []
    titles = []
    labels = []
    dfs = []
    for example_train in train_dataset_in:
        # Flush a full buffer before adding the next example.
        if len(documents) == num_flush:
            dfs.append(create_df_fn(titles, documents, labels, model))
            documents, titles, labels = [], [], []
        text_info, label_info = break_text_to_paragraphs(
            example_train['text'].numpy().decode('utf-8'))
        documents.append(text_info)
        labels.append(label_info)
        titles.append(example_train['title'].numpy().decode('utf-8'))

    # Flush whatever is left over after the last full batch.
    if documents:
        dfs.append(create_df_fn(titles, documents, labels, model))

    # pd.concat raises ValueError on an empty list; an empty dataset should
    # simply produce an empty frame.
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs)


# Multiprocessing embedding lookup

In [None]:
from sentence_transformers import SentenceTransformer, LoggingHandler
import logging
import numpy as np
import pandas as pd
from datetime import datetime

# Route INFO-level log records through sentence_transformers' LoggingHandler and
# prefix every message with a "YYYY-MM-DD HH:MM:SS" timestamp.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])


In [None]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence encoder. This `model` object is
# what gets passed to create_temp_df to compute the sentence embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')


2022-05-30 23:17:27 - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2022-05-30 23:17:32 - Use pytorch device: cuda


In [None]:
def print_date_time():
  """Print the current local time as `date and time = DD/MM/YYYY HH:MM:SS`."""
  timestamp = format(datetime.now(), "%d/%m/%Y %H:%M:%S")
  print("date and time =", timestamp)

def create_temp_df(titles, documents, labels, model):
  """Embed every sentence of a batch of documents and return one row per document.

  All sentences of the batch are flattened into one list, encoded in a single
  multi-process call, and the resulting embedding matrix is sliced back per
  document. The current time is printed on entry and on exit.

  Args:
    titles: sequence of document titles.
    documents: sequence of sentence lists, parallel to `titles`.
    labels: sequence of per-sentence label lists, parallel to `documents`.
    model: encoder exposing `start_multi_process_pool()`,
      `encode_multi_process(sentences, pool)` and `stop_multi_process_pool(pool)`.

  Returns:
    pandas.DataFrame with the columns
      "title": the document title,
      "label": the document's labels as an np.ndarray,
      "range": (start, end) offsets of the document's sentences in the
               flattened batch,
      "emb":   np.ndarray holding the embedding rows start..end.

  Raises:
    ValueError: if `titles`, `documents` and `labels` differ in length.
  """
  if not (len(titles) == len(documents) == len(labels)):
    raise ValueError(
        "titles, documents and labels must have the same length, got "
        "{}, {} and {}".format(len(titles), len(documents), len(labels)))

  print_date_time()
  all_data = []
  local_sentences = []
  for title, document, label in zip(titles, documents, labels):
    sents_so_far = len(local_sentences)
    local_sentences += document
    all_data.append({
        "title": title,
        "label": np.array(label),
        "range": (sents_so_far, sents_so_far + len(document)),
    })

  # Start the multi-process pool on all available CUDA devices.
  pool = model.start_multi_process_pool()
  try:
    # Compute the embeddings using the multi-process pool.
    emb = model.encode_multi_process(local_sentences, pool)
    print("Embeddings computed. Shape:", emb.shape)
  finally:
    # Always stop the worker processes, even when encoding fails; otherwise
    # the pool started above would be left running.
    model.stop_multi_process_pool(pool)

  # Slice the batch-wide embedding matrix back into per-document arrays.
  for row in all_data:
    start, end = row["range"]
    row["emb"] = np.array(emb[start:end])
  print_date_time()
  return pd.DataFrame(all_data)

In [None]:
# Run the full pipeline over the training slice. Articles are embedded in batches
# of 2000 (num_flush) and the per-batch frames are concatenated into final_df.
if __name__ == '__main__':
  final_df = load_wiki_examples(raw_train_dataset, create_temp_df, model, num_flush=2000)

date and time = 30/05/2022 23:18:32
2022-05-30 23:18:32 - Start multi-process pool on devices: cuda:0
2022-05-30 23:18:37 - Chunk data into packages of size 3775
Embeddings computed. Shape: (37749, 384)
date and time = 30/05/2022 23:18:52
date and time = 30/05/2022 23:18:53
2022-05-30 23:18:53 - Start multi-process pool on devices: cuda:0
2022-05-30 23:18:56 - Chunk data into packages of size 4187
Embeddings computed. Shape: (41869, 384)
date and time = 30/05/2022 23:19:11
date and time = 30/05/2022 23:19:12
2022-05-30 23:19:12 - Start multi-process pool on devices: cuda:0
2022-05-30 23:19:16 - Chunk data into packages of size 4219
Embeddings computed. Shape: (42181, 384)
date and time = 30/05/2022 23:19:31
date and time = 30/05/2022 23:19:32
2022-05-30 23:19:32 - Start multi-process pool on devices: cuda:0
2022-05-30 23:19:36 - Chunk data into packages of size 3888
Embeddings computed. Shape: (38877, 384)
date and time = 30/05/2022 23:19:50
date and time = 30/05/2022 23:19:51
2022-05-

In [None]:
# Sanity check: number of non-null entries in each column of the result.
final_df.count()

title    100000
label    100000
range    100000
emb      100000
dtype: int64

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (Colab only) so files can be written to it.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Persist the whole frame (titles, labels, ranges and embeddings) as one pickle on
# the mounted Drive. NOTE(review): pickle files can execute code when loaded, so
# only read this file back from a location you trust.
final_df.to_pickle("/content/gdrive/MyDrive/224U_2022/final_project_data.pkl")

In [None]:
# List the Drive folder to confirm the pickle was written and to check its size.
!ls /content/gdrive/MyDrive/224U_2022 -l

total 3108467
drwx------ 2 root root       4096 Apr 10 03:54 data.tgz
-rw------- 1 root root 3183061353 May 30 23:35 final_project_data.pkl
drwx------ 2 root root       4096 May 22 23:48 wiki_727K


# Test create_temp_df

In [None]:
import random

# Smoke test for create_temp_df on synthetic data.
# Important: keep this behind the `if __name__ == '__main__'` guard. Otherwise,
# CUDA runs into issues when spawning new processes.
if __name__ == '__main__':
    documents, titles, labels = [], [], []
    for did in range(10):
      # Each synthetic document gets between 1 and 100 sentences ...
      num_sentences = random.randrange(1, 101)
      sentences = ["This is sentence {}".format(i) for i in range(num_sentences)]
      # ... and one random 0/1 label per sentence.
      label = [random.randint(0, 1) for _ in range(num_sentences)]
      documents.append(sentences)
      labels.append(label)
      titles.append(str(did))

    temp_df = create_temp_df(titles, documents, labels, model)


date and time = 30/05/2022 22:38:15
2022-05-30 22:38:15 - Start multi-process pool on devices: cuda:0
2022-05-30 22:38:18 - Chunk data into packages of size 60
Embeddings computed. Shape: (599, 384)
0 85
[[ 0.08642844  0.09870384 -0.01381823 ...  0.03847653  0.03008671
  -0.12464089]
 [ 0.03562325  0.06595197  0.06317715 ...  0.03709536  0.00831366
  -0.10744216]
 [ 0.0622941   0.07933775  0.05761217 ...  0.02466241 -0.00415209
  -0.10233451]
 ...
 [ 0.0428837   0.13929991  0.07108785 ... -0.03191378  0.00183992
  -0.12492053]
 [ 0.04431981  0.13803938  0.0678717  ... -0.04386136  0.02712069
  -0.153329  ]
 [ 0.08373247  0.12076643  0.04378037 ... -0.05680858  0.04968407
  -0.16040577]]
85 174
[[ 0.08642844  0.09870384 -0.01381823 ...  0.03847653  0.03008671
  -0.12464089]
 [ 0.03562325  0.06595197  0.06317715 ...  0.03709536  0.00831366
  -0.10744216]
 [ 0.0622941   0.07933775  0.05761217 ...  0.02466241 -0.00415209
  -0.10233451]
 ...
 [ 0.05041501  0.12269734  0.05227924 ... -0.0139

In [None]:
# Display the smoke-test frame (one row per synthetic document).
temp_df

Unnamed: 0,title,label,range,emb
0,0,"[1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, ...","(0, 85)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
1,1,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, ...","(85, 174)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
2,2,"[1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, ...","(174, 224)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
3,3,"[1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, ...","(224, 296)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
4,4,"[1, 1, 0, 0, 1, 0, 0, 1, 1, 1]","(296, 306)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
5,5,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, ...","(306, 359)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
6,6,"[0, 0, 1]","(359, 362)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
7,7,"[0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, ...","(362, 436)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
8,8,"[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, ...","(436, 533)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
9,9,"[0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ...","(533, 599)","[[0.08642844, 0.09870384, -0.013818234, 0.0782..."
