# Notebook for using Sentence embeddings

### sources
- https://www.kaggle.com/code/christofhenkel/how-to-preprocessing-when-using-embeddings
- https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT

In [4]:
import warnings
warnings.filterwarnings('ignore')

import json
import operator
import os
import re
import string
import time
from itertools import chain
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel

In [2]:
# Input / output locations used throughout the notebook.
json_dir = Path(r"../data/dvlog_text")  # directory holding one transcript JSON file per video
annotations_file = Path(r"../DVlog/dataset/dvlog_labels_v2.csv")  # CSV with the labels (has a "video_id" column)
embeddings_save_folder = Path(r"E:/master/data/embeddings-dataset")  # NOTE: machine-specific absolute path

# load in the annotation labels
df_annotations = pd.read_csv(annotations_file)

In [5]:
# Loop over each transcript file and extract the text.
# text_ref_dict maps video_id -> {"text", "text_segments", "words"} for that transcript.
text_ref_dict = {}
for json_file in os.listdir(json_dir):

    # get the video_id (filename prefix before the first "_") and setup the path to the file
    video_id = int(json_file.split("_")[0])
    json_path = os.path.join(json_dir, json_file)

    # Decode explicitly as UTF-8: relying on the platform default encoding
    # (e.g. cp1252 on Windows) can fail on or garble non-ASCII transcript text.
    with open(json_path, encoding="utf-8") as current_file:
        json_dict = json.load(current_file)

    text_ref_dict[video_id] = {
        "text": json_dict["text"],
        "text_segments": [segment.get("text") for segment in json_dict["segments"]],
        # flatten the per-segment word lists into (text, start, end) tuples
        "words": [
            (word.get("text"), word.get("start"), word.get("end"))
            for word in chain.from_iterable(
                segment.get("words") for segment in json_dict["segments"]
            )
        ],
    }

# put the transcript text into the annotations dataframe
# (direct indexing: a video without a transcript fails with a KeyError naming its video_id)
df_annotations["text"] = df_annotations["video_id"].apply(lambda x: text_ref_dict[x]["text"])

# Load the Bio_ClinicalBERT tokenizer and model with the transformers library

In [None]:
# Same import as at the top of the notebook; repeated here so the names are available in this cell.
from transformers import AutoTokenizer, AutoModel
# Load the pretrained tokenizer and model identified by the Hugging Face id "emilyalsentzer/Bio_ClinicalBERT".
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


## Clinical-BERT sentence embeddings

In [7]:
# fix misspelled words
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Lookup table: token as it appears in the cleaned text -> replacement text.
mispell_dict = {
    # spelling variant
    'colour': 'color',
    # contractions written without their apostrophe
    'didnt': 'did not',
    'doesnt': 'does not',
    'isnt': 'is not',
    'hasnt': 'has not',
    'shouldnt': 'should not',
    'wasnt': 'was not',
    # platform names mapped to one generic term
    'instagram': 'social medium',
    'whatsapp': 'social medium',
    'snapchat': 'social medium',
}

# Module-level lookup table and compiled pattern used by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)


def replace_typical_misspell(text):
    """Return ``text`` with every match of ``mispellings_re`` replaced by its entry in ``mispellings``."""
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)


def clean_text(text: str, mispelled_func: Callable) -> str:
    """Normalise a piece of transcript text.

    Steps, in order: remove ASCII punctuation, lowercase, collapse every run
    of whitespace (including newlines) into a single space and trim the ends,
    mask digit runs of length >= 2 with '#' placeholders, then apply
    ``mispelled_func``.

    Args:
        text: raw text to clean.
        mispelled_func: callable that takes the cleaned string and returns the
            final string (e.g. ``replace_typical_misspell``).

    Returns:
        The cleaned text.
    """
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation with lookup table
    text = text.lower()  # Lowercase

    # split() treats newlines as separators, so words on adjacent lines stay
    # separate words (deleting "\n" outright would glue them together).
    text = " ".join(text.split())

    # clean the numbers: longest runs first, so the shorter patterns only see
    # the digits that are left; single digits are kept as they are
    text = re.sub('[0-9]{5,}', '#####', text)
    text = re.sub('[0-9]{4}', '####', text)
    text = re.sub('[0-9]{3}', '###', text)
    text = re.sub('[0-9]{2}', '##', text)

    # fix misspellings
    return mispelled_func(text)

In [11]:
# extract the embeddings without extra keywords
# keyword list handed to extract_text_segments (presumably the words it filters out — helper not visible here, TODO confirm)
keyw_to_remove = ['a', 'to', 'of', 'and']

# NOTE: run_word2vec, extract_text_segments and w2v_embeddings_index are not defined in the visible cells.
if run_word2vec:
    # retrieve the embedding features with averaged unknown vectors
    extract_text_segments(df_annotations, text_ref_dict, w2v_embeddings_index,
                          keyw_to_remove, embeddings_save_folder, "w2v_seconds_normal_avg")

    # retrieve the embedding features with unknown zero-vectors
    extract_text_segments(df_annotations, text_ref_dict, w2v_embeddings_index,
                          keyw_to_remove, embeddings_save_folder, "w2v_seconds_normal_zero", use_avg=False)

In [12]:
# extract the embedding with extra keywords:
# the depression-related terms followed by every entry of keyw_to_remove
depression_keywords = [
    "depression",
    "depressive",
    "antidepressant",
    "depressed",
    "anxiety",
    "psychiatrist",
    "ptsd",
    *keyw_to_remove,
]

if run_word2vec:
    extract_text_segments(
        df_annotations, text_ref_dict, w2v_embeddings_index,
        depression_keywords, embeddings_save_folder, "w2v_seconds_keyw",
    )

In [13]:
# get the averaged text embeddings: one mean word vector per transcript segment,
# saved per video as <embeddings_save_folder>/<video_id>/<output_feature_name>.npy
output_feature_name = "w2v_seq_avg"

if run_word2vec:
    # words dropped before averaging; built once instead of once per segment
    to_remove = ['a', 'to', 'of', 'and']

    # Iterate the id column directly: only video_id is needed, and iterrows()
    # builds a Series per row without preserving dtypes across the row.
    for video_id in df_annotations["video_id"]:
        final_embeddings = []

        # direct indexing: a video without a transcript fails with a KeyError naming its video_id
        texts = text_ref_dict[video_id]["text_segments"]
        for text in texts:
            # clean up the words
            cleaned_text = clean_text(text, replace_typical_misspell).split()

            # remove some of the keywords
            cleaned_text = [word for word in cleaned_text if word not in to_remove]

            # disregard sentences with single words
            if len(cleaned_text) <= 1:
                continue

            # get the embedding
            embedding = w2v_embeddings_index.get_mean_vector(cleaned_text)
            final_embeddings.append(embedding)

        # save the embedding
        final_embeddings = np.array(final_embeddings)

        subject_output_path = os.path.join(embeddings_save_folder, str(video_id))
        os.makedirs(subject_output_path, exist_ok=True)

        np.save(os.path.join(subject_output_path, f"{output_feature_name}.npy"), final_embeddings)

## BioWordVec

In [14]:
# Switch for the BioWordVec cells: the model is only loaded when this is True.
run_biowordvec = True

if run_biowordvec:
    biowordvec_path = Path(r"E:/master/embedding_models/bio_embedding_extrinsic")

    # stop early with a readable message when the model file is missing
    assert biowordvec_path.exists(), "embedding model not found"

    # the model file is read as binary word2vec format
    bio_embeddings_index = KeyedVectors.load_word2vec_format(biowordvec_path, binary=True)

In [17]:
# Report how much of the vocabulary / text has a BioWordVec embedding (see the cell output below).
# NOTE: check_coverage and vocab are not defined in the visible cells; oov presumably holds the
# out-of-vocabulary words — TODO confirm against the helper.
oov = check_coverage(vocab, bio_embeddings_index)

100%|████████████████████████████████████████████████████████████████████████| 20988/20988 [00:00<00:00, 277681.56it/s]

Found embeddings for 88.10% of vocab
Found embeddings for  98.57% of all text





In [15]:
# retrieve and save the embeddings
embeddings_save_folder = Path(r"E:/master/data/embeddings-dataset")  # same folder as configured at the top of the notebook

# extract the embeddings without extra keywords
# keyword list handed to extract_text_segments (presumably the words it filters out — helper not visible here, TODO confirm)
keyw_to_remove = ['a', 'to', 'of', 'and']

if run_biowordvec:
    # retrieve the embedding features with averaged unknown vectors
    extract_text_segments(df_annotations, text_ref_dict, bio_embeddings_index,
                          keyw_to_remove, embeddings_save_folder, "biow_seconds_normal_avg")

In [16]:
# get the averaged text embeddings: one mean word vector per transcript segment,
# saved per video as <embeddings_save_folder>/<video_id>/<output_feature_name>.npy
output_feature_name = "biow_seq_avg"

if run_biowordvec:
    # words dropped before averaging; built once instead of once per segment
    to_remove = ['a', 'to', 'of', 'and']

    # Iterate the id column directly: only video_id is needed, and iterrows()
    # builds a Series per row without preserving dtypes across the row.
    for video_id in df_annotations["video_id"]:
        final_embeddings = []

        # direct indexing: a video without a transcript fails with a KeyError naming its video_id
        texts = text_ref_dict[video_id]["text_segments"]
        for text in texts:
            # clean up the words
            cleaned_text = clean_text(text, replace_typical_misspell).split()

            # remove some of the keywords
            cleaned_text = [word for word in cleaned_text if word not in to_remove]

            # disregard sentences with single words
            if len(cleaned_text) <= 1:
                continue

            # get the embedding
            embedding = bio_embeddings_index.get_mean_vector(cleaned_text)
            final_embeddings.append(embedding)

        # save the embedding
        final_embeddings = np.array(final_embeddings)

        subject_output_path = os.path.join(embeddings_save_folder, str(video_id))
        os.makedirs(subject_output_path, exist_ok=True)

        np.save(os.path.join(subject_output_path, f"{output_feature_name}.npy"), final_embeddings)