# Notebook for using Word and Sentence embeddings

### sources
- https://radimrehurek.com/gensim/models/keyedvectors.html
- https://www.kaggle.com/code/christofhenkel/how-to-preprocessing-when-using-embeddings

In [1]:
import warnings
warnings.filterwarnings('ignore')

import gensim.downloader
import json
import os
import operator
import string
import time
import re
import math

import pandas as pd
import numpy as np

from collections import defaultdict
from gensim.models import KeyedVectors
from pathlib import Path
from tqdm import tqdm
from typing import Callable
from itertools import chain

In [2]:
# Input locations: the directory with the transcript JSON files and the label CSV.
annotations_file = Path(r"../DVlog/dataset/dvlog_labels_v2.csv")
json_dir = Path(r"../data/dvlog_text")

# Read the annotation labels into a dataframe.
df_annotations = pd.read_csv(filepath_or_buffer=annotations_file)

# setting up some functions to check the coverage of the word embeddings

In [3]:
def build_vocab(sentences, verbose = True):
    """
    Count how often every word occurs over all sentences.

    :param sentences: list of list of words
    :param verbose: show a tqdm progress bar over the sentences when True
    :return: dictionary of words and their count
    """
    counts = defaultdict(int)
    # the progress bar advances once per sentence; chain flattens the sentences into words
    progress = tqdm(sentences, disable=not verbose)
    for token in chain.from_iterable(progress):
        counts[token] += 1
    return counts

In [4]:
def check_coverage(vocab, embeddings_index):
    """
    Report how much of the vocabulary is covered by the embeddings.

    Prints the share of distinct words and the share of all word occurrences
    for which ``embeddings_index[word]`` succeeds.

    :param vocab: dictionary of words and their count
    :param embeddings_index: mapping-like object; a missing word must raise KeyError on lookup
    :return: list of (word, count) tuples for the out-of-vocabulary words, highest count first
    """
    oov = {}
    known_words = 0   # distinct words that have an embedding
    known_count = 0   # occurrences of words that have an embedding
    oov_count = 0     # occurrences of words without an embedding
    for word in tqdm(vocab):
        try:
            _ = embeddings_index[word]
        except KeyError:
            # only a missing key means "out of vocabulary"; any other error should surface
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            known_words += 1
            known_count += vocab[word]

    # guard against an empty vocabulary instead of dividing by zero
    total_count = known_count + oov_count
    vocab_share = known_words / len(vocab) if len(vocab) else 0.0
    text_share = known_count / total_count if total_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

## Word2vec (Google-news)

In [42]:
# Switch for everything that needs the Google-News word2vec model.
run_word2vec = True

if run_word2vec:
    google_news_path = Path(r"E:/master/embedding_models/GoogleNews-vectors-negative300.bin")
    # stop right away when the model file is not where we expect it
    assert google_news_path.exists(), "embedding model not found"

    w2v_embeddings_index = KeyedVectors.load_word2vec_format(
        google_news_path,
        binary=True,
    )

In [58]:
# loop over each text file and extract the full text, the segment texts and the timed words
text_ref_dict = {}
for json_file in os.listdir(json_dir):

    # the file name starts with the numeric video id, followed by an underscore
    video_id = int(json_file.split("_")[0])
    json_path = os.path.join(json_dir, json_file)

    # read as UTF-8 explicitly so the result does not depend on the platform default encoding
    with open(json_path, encoding="utf-8") as current_file:
        json_dict = json.load(current_file)

    segments = json_dict["segments"]
    all_words = chain.from_iterable(segment.get("words") for segment in segments)

    text_ref_dict[video_id] = {
        "text": json_dict["text"],
        "text_segments": [segment.get("text") for segment in segments],
        "words": [(word.get("text"), word.get("start"), word.get("end")) for word in all_words],
    }

# put the annotations back into the dataframe
# (direct indexing raises a KeyError naming the video id when a transcript is missing)
df_annotations["text"] = df_annotations["video_id"].apply(lambda x: text_ref_dict[x]["text"])

In [44]:
# fix misspelled words
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Replacement table applied to the cleaned (lowercased, punctuation-free) text.
mispell_dict = {
    # spelling variant
    'colour': 'color',
    # contractions that lost their apostrophe
    'didnt': 'did not',
    'doesnt': 'does not',
    'isnt': 'is not',
    'hasnt': 'has not',
    'shouldnt': 'should not',
    'wasnt': 'was not',
    # platform names are mapped to a generic phrase
    'instagram': 'social medium',
    'whatsapp': 'social medium',
    'snapchat': 'social medium',
}

# the lookup table and the compiled pattern used by replace_typical_misspell
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """
    Replace every occurrence of a key of the misspelling table in ``text``.

    :param text: string to correct
    :return: ``text`` with each match of ``mispellings_re`` swapped for its entry in ``mispellings``
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)


def clean_text(text: str, mispelled_func: Callable) -> str:
    """
    Normalise a piece of text before looking its words up in the embeddings.

    Steps, in order: remove punctuation, lowercase, collapse every run of
    whitespace (including newlines) into a single space, mask numbers of two or
    more digits with ``#`` characters and finally apply ``mispelled_func``.

    :param text: raw text
    :param mispelled_func: function that takes the cleaned text and returns it with misspellings fixed
    :return: the cleaned text
    """
    # remove punctuation with a lookup table
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.lower()

    # split() treats newlines like any other whitespace, so words on different lines
    # stay separate words; this also drops leading and trailing whitespace
    text = " ".join(text.split())

    # mask the numbers, longest runs first so each run is replaced only once
    text = re.sub(r'[0-9]{5,}', '#####', text)
    text = re.sub(r'[0-9]{4}', '####', text)
    text = re.sub(r'[0-9]{3}', '###', text)
    text = re.sub(r'[0-9]{2}', '##', text)

    # fix misspellings
    return mispelled_func(text)

In [45]:
# clean every transcript, split it into words, drop a few frequent words and count the rest
df_annotations["cleaned_text"] = df_annotations["text"].apply(
    clean_text, args=(replace_typical_misspell,)
)
sentences = df_annotations["cleaned_text"].apply(str.split)
to_remove = ['a', 'to', 'of', 'and']
sentences = [
    [word for word in sentence if word not in to_remove]
    for sentence in tqdm(sentences)
]
vocab = build_vocab(sentences)

100%|██████████████████████████████████████████████████████████████████████████████| 827/827 [00:00<00:00, 3954.26it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 827/827 [00:00<00:00, 3542.22it/s]


In [46]:
# check the coverage against the word2vec model loaded above
# (the name `embeddings_index` is never defined in this notebook)
oov = check_coverage(vocab, w2v_embeddings_index)

100%|█████████████████████████████████████████████████████████████████████████| 20988/20988 [00:01<00:00, 17564.19it/s]

Found embeddings for 87.96% of vocab
Found embeddings for  99.60% of all text





### Build the word2vec embeddings

In [51]:
def extract_text_segments(df, text_ref_dict, embeddings_index, keyword_removal, features_output_path: Path, output_feature_name: str):
    """
    Build one averaged word embedding per second for every video and save it as ``.npy``.

    Every word is cleaned with ``clean_text`` and assigned to the second in which
    it starts (floor of its start time). The embeddings of all words that start in
    the same second are averaged. Seconds without any embedded word are filled from
    the neighbouring seconds (see the inline comments). The resulting array is saved
    to ``<features_output_path>/<video_id>/<output_feature_name>.npy``.

    :param df: dataframe with a ``video_id`` column
    :param text_ref_dict: maps a video id to a dict whose ``"words"`` entry is a list of (text, start, end) tuples
    :param embeddings_index: object offering ``has_index_for(word)`` and ``embeddings_index[word]``
    :param keyword_removal: collection of cleaned words that are skipped
    :param features_output_path: existing directory below which the per-video folders are created
    :param output_feature_name: file name (without extension) of the saved array
    :raises ValueError: when a video has no words
    """
    assert os.path.exists(features_output_path), "output directory does not exist"

    for _, row in df.iterrows():

        video_id = row.video_id
        words_list = text_ref_dict.get(video_id).get("words")
        if not words_list:
            raise ValueError(f"no words found for video {video_id}")

        # One bucket per second between the earliest and the latest word start.
        # Using min/max (instead of the first/last word) keeps this working when
        # the words are not ordered by start time.
        word_seconds = [math.floor(word_start) for _, word_start, _ in words_list]
        embed_dict = {second: [] for second in range(min(word_seconds), max(word_seconds) + 1)}

        # go over all words, clean them and put their embeddings in the bucket of their second
        for (word, _, _), second in zip(words_list, word_seconds):

            cleaned_word = clean_text(word, replace_typical_misspell)

            # remove keywords
            if cleaned_word in keyword_removal:
                continue

            # cleaning can turn one word into several (e.g. through the misspelling
            # replacements), so look every resulting token up on its own;
            # tokens without an index are ignored
            for token in cleaned_word.split():
                if embeddings_index.has_index_for(token):
                    embed_dict[second].append(embeddings_index[token])

        # average all embeddings per second; an empty bucket yields NaN (mean of an empty array)
        embeddings = [np.mean(np.array(embed_dict[second]), axis=0) for second in sorted(embed_dict)]

        # handle the seconds for which no information is known by averaging the neighboring vectors
        final_embeddings = [np.nan for x in embeddings]
        last_true_embed = np.nan

        for i, embed in enumerate(embeddings):

            if not np.isnan(final_embeddings[i]).any():
                # already filled in while closing an earlier gap
                continue

            if not np.isnan(embed).any():
                # we have a mean embedding so just store it and continue
                final_embeddings[i] = embed
                last_true_embed = embed
                continue

            # we don't have an embedding: search forward for the next known one
            found_end_embed = False
            curr_index = i + 1

            while curr_index < len(embeddings):

                if not np.isnan(embeddings[curr_index]).any():
                    # we found the end of the gap
                    if np.isnan(last_true_embed).any():
                        # the gap is at the very start, so use the first found embedding on both sides
                        last_true_embed = embeddings[curr_index]

                    avg_embed = np.mean(np.array([last_true_embed, embeddings[curr_index]]), axis=0)
                    # every second inside the gap gets the same averaged vector
                    for x in range(i, curr_index):
                        final_embeddings[x] = avg_embed

                    found_end_embed = True
                    break

                curr_index += 1

            if not found_end_embed:
                # the gap runs until the end, so repeat the last known embedding
                # (this stays NaN when no word of the video has an embedding at all)
                for x in range(i, len(embeddings)):
                    final_embeddings[x] = last_true_embed

        # set the vectors to a numpy array and save it
        final_embedding = np.array(final_embeddings)

        subject_output_path = os.path.join(features_output_path, str(video_id))
        os.makedirs(subject_output_path, exist_ok=True)

        np.save(os.path.join(subject_output_path, f"{output_feature_name}.npy"), final_embedding)

In [52]:
# retrieve and save the embeddings
embeddings_save_folder = Path(r"E:/master/data/embeddings-dataset")

# first run: only these words are dropped, no extra keywords
keyw_to_remove = ['a', 'to', 'of', 'and']

if run_word2vec:
    extract_text_segments(
        df=df_annotations,
        text_ref_dict=text_ref_dict,
        embeddings_index=w2v_embeddings_index,
        keyword_removal=keyw_to_remove,
        features_output_path=embeddings_save_folder,
        output_feature_name="w2v_seconds_normal",
    )

In [54]:
# second run: additionally drop the depression-related keywords
depression_keywords = [
    "depression",
    "depressive",
    "antidepressant",
    "depressed",
    "anxiety",
    "psychiatrist",
    "ptsd",
    *keyw_to_remove,
]

if run_word2vec:
    extract_text_segments(
        df=df_annotations,
        text_ref_dict=text_ref_dict,
        embeddings_index=w2v_embeddings_index,
        keyword_removal=depression_keywords,
        features_output_path=embeddings_save_folder,
        output_feature_name="w2v_seconds_keyw",
    )

In [71]:
# one mean word2vec vector per text segment, stored per video
output_feature_name = "w2v_seq_avg"

if run_word2vec:
    for index, row in df_annotations.iterrows():

        video_id = row.video_id
        texts = text_ref_dict.get(video_id).get("text_segments")

        final_embeddings = []
        for text in texts:
            # clean the segment and drop some of the keywords
            to_remove = ['a', 'to', 'of', 'and']
            cleaned_text = [
                word
                for word in clean_text(text, replace_typical_misspell).split()
                if word not in to_remove
            ]

            # only segments with more than one remaining word get an embedding
            if len(cleaned_text) > 1:
                final_embeddings.append(w2v_embeddings_index.get_mean_vector(cleaned_text))

        # save the embeddings of this video
        final_embeddings = np.array(final_embeddings)

        subject_output_path = os.path.join(embeddings_save_folder, str(video_id))
        os.makedirs(subject_output_path, exist_ok=True)

        output_file = os.path.join(subject_output_path, f"{output_feature_name}.npy")
        np.save(output_file, final_embeddings)

## BioWordVec

In [17]:
# Switch for everything that needs the BioWordVec model.
run_biowordvec = True

if run_biowordvec:
    biowordvec_path = Path(r"E:/master/embedding_models/bio_embedding_extrinsic")
    # stop right away when the model file is not where we expect it
    assert biowordvec_path.exists(), "embedding model not found"

    bio_embeddings_index = KeyedVectors.load_word2vec_format(
        biowordvec_path,
        binary=True,
    )

## BioSentVec