# Notebook for extracting sentence embeddings

### Sources
- https://www.kaggle.com/code/christofhenkel/how-to-preprocessing-when-using-embeddings
- https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT

In [None]:
import warnings
warnings.filterwarnings('ignore')

import json
import os
import operator
import string
import time
import re
import unicodedata
import spacy

import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModel, pipeline
from sentence_transformers import SentenceTransformer
from pathlib import Path
from itertools import chain

In [None]:
# Input locations: the folder with one transcript JSON file per video and the
# CSV file with the annotation labels.
json_dir = Path(r"../data/dvlog_text")
annotations_file = Path(r"../DVlog/dataset/dvlog_labels_v2.csv")
# Output locations: the root folder that receives one sub-folder of .npy
# embeddings per video, and the folder that receives the timestamp index files.
# NOTE(review): the embeddings folder is a machine-specific absolute path —
# adjust it before running this notebook on another machine.
embeddings_save_folder = Path(r"D:/master/data/sent-embeddings-dataset")
index_save_path = Path(r"../DVlog/dataset/")

# load in the annotation labels
df_annotations = pd.read_csv(annotations_file)

In [None]:
# load in the synonym keywords
depr_synonyms_file1 = Path(r"../data/depression_synonyms_gizem.json")
depr_synonyms_file2 = Path(r"../data/depression_synonyms_paper.json")

# load in the files and combine them into a single list with keywords
with open(depr_synonyms_file1) as current_file:
    depri_synonyms = list(json.loads(current_file.read()).get("depression"))

with open(depr_synonyms_file2) as current_file:
    depri2 = list(json.loads(current_file.read()).get("depression"))

# combine them and remove duplicates
depri_synonyms.extend(depri2)
depri_synonyms = list(set(depri_synonyms))
len(depri_synonyms)

In [None]:
# loop over each text file and extract the text
text_ref_dict = {}
for json_file in os.listdir(json_dir):
    
    # get the video_id and setup the path to the file
    video_id = int(json_file.split("_")[0])
    json_path = os.path.join(json_dir, json_file)
    
    with open(json_path) as current_file:
        json_dict = json.loads(current_file.read())

    text_ref_dict[video_id] = {
        "text": json_dict["text"],
        "text_segments": [x.get("text") for x in json_dict["segments"]],
        "timestamped_text_segments": [(x.get("text"), x.get("start"), x.get("end")) for x in json_dict["segments"]]
    }

# put the annotations back into the dataframe
# df_annotations["text"] = df_annotations["video_id"].apply(lambda x: text_ref_dict.get(x).get("text"))

In [None]:
# Lookup table that deletes ASCII digits and ASCII punctuation in a single pass.
_DIGITS_AND_PUNCTUATION = str.maketrans("", "", string.digits + string.punctuation)


def clean_text(text: str, unicode_pattern: str = "NFKD") -> str:
    """Return *text* without accents, digits, punctuation and excess whitespace.

    Steps, in order:
      1. Unicode-normalise with ``unicode_pattern``.
      2. Drop combining marks. With a decomposing form (the default "NFKD", or
         "NFD") this strips accents; with a composing form ("NFC"/"NFKC")
         precomposed accented letters are left untouched.
      3. Delete ASCII digits and ASCII punctuation.
      4. Collapse every run of whitespace, newlines included, into one space
         and trim both ends.

    Args:
        text: the raw text to clean.
        unicode_pattern: normalisation form passed to ``unicodedata.normalize``.

    Returns:
        The cleaned text; an empty string when nothing is left.
    """
    # Normalise first so compatibility characters are folded to their ASCII
    # equivalents before the digit/punctuation table is applied.
    text = unicodedata.normalize(unicode_pattern, text)
    # Normalisation only splits an accented letter into base letter + mark;
    # removing the marks is what actually strips the accent.
    text = "".join(char for char in text if not unicodedata.combining(char))
    text = text.translate(_DIGITS_AND_PUNCTUATION)

    # split() treats newlines as separators, so words on either side of a line
    # break stay separate words instead of being fused together.
    return " ".join(text.split())

In [None]:
def retrieve_sent_embeddings(model, text_dict: dict, df: pd.DataFrame, feature_name: str,
                             save_folder: Path, depri_keywords: list):
    """Encode the cleaned transcript segments of every video and save them as .npy files.

    For each value in ``df["video_id"]`` the segments stored under
    ``text_dict[video_id]["text_segments"]`` are cleaned with ``clean_text``.
    Segments with fewer than two words after cleaning are dropped. When
    ``depri_keywords`` is non-empty, every segment that contains one of the
    keywords as a substring is dropped as well. The match is case-sensitive
    because ``clean_text`` does not change the case of the text. The remaining
    sentences are passed to ``model.encode`` and the result is written to
    ``<save_folder>/<video_id>/<feature_name>.npy``.

    Args:
        model: object with an ``encode(sentences)`` method; the notebook passes
            a ``SentenceTransformer``.
        text_dict: mapping of video_id to a dict with a "text_segments" list.
        df: dataframe with a ``video_id`` column listing the videos to process.
        feature_name: file name (without extension) of the saved array.
        save_folder: root folder; one sub-folder per video is created in it.
        depri_keywords: keywords whose sentences are removed; an empty list
            disables the filtering.

    Raises:
        KeyError: if a video_id of ``df`` has no transcript in ``text_dict``.
    """
    # Iterate the id column directly: iterrows() builds one Series per row and
    # may upcast the id (e.g. int -> float), which would leak into folder names.
    for video_id in df["video_id"]:
        # clean every segment once and keep only those with more than one word
        cleaned = (clean_text(text) for text in text_dict[video_id]["text_segments"])
        texts = [text for text in cleaned if len(text.split()) > 1]

        if depri_keywords:
            # drop each sentence that contains at least one keyword
            sentences = [
                sentence for sentence in texts
                if not any(keyw in sentence for keyw in depri_keywords)
            ]
        else:
            sentences = texts

        # put it through the model
        # NOTE(review): when no sentence is left, the result of encoding an
        # empty list is saved as-is, whereas the spaCy keyword cell stores one
        # zero row instead — confirm which the downstream loader expects.
        embeddings = model.encode(sentences)

        # save the embedding
        subject_output_path = os.path.join(save_folder, str(video_id))
        os.makedirs(subject_output_path, exist_ok=True)

        np.save(os.path.join(subject_output_path, f"{feature_name}.npy"), embeddings)

## Building the index files


In [None]:
# video_id -> list of (start, end) timestamps, once for every kept sentence and
# once for the sentences that survive the keyword removal
sync_index_dict = {}
sync_keyw_index_dict = {}
save_index_files = True

# Iterate the id column directly: iterrows() builds one Series per row and may
# upcast the id (e.g. int -> float), which would change the keys of the JSON files.
for video_id in df_annotations["video_id"]:
    # get the timestamped segments (raises KeyError when the transcript is missing)
    timestamped_segments = text_ref_dict[video_id]["timestamped_text_segments"]

    # clean each segment once and keep only those with more than one word
    texts = []
    for text, start_t, end_t in timestamped_segments:
        cleaned = clean_text(text)
        if len(cleaned.split()) > 1:
            texts.append((cleaned, start_t, end_t))

    # save the normal cleaned text timestamps
    sync_index_dict[video_id] = [(start_t, end_t) for _, start_t, end_t in texts]

    # do the keyword removal process and save the timestamps of the remaining
    # sentences; this is the same case-sensitive substring test that
    # retrieve_sent_embeddings applies to the sentences it encodes
    sync_keyw_index_dict[video_id] = [
        (start_t, end_t)
        for sentence, start_t, end_t in texts
        if not any(keyw in sentence for keyw in depri_synonyms)
    ]

if save_index_files:
    # save both files
    with open(os.path.join(index_save_path, "sync_index_normal.json"), 'w', encoding="utf-8") as f:
        json.dump(sync_index_dict, f)

    with open(os.path.join(index_save_path, "sync_index_keyw.json"), 'w', encoding="utf-8") as f:
        json.dump(sync_keyw_index_dict, f)

## SBERT (all-mpnet-base-v2)
- https://sbert.net/

In [None]:
# Checkpoint identifier handed to SentenceTransformer below.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Switch for the mpnet section: the cells that compute the mpnet embeddings
# (including the spaCy experiment) only run when this flag is True.
run_mpnet_sbert = True

if run_mpnet_sbert:
    # `model` is the global encoder used by the embedding cells that follow
    model = SentenceTransformer(model_name)

In [None]:
# file names (without extension) under which the two mpnet variants are stored
output_feature_normal_name = "sent_mpnet_normal"
output_feature_keyw_name = "sent_mpnet_keyw"

if run_mpnet_sbert:
    # unfiltered variant: the empty keyword list switches the filtering off
    retrieve_sent_embeddings(
        model=model,
        text_dict=text_ref_dict,
        df=df_annotations,
        feature_name=output_feature_normal_name,
        save_folder=embeddings_save_folder,
        depri_keywords=[],
    )

    # filtered variant: sentences containing a depression keyword are left out
    retrieve_sent_embeddings(
        model=model,
        text_dict=text_ref_dict,
        df=df_annotations,
        feature_name=output_feature_keyw_name,
        save_folder=embeddings_save_folder,
        depri_keywords=depri_synonyms,
    )

### spaCy experiment
- https://stackoverflow.com/questions/46290313/how-to-break-up-document-by-sentences-with-spacy

In [None]:
output_feature_name = "sent_mpnet_spacy_normal"

if run_mpnet_sbert:
    nlp = spacy.load("en_core_web_sm")

    # Iterate the id column directly: iterrows() builds one Series per row and
    # may upcast the id (e.g. int -> float), which would leak into folder names.
    for video_id in df_annotations["video_id"]:
        # get the full transcript (raises KeyError when the transcript is missing)
        text = text_ref_dict[video_id]["text"]

        # let spaCy split the transcript into sentences
        with nlp.select_pipes(enable=['tok2vec', "parser", "senter"]):
            doc = nlp(text)

        # clean every sentence once and keep only those with more than one word
        cleaned = (clean_text(sent.text) for sent in doc.sents)
        sentences = [sentence for sentence in cleaned if len(sentence.split()) > 1]

        # put it through the model; when no sentence is left, store a single
        # zero row instead (the same convention as the spaCy keyword cell) so
        # the saved array is always two-dimensional. float32 matches the
        # default dtype of the arrays returned by model.encode.
        if sentences:
            embeddings = model.encode(sentences)
        else:
            embeddings = np.zeros((1, model.get_sentence_embedding_dimension()), dtype=np.float32)

        # save the embedding
        subject_output_path = os.path.join(embeddings_save_folder, str(video_id))
        os.makedirs(subject_output_path, exist_ok=True)

        np.save(os.path.join(subject_output_path, f"{output_feature_name}.npy"), embeddings)

In [None]:
output_feature_name = "sent_mpnet_spacy_keyw"

if run_mpnet_sbert:
    nlp = spacy.load("en_core_web_sm")

    # Iterate the id column directly: iterrows() builds one Series per row and
    # may upcast the id (e.g. int -> float), which would leak into folder names.
    for video_id in df_annotations["video_id"]:
        # get the full transcript (raises KeyError when the transcript is missing)
        text = text_ref_dict[video_id]["text"]

        # let spaCy split the transcript into sentences
        with nlp.select_pipes(enable=['tok2vec', "parser", "senter"]):
            doc = nlp(text)

        # clean every sentence once and keep only those with more than one word
        cleaned = (clean_text(sent.text) for sent in doc.sents)
        texts = [sentence for sentence in cleaned if len(sentence.split()) > 1]

        # drop each sentence that contains at least one keyword
        # (case-sensitive substring test, as in retrieve_sent_embeddings)
        sentences = [
            sentence for sentence in texts
            if not any(keyw in sentence for keyw in depri_synonyms)
        ]

        # put it through the model; when every sentence was filtered out, store
        # a single zero row. float32 matches the default dtype of the arrays
        # returned by model.encode, so all saved files share one dtype.
        if sentences:
            embeddings = model.encode(sentences)
        else:
            embeddings = np.zeros((1, model.get_sentence_embedding_dimension()), dtype=np.float32)
        print(f"{video_id} - {len(sentences)}; {embeddings.shape}")

        # save the embedding
        subject_output_path = os.path.join(embeddings_save_folder, str(video_id))
        os.makedirs(subject_output_path, exist_ok=True)

        np.save(os.path.join(subject_output_path, f"{output_feature_name}.npy"), embeddings)

## SBERT (all-MiniLM-L12-v2)
- https://sbert.net/

In [None]:
# Checkpoint identifier handed to SentenceTransformer below.
model_name = "all-MiniLM-L12-v2"
# Switch for the MiniLM section: the cell that computes the MiniLM embeddings
# only runs when this flag is True.
run_minilm_sbert = True

if run_minilm_sbert:
    # Rebinds the global `model`: any embedding cell executed after this point
    # encodes with the MiniLM model instead of the previously loaded one.
    model = SentenceTransformer(model_name)

In [None]:
# file names (without extension) under which the two MiniLM variants are stored
output_feature_normal_name = "sent_minilm_normal"
output_feature_keyw_name = "sent_minilm_keyw"

if run_minilm_sbert:
    # unfiltered variant: the empty keyword list switches the filtering off
    retrieve_sent_embeddings(
        model=model,
        text_dict=text_ref_dict,
        df=df_annotations,
        feature_name=output_feature_normal_name,
        save_folder=embeddings_save_folder,
        depri_keywords=[],
    )

    # filtered variant: sentences containing a depression keyword are left out
    retrieve_sent_embeddings(
        model=model,
        text_dict=text_ref_dict,
        df=df_annotations,
        feature_name=output_feature_keyw_name,
        save_folder=embeddings_save_folder,
        depri_keywords=depri_synonyms,
    )