In [1]:
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertModel, DistilBertTokenizer
import Levenshtein as lev
import numpy as np
import os
import pandas as pd
import re
import torch
import yaml

  from .autonotebook import tqdm as notebook_tqdm
Downloading model.safetensors: 100%|██████████| 268M/268M [00:02<00:00, 104MB/s]  


In [None]:
# Tokenizer and encoder are both loaded from the same pretrained checkpoint name
pretrained_checkpoint = 'distilbert-base-uncased'

# Tokenizer (vocabulary) for that checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_checkpoint)

# Encoder weights for that checkpoint
model = DistilBertModel.from_pretrained(pretrained_checkpoint)

In [None]:
# Read the previously saved embeddings array back from disk
embeddings_path = '../resources/sentence_embeddings.npy'
embeddings = np.load(embeddings_path)

In [None]:
def load_yaml_data(folder_path):
    """Collect construct texts and cause/effect hypotheses from YAML files.

    Every ``.yaml`` / ``.yml`` file directly inside ``folder_path`` is parsed
    with ``yaml.safe_load``. Each file may contain a ``constructs`` mapping
    (construct id -> construct text) and a ``hypotheses`` mapping whose values
    carry ``cause`` and ``effect`` construct ids. Ids are resolved against the
    constructs of the *same* file; an id that is not found resolves to ``''``.

    Files are visited in sorted name order and constructs are de-duplicated in
    first-seen order, so the returned list is identical on every run
    (a plain ``set`` would reorder it between interpreter sessions).

    Args:
        folder_path: Directory containing the YAML files.

    Returns:
        tuple: ``(constructs_list, hypotheses_df)`` where ``constructs_list``
        is the list of unique construct values and ``hypotheses_df`` is a
        DataFrame with columns ``cause`` and ``effect`` (one row per
        hypothesis; the columns are present even when there are no rows).

    Raises:
        KeyError: If a hypothesis entry has no ``cause`` or ``effect`` key.
    """
    # A dict is used as an insertion-ordered set of construct values.
    unique_constructs = {}
    hypotheses_data = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(('.yaml', '.yml')):
            continue

        file_path = os.path.join(folder_path, file_name)

        # Explicit encoding so parsing does not depend on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            # safe_load yields None for an empty document; treat it as "no data".
            yaml_content = yaml.safe_load(file) or {}

        # Extract constructs (a missing or null section counts as empty)
        constructs = yaml_content.get('constructs') or {}
        unique_constructs.update(dict.fromkeys(constructs.values()))

        # Extract hypotheses, translating construct ids into construct texts
        for hypothesis in (yaml_content.get('hypotheses') or {}).values():
            cause = constructs.get(hypothesis['cause'], '')
            effect = constructs.get(hypothesis['effect'], '')
            hypotheses_data.append({'cause': cause, 'effect': effect})

    constructs_list = list(unique_constructs)

    # Fix the column set so an empty result still exposes 'cause' / 'effect'.
    hypotheses_df = pd.DataFrame(hypotheses_data, columns=['cause', 'effect'])

    return constructs_list, hypotheses_df

In [None]:
# Folder that holds the YAML files to read
folder_path = '../true_results'

# Parse the folder into the construct list and the hypotheses table
constructs, hypotheses_df = load_yaml_data(folder_path=folder_path)

In [None]:
# Keep only truthy entries, which drops empty construct values
constructs = list(filter(None, constructs))
print(constructs)
print(f"Unique constructs: {len(constructs)}")

In [None]:
# Show the hypotheses table returned by load_yaml_data (columns: cause, effect)
hypotheses_df

In [None]:
# Cleaning function for construct terms
def clean_terms(terms):
    """Return lower-cased copies of ``terms`` with punctuation removed.

    Each term is lower-cased, then every character that is neither a word
    character (``\\w``) nor whitespace (``\\s``) is deleted. Whitespace is left
    exactly as it was, so words are not merged across spaces.

    Args:
        terms: Iterable of strings to normalise.

    Returns:
        list: The cleaned strings, in the same order as the input.
    """
    # Matches anything that is not a word character or whitespace
    non_word_or_space = re.compile(r'[^\w\s]')
    return [non_word_or_space.sub('', raw_term.lower()) for raw_term in terms]

# Normalise every construct string with clean_terms
cleaned_constructs = clean_terms(terms=constructs)

# Show the normalised constructs
print("Cleaned Constructs:", cleaned_constructs)

In [None]:
# Tokenize new terms (special tokens are added around each term)
tokenized_constructs = [tokenizer.encode(term, add_special_tokens=True) for term in cleaned_constructs]

# Pad every sequence to the longest one, using the tokenizer's own padding id
# instead of a hard-coded 0 so the cell stays correct if the tokenizer changes.
# NOTE(review): assumes the tokenizer defines a pad token (pad_token_id is not None).
pad_token_id = tokenizer.pad_token_id
max_len_new_terms = max(len(term) for term in tokenized_constructs)
padded_new_terms = np.array(
    [term + [pad_token_id] * (max_len_new_terms - len(term)) for term in tokenized_constructs]
)

# Build the attention mask from the true sequence lengths rather than from the
# token values: 1 for real tokens, 0 for padding positions. Deriving it from
# `!= 0` is only right when id 0 is the padding token and nothing else.
token_counts = np.array([len(term) for term in tokenized_constructs])
attention_mask_array = (np.arange(max_len_new_terms)[None, :] < token_counts[:, None]).astype(int)

# Convert to tensors
input_ids_new_terms = torch.tensor(padded_new_terms)
attention_mask_new_terms = torch.tensor(attention_mask_array)


In [None]:
# Run the encoder over the padded constructs; no gradients are needed
with torch.no_grad():
    last_hidden_states_new_terms = model(
        input_ids=input_ids_new_terms,
        attention_mask=attention_mask_new_terms,
    )

# Element 0 of the model output is indexed as (sequence, position, feature);
# keep position 0 of every sequence as that construct's embedding vector.
token_level_states = last_hidden_states_new_terms[0]
embeddings_constructs = token_level_states[:, 0, :].numpy()


In [None]:
# Pairwise cosine similarity of every construct embedding against every other;
# with a single argument the matrix is computed between the rows of that array.
similarities = cosine_similarity(embeddings_constructs)

In [None]:
# Optional cap on how many neighbours are printed per term; None prints all of them.
top_k = None

for i, term in enumerate(cleaned_constructs):
    # Rank every term by descending similarity to `term`, then remove `term`
    # itself by index. Dropping "the first ranked entry" instead is unsafe:
    # when another term ties with it (e.g. two constructs that are identical
    # after cleaning) the term itself need not rank first, and a genuine
    # neighbour would be discarded.
    ranked_indices = np.argsort(similarities[i])[::-1]
    sorted_indices = [idx for idx in ranked_indices if idx != i][:top_k]

    print(f"Term: {term}")
    print("Most similar terms within new_terms:")
    for idx in sorted_indices:
        similar_term = cleaned_constructs[idx]
        score = similarities[i][idx]
        print(f" - {similar_term} (Score: {score:.4f})")
    print("\n")
