In [1]:
!pip install sentence-transformers langchain tqdm pandas



In [17]:
# import required libraries
import numpy as np
from numpy.linalg import norm

import csv, json
import pandas as pd
from tqdm import tqdm

In [11]:

 
# define two lists or array
A = np.array([2,1,2,3,2,9])
B = np.array([3,4,2,4,5,5])
 
print("A:", A)
print("B:", B)
 
# compute cosine similarity
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)


A: [2 1 2 3 2 9]
B: [3 4 2 4 5 5]
Cosine Similarity: 0.8188504723485274


### Data

In [2]:
hamlet_path = '/home/elte-dh-coba/dev/info-drama/hamlet_sent_split.tsv'
hamlet_data = []
with open(hamlet_path) as fh:
    for l in fh.readlines():
        hamlet_data.append(tuple(l.strip().split('\t')))

In [3]:
sentences = [l[0] for l in hamlet_data]
creation_times = [l[1] for l in hamlet_data]
characters = [l[2] for l in hamlet_data]

### Zsombor method

In [1]:
from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [5]:
# Model choice
all_mini_lm_embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") # or all-mpnet-base-v2
sentence_embeddings = all_mini_lm_embedder.embed_documents(sentences)

In [34]:
with open('hamlet_embeddings_all_macBERTh_from_hamelt_sent_split.tsv', 'w', encoding='utf-8') as fh:
    writer = csv.writer(fh, delimiter='\t')
    writer.writerow(['created_at','character','speech_sent','embeddings'])
    
    for (sentence, created_at, character), sent_embedding in zip(hamlet_data, sentence_embeddings):
        writer.writerow([created_at, character, sentence, list(sent_embedding)])

### MacBERTh tests

In [7]:
macberth_embedder = HuggingFaceEmbeddings(model_name="/media/elte-dh-coba/TOSHIBA EXT/languate_models/MacBERTh")

No sentence-transformers model found with name /media/elte-dh-coba/TOSHIBA EXT/languate_models/MacBERTh. Creating a new one with MEAN pooling.


In [2]:
macberth_embedder = HuggingFaceEmbeddings(model_name="emanjavacas/MacBERTh")

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/770 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/227k [00:00<?, ?B/s]

No sentence-transformers model found with name /home/elte-dh-coba/.cache/torch/sentence_transformers/emanjavacas_MacBERTh. Creating a new one with MEAN pooling.


In [14]:
mac_embs = macberth_embedder.embed_documents(first_4)

In [24]:
a_sent = "Rest, rest, perturbed spirit!"
b_sent = 'Could beauty, my lord, have better commerce than with honesty?'

In [18]:
A_B = macberth_embedder.embed_documents([a_sent, b_sent])

NameError: name 'a_sent' is not defined

In [26]:
A, B = A_B

In [21]:
a, b = mac_embs[0], mac_embs[2]

In [22]:
cosine = np.dot(a, b)/(norm(a)*norm(b))

In [23]:
cosine

0.842642856686389

### All_min_lm_l6_l2 test

In [29]:
all_min_lm_l6_v2_embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [31]:
a_sent = "Rest, rest, perturbed spirit!"
b_sent = 'Could beauty, my lord, have better commerce than with honesty?'
A_B = all_min_lm_l6_v2_embedder.embed_documents([a_sent, b_sent])
A, B = A_B
cosine = np.dot(A, B)/(norm(A)*norm(B))
print(cosine)

0.1447974997661779


### Sentences to compare

In [5]:
sentences_to_compare = []
with open('/home/elte-dh-coba/dev/info-drama/sentences-to-compare.txt') as fh:
    for line in fh.readlines():
        line = line.strip()
        if line != '':
            sentences_to_compare.append(line)
    

In [8]:
first_4 = sentences_to_compare[:4]

In [6]:
# all-MiniLM-L6-v2
all_min_lm_l6_v2_embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") # or all-mpnet-base-v2
sentences_to_compare_minilm_embeddings = all_min_lm_l6_v2_embedder.embed_documents(sentences_to_compare)

In [8]:
# sentence-transformers/all-roberta-large-v1
all_roberta_large_v1_embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-roberta-large-v1")
sentences_to_compare_all_roberta_large_embeddings = all_roberta_large_v1_embedder.embed_documents(sentences_to_compare)

In [6]:
# Pclanglais/MonadGPT
monadgpt_embedder = HuggingFaceEmbeddings(cache_folder="/media/elte-dh-coba/TOSHIBA EXT/languate_models/monadGPT")
# sentences_to_compare_monadGPT_embeddings = monadgpt_embedder.embed_documents(sentences_to_compare)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# MacBERTh
macberth_embedder = HuggingFaceEmbeddings(cache_folder="/media/elte-dh-coba/TOSHIBA EXT/languate_models/MacBERTh")
# sentences_to_compare_macberth_embeddings = macberth_embedder.embed_documents(sentences_to_compare)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [7]:
# sentence-transformers/all-mpnet-base-v2
all_mpnet_base_v2_embedder = HuggingFaceEmbeddings(cache_folder="/media/elte-dh-coba/TOSHIBA EXT/languate_models/all-mpnet-base-v2") # or all-mpnet-base-v2
sentences_to_compare_mpnet_embeddings = all_mpnet_base_v2_embedder.embed_documents(sentences_to_compare)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [14]:
# create dataframe
sent_comp_df_data = {'sentence': sentences_to_compare, 'all-roberta-large-v1': sentences_to_compare_all_roberta_large_embeddings,
 'all-MiniLM-L6-v2': sentences_to_compare_minilm_embeddings, 'MonadGPT': sentences_to_compare_monadGPT_embeddings,
 'MacBERTh': sentences_to_compare_macberth_embeddings, 'all-mpnet-base-v2': sentences_to_compare_mpnet_embeddings}

In [15]:
compare_df = pd.DataFrame(sent_comp_df_data, index=list(range(0, len(sentences_to_compare))))

In [16]:
compare_df.to_csv('sentences_to_compare_model_results_v2.tsv', sep='\t')

In [18]:
b = pd.read_csv('sentences_to_compare_model_results_v2.tsv', sep='\t')
a = pd.read_csv('sentences_to_compare_model_results.tsv', sep='\t')

### Sentence-transformer method

In [10]:
# !pip install sentence-transformers
from sentence_transformers import SentenceTransformer

In [4]:
model = SentenceTransformer('SZTAKI-HLT/hubert-base-cc')
e1 = model.encode('A nagy fogú macska')
e2 = model.encode('A nagy fogú cica')

.gitattributes:   0%|          | 0.00/437 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/272k [00:00<?, ?B/s]

No sentence-transformers model found with name /home/elte-dh-coba/.cache/torch/sentence_transformers/SZTAKI-HLT_hubert-base-cc. Creating a new one with MEAN pooling.


In [6]:
e1 = model.encode('A nagy fogú macska')
e2 = model.encode('A nagy fogú cica')

In [46]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('llmrails/ember-v1')

Downloading (…)27441/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)6e2f327441/README.md:   0%|          | 0.00/6.19k [00:00<?, ?B/s]

Downloading (…)41/added_tokens.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)2f327441/config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [19]:
embedding = model.encode(hamlet_data[0][0])

In [45]:
with open('hamlet_embeddings_ember1.csv', 'w', encoding='utf-8') as fh:
    writer = csv.writer(fh)
    writer.writerow(['created_at','character','speech_sent','embeddings'])
    
    for sentence, created_at, character in tqdm(hamlet_data):
        embedding = model.encode(sentence)
        writer.writerow([created_at, character, sentence, list(embedding)])
        

100%|██████████████████████████████████████████████████████████████████| 2510/2510 [12:17<00:00,  3.40it/s]


In [None]:
model = SentenceTransformer('/media/elte-dh-coba/TOSHIBA EXT/languate_models/monadGPT/sentence-transformers')

No sentence-transformers model found with name /media/elte-dh-coba/TOSHIBA EXT/languate_models/monadGPT/. Creating a new one with MEAN pooling.
