In [1]:
!pip install flair tqdm nltk >> /dev/null


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import torch
import torch.nn.functional as F
import nltk
from flair.embeddings import TransformerWordEmbeddings
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import json
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/tommy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
def load_dictionary(path:str = './gold_dictionary.jsonl'):
    import json
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        return [json.loads(js) for js in lines]

dictionary = load_dictionary()
print(dictionary)

## Sentence embedding

In [55]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Sentences we want sentence embeddings for
# sentences = ['Mi sono pisciato addosso.', 'Mi sono cagato nei pantaloni.']

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('nickprock/sentence-bert-base-italian-xxl-uncased')
model = AutoModel.from_pretrained('nickprock/sentence-bert-base-italian-xxl-uncased').to('cpu')
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2').to('cpu')
def embed(sentences: list[str]):
  # Tokenize sentences
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to('cpu')

  # Compute token embeddings
  with torch.no_grad():
      model_output = model(**encoded_input)

  # Perform pooling. In this case, mean pooling.
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
  return sentence_embeddings.to('cpu')

In [34]:
### test
embeddings = embed(['che cosa mi sta succedendo?', 'che è?'])
F.cosine_similarity(embeddings[0], embeddings[1], dim=0)

tensor(0.4948)

In [56]:
embedding_sentencies_sorianesi = []

for i, d in enumerate(dictionary):
  sent = ''
  key = d['it']
  value = d['notes']
  if key != '':
    sent = key.strip()
  elif value != '':
    sent = value.strip()
  # if key == '':
  #   sent = value
  # elif value != '':
  #   sent = key + ': ' + value
  # else:
  #   sent = key
  embedding_sentencies_sorianesi.append((sent, i))

embs = []
for x in tqdm([e[0] for e in embedding_sentencies_sorianesi]):
  embs.append(embed(x).squeeze(0))

embedding_sentencies_sorianesi = torch.stack(embs)
embedding_sentencies_sorianesi.shape
# embedding_sentencies_sorianesi = torch.tensor(embs)

100%|██████████| 1820/1820 [01:58<00:00, 15.39it/s]


torch.Size([1820, 768])

In [57]:
sentences= [
      # "Ho dato da mangiare ai maiali versando il mangime nel recipiente",
      # "Ti tiro un pugno fortissimo",
      "Ho visto Anna andare a funghi con lo zio peppe"
      ]

Try to get dictionary entities to use in the dataset sentence

In [8]:
list(nltk.trigrams(nltk.word_tokenize('ciao come stai?')))

[('ciao', 'come', 'stai'), ('come', 'stai', '?')]

In [59]:
def sentence_similarities(sentence: str, return_scores=False):

  sentence_trigram = list(nltk.trigrams(nltk.word_tokenize(sentence)))
  sentence_trigram = [' '.join(s) for s in sentence_trigram]

  sims = set()
  word_sim = {}
  for i, sent_emb in enumerate(sentence_trigram):
    query_sentence_embeddings = embed(sent_emb)
    similarities = F.cosine_similarity(query_sentence_embeddings, embedding_sentencies_sorianesi, dim=1)
    
    is_similar = (similarities > 0.4)
    for i, _is_similar in enumerate(is_similar):
        if not _is_similar: continue
        
        # print(query_sentence_embeddings.shape, sent_emb, similarities[i], dictionary[index])
        word = dictionary[i]
        word = json.dumps(word)
        word_sim[word] = similarities[i]
        sims.add(word)
  for k,v in word_sim.items():
     print(v, k)
  return sims

return_scores = True
for sentence in sentences:
  print(sentence)
  out = sentence_similarities(sentence, return_scores)

  print('-'*200)

sentencess = ['Ho visto Anna andare a funghi con lo zio peppe', "andare"]
sentence_trigram = list(nltk.trigrams(nltk.word_tokenize(sentencess[0])))

sentence_trigram = [' '.join(s) for s in sentence_trigram]

embeddings = embed(sentence_trigram)
embeddings1 = embed("andare")
similarities = F.cosine_similarity(embeddings, embeddings1, dim=1)
#print(sentences)
idx = (similarities > 0.4 ).nonzero()
#print(idx.flatten().tolist())
similarities

Ho visto Anna andare a funghi con lo zio peppe
tensor(0.4118) {"it": "eccola", "dial": "estela", "notes": ""}
tensor(0.4118) {"it": "eccola", "dial": "ejela", "notes": ""}
tensor(0.4023) {"it": "bella donna", "dial": "patacca", "notes": ""}
tensor(0.4292) {"it": "andana", "dial": "letta", "notes": "striscia di terreno coltivabile limitato da due filari di alberi o di piante"}
tensor(0.4552) {"it": "andare", "dial": "ann\u00e0", "notes": ""}
tensor(0.4552) {"it": "andare", "dial": "gn\u00e0", "notes": ""}
tensor(0.4552) {"it": "andare", "dial": "nn\u00e0", "notes": ""}
tensor(0.4552) {"it": "andare", "dial": "ji", "notes": "(Tocca ji fo' = Bisogna andare in campagna; pres. io vajo; part. pass. esso gnava; pass. rem. nui gnemma)"}
tensor(0.4552) {"it": "andare", "dial": "v\u00e8ne", "notes": ""}
tensor(0.5325) {"it": "castagne secche", "dial": "mosciarelle", "notes": ""}
tensor(0.4305) {"it": "", "dial": "frangetole", "notes": "erba selvatica a ciuffi (commestibile)"}
tensor(0.5406) {"it

tensor([0.2140, 0.3851, 0.4552, 0.3706, 0.2656, 0.2411, 0.1060, 0.1218])

# Ignore the follwing

## Ignore, just test

```
# This is formatted as code
```



In [None]:
from sentence_transformers import SentenceTransformer, util
import re

# 1. Carica un modello pre-addestrato adatto all'italiano
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # Supporta l'italiano

# 2. Funzione per preparare le voci del dizionario
def prepare_dictionary_entry(entry):
    # Combina i campi per creare un contesto ricco
    context = ""
    if 'it' in entry and entry['it']:
        context += entry['it'] + " "
    if 'dial' in entry and entry['dial']:
        context += entry['dial'] + " "
    if 'notes' in entry and entry['notes']:
        context += entry['notes']
    return context.strip()

# 3. Funzione per generare finestre di contesto dalla frase
def generate_context_windows(sentence, window_size=3):
    words = re.findall(r'\w+', sentence.lower())
    windows = []

    # Genera finestre di parole per preservare il contesto locale
    for i in range(len(words)):
        start = max(0, i - window_size // 2)
        end = min(len(words), i + window_size // 2 + 1)
        window = ' '.join(words[start:end])
        windows.append(window)

    # Aggiungi anche l'intera frase per catturare il contesto globale
    windows.append(sentence.lower())

    return windows

# 4. Funzione principale per trovare voci pertinenti
def find_relevant_dictionary_entries(sentence, dictionary, similarity_threshold=0.5):
    # Prepara le finestre di contesto dalla frase
    context_windows = generate_context_windows(sentence)

    # Codifica le finestre di contesto
    context_embeddings = model.encode(context_windows, convert_to_tensor=True)

    # Prepara le voci del dizionario
    dictionary_texts = [prepare_dictionary_entry(entry) for entry in dictionary]

    # Codifica le voci del dizionario
    dictionary_embeddings = model.encode(dictionary_texts, convert_to_tensor=True)

    # Calcola la similarità tra ogni finestra di contesto e ogni voce del dizionario
    matches = []
    for i, entry in enumerate(dictionary):
        # Calcola la massima similarità tra qualsiasi finestra di contesto e questa voce
        similarities = util.pytorch_cos_sim(context_embeddings, dictionary_embeddings[i])
        max_similarity = float(similarities.max())

        if max_similarity >= similarity_threshold:
            matches.append({
                'entry': entry,
                'similarity': max_similarity
            })

    # Ordina i risultati per similarità decrescente
    matches.sort(key=lambda x: x['similarity'], reverse=True)

    return matches
# La tua frase di esempio
sentence = "Ho visto anna andare a funghi con lo zio peppe"

# Le tue voci di dizionario di esempio
dictionary_entries = [
    {'it': 'campicello', 'dial': "fo'", 'notes': "(E' ito fò (o de fò) = E' andato in campagna)"},
    {'it': 'ghirlanda', 'dial': 'serta', 'notes': '(detto di agli e cipolle)'},
    {'it': 'pendere', 'dial': 'pènna', 'notes': "(a settembre i'll'uva è fatta e la fico penne) - A PENNA = Penzoloni"},
    {'it': 'qua', 'dial': "cca'", 'notes': '(veni a ccà = vieni qua)'},
    {'it': 'andare', 'dial': "anna'", 'notes': "(v. anche 'NA - GNA' - JI')"}
]

# Trova le voci pertinenti
relevant_entries = find_relevant_dictionary_entries(sentence, dictionary_entries, similarity_threshold=0.55)

# Mostra i risultati
for match in relevant_entries:
    print(f"Similarità: {match['similarity']:.4f} - Voce: {match['entry']}")


In [None]:
!pip install -U "huggingface_hub[cli]" > /dev/null
!pip install sentence_transformers~=2.2.2 > /dev/null
!huggingface-cli login --token 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

## A try with E5 sentence embedding

In [None]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


# Each input text should start with "query: " or "passage: ".
# For tasks other than retrieval, you can simply use the "query: " prefix.
input_texts = [ "query: how much protein should a female eat",
               "passage: I shitted my pants.",
               ]

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2', )
model = AutoModel.from_pretrained('intfloat/e5-large-v2')

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
print(embeddings.shape)
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())
F.cosine_similarity(embeddings[0], embeddings[1], dim=0)




## Try to use minerva as sentence embedder

In [None]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # Get token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('sapienzanlp/Minerva-3B-base-v1.0').to('cuda')
model = AutoModel.from_pretrained('sapienzanlp/Minerva-3B-base-v1.0').to('cuda')


# tokenizer.pad_token = tokenizer.eos_token
# Sentences for embedding
sentences = ['Sei andato a correre al parco', 'Mi sono cagato addosso']

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=False, truncation=True, return_tensors='pt')

# Get token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Apply mean pooling to get sentence embeddings
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize the embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print(sentence_embeddings.shape)
cos = F.cosine_similarity(sentence_embeddings[0], sentence_embeddings[1], dim=0)

# Output the embeddings
print(cos)


In [None]:
from sentence_transformers import SentenceTransformer

# Load https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode([
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
])
similarities = model.similarity(embeddings, embeddings)

## DIO CANE

In [None]:
# init embedding
embedding = TransformerDocumentEmbeddings('sapienzanlp/Minerva-350M-base-v1.0')

# create a sentence
sentence1 = Sentence('Sto mangiando una mela'.lower())
sentence2 = Sentence('Mi sono cagato addosso'.lower())

# embed words in sentence
embedding.embed(sentence1)
embedding.embed(sentence2)
# Due tensori di esempio
a = sentence1.embedding
b = sentence2.embedding

print(a.shape, b.shape)

# Calcolo della cosine similarity
cos_sim = F.cosine_similarity(a, b, dim=0)

print(cos_sim)  # → 1.0

In [None]:
sentence = Sentence('Ho dato da mangiare ai maiali versando il mangime nel recipiente')

embedding.embed(sentence)

threshold = 0.85

similarities = 0
for entry in dictionary:
  s = Sentence((entry['it'] + ' ' + entry['notes']).lower())
  embedding.embed(s)
  cosine = F.cosine_similarity(sentence.embedding, s.embedding, dim=0)
  if cosine >= threshold:
    similarities += 1
    print(cosine, entry)
similarities

## test a caso

In [None]:
from sentence_transformers import SentenceTransformer

# Load a sentence-transformer model
model = SentenceTransformer('microsoft/Multilingual-MiniLM-L12-H384')  # This supports Italian

# Create embeddings
embedding1 = model.encode('Io e la mia famiglia abbiamo mangiato la frutta dal recipiente')
embedding2 = model.encode('MI sono cagato addosso')

# Calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity([embedding1], [embedding2])[0][0]
print(similarity)  # This should give a much lower value
del model