## Bibliotecas para ambos algoritmos

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords as sw

import string

# Fetch every NLTK resource this notebook relies on, not only the stop-word
# list: word_tokenize needs the Punkt tokenizer data ('punkt' on older NLTK
# releases, 'punkt_tab' on newer ones) and the WordNet reader needs 'wordnet'.
# Packages that are already installed are reported as up-to-date and skipped.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

sw_engilsh = sw.words('english')
print(sw_engilsh)

# Example sentence used to try out the disambiguation functions below.
test = "Yesterday I went to the bank to withdraw the money and the credit card not work"



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FX607\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Hash-based collection of the English stop words, so that the membership
# test in remove_stopwords does not have to scan the original list.
stop_words = {stopword for stopword in sw_engilsh}


def remove_stopwords(words: list[str], stopwords: "set[str] | None" = None) -> list[str]:
    """Drop stop words and punctuation-only tokens from a token list.

    Args:
        words: Tokens to filter, e.g. the output of ``word_tokenize``.
        stopwords: Optional collection of lower-case words to discard. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        A new list with the surviving tokens, in their original order and
        with their original casing.
    """
    if stopwords is None:
        stopwords = stop_words

    def is_punctuation(token: str) -> bool:
        # Check character by character. ``token in string.punctuation`` is a
        # substring test: it lets multi-character tokens such as "..." or
        # "--" through and only matches runs that happen to be contiguous
        # inside string.punctuation.
        return all(char in string.punctuation for char in token)

    return [
        word
        for word in words
        if word.lower() not in stopwords and not is_punctuation(word)
    ]

def process_sentence(sentence: str) -> list[str]:
    """Tokenize ``sentence`` and return the tokens that survive filtering.

    The text is split with NLTK's ``word_tokenize`` and the resulting tokens
    are passed through ``remove_stopwords``.
    """
    tokens = word_tokenize(sentence)
    return remove_stopwords(tokens)

# Quick check: show which tokens of the sample sentence are kept after
# tokenization and filtering.
print(process_sentence(test))

['Yesterday', 'went', 'bank', 'withdraw', 'money', 'credit', 'card', 'work']


## Ejercicio 1

In [8]:


def simplified_lesk(word: str, sentence: str) -> "nltk.corpus.reader.wordnet.Synset | None":
    """Choose a WordNet sense for ``word`` with the Simplified Lesk algorithm.

    Every sense is scored by the number of distinct context words (the
    filtered tokens of ``sentence``) that also appear in its signature (the
    filtered tokens of the sense's definition and usage examples).

    Args:
        word: The ambiguous word to look up in WordNet.
        sentence: The sentence in which ``word`` occurs.

    Returns:
        The synset with the largest overlap; ties go to the synset that comes
        first in ``wn.synsets(word)``. When no sense shares any word with the
        context, the first synset is returned as the default sense. ``None``
        is returned only when WordNet has no synset for ``word``.
    """
    senses = wn.synsets(word)
    if not senses:
        return None

    # Compare case-insensitively so that, for instance, a capitalised
    # sentence-initial token still matches its lower-case form in a gloss.
    context = {token.lower() for token in process_sentence(sentence)}

    # Simplified Lesk starts from a default sense instead of "no answer";
    # the first synset WordNet lists for the word plays that role here.
    best_sense = senses[0]
    max_overlap = 0

    for sense in senses:
        signature = {token.lower() for token in process_sentence(sense.definition())}
        for example in sense.examples():
            signature.update(token.lower() for token in process_sentence(example))

        overlap = len(context & signature)

        # Strict comparison keeps the earlier sense on ties.
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense


# Disambiguate "bank" in the shared sample sentence; `test` holds exactly the
# sentence that used to be repeated here as a literal.
simplified_lesk('bank', test)


Synset('depository_financial_institution.n.01')

## Ejercicio 2
Calcula la media de las similitudes coseno entre todas las palabras del contexto y de la firma. No cuentan para la media las apariciones de la propia palabra que se está buscando, porque no aportan información y suman mucha similitud. Tampoco cuentan los pares en los que alguna de las palabras no está en el vocabulario del modelo.

In [7]:
import gensim
from nltk.data import find

# Load the pre-trained word-embedding sample distributed with NLTK.
# Download the package first so the lookup below also works on a machine
# where it has not been installed yet.
nltk.download('word2vec_sample')
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [25]:
def cos_semantic(word: str, sentence: str) -> "nltk.corpus.reader.wordnet.Synset | None":
    """Choose a WordNet sense for ``word`` by mean embedding similarity.

    For every sense, the signature (filtered tokens of its definition and
    usage examples) is compared with the context (filtered tokens of
    ``sentence``): the score is the mean of ``model.similarity`` over all
    signature/context word pairs. Occurrences of ``word`` itself and words
    missing from the embedding vocabulary are left out of the mean.

    Args:
        word: The ambiguous word to look up in WordNet.
        sentence: The sentence in which ``word`` occurs.

    Returns:
        The synset with the highest mean similarity; ties go to the synset
        that comes first in ``wn.synsets(word)``. ``None`` when WordNet has
        no synset for ``word`` or when no sense has a single scorable pair.
    """

    def usable(tokens: list[str]) -> list[str]:
        # Keep only tokens that can take part in a pair: not the target word
        # itself, and present in the embedding vocabulary.
        return [token for token in tokens if token != word and token in model]

    # The context does not depend on the sense, so filter it once.
    context = usable(process_sentence(sentence))

    best_sense = None
    # Start below every possible score so that a sense whose mean similarity
    # is zero or negative can still be selected.
    max_similarity = float("-inf")

    for sense in wn.synsets(word):
        signature = process_sentence(sense.definition())
        for example in sense.examples():
            signature.extend(process_sentence(example))
        signature = usable(signature)

        pair_count = len(signature) * len(context)
        if pair_count == 0:
            # Nothing to average for this sense; skip it instead of dividing
            # by zero.
            continue

        total = 0
        for s_word in signature:
            for c_word in context:
                total += model.similarity(s_word, c_word)  # cosine similarity
        similarity = total / pair_count

        # Strict comparison keeps the earlier sense on ties.
        if similarity > max_similarity:
            max_similarity = similarity
            best_sense = sense

    return best_sense


# Disambiguate "bank" in the shared sample sentence using the embedding-based
# scorer.
cos_semantic("bank", test)

Synset('deposit.v.02')

In [23]:
# Show the WordNet definition of the synset 'deposit.v.02'.
wn.synset('deposit.v.02').definition()

'put into a bank account'