In [1]:
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk import pos_tag, word_tokenize

In [72]:
# Load the information-content data from the 'ic-brown.dat' file through
# the wordnet_ic corpus reader. The resulting object is passed to
# Synset.jcn_similarity in calculate_jcn_similarity below.
brown_ic = wordnet_ic.ic('ic-brown.dat')

def calculate_jcn_similarity(word1, word2):
    """Find the pair of noun senses of two words with the highest JCN similarity.

    Every noun synset of ``word1`` is compared against every noun synset of
    ``word2`` using ``Synset.jcn_similarity`` with the module-level
    ``brown_ic`` information content.

    Args:
        word1: First word to look up in WordNet.
        word2: Second word to look up in WordNet.

    Returns:
        A tuple ``(best1, best2, similarity)``. ``best1`` and ``best2`` are
        the synsets of the best-scoring pair (the first such pair on ties).
        When no pair scores above -1 -- in particular when either word has
        no noun synsets -- the original words are returned as plain strings
        and ``similarity`` is -1.
    """
    candidates_first = wn.synsets(word1, pos=wn.NOUN)
    candidates_second = wn.synsets(word2, pos=wn.NOUN)

    # Lazily score every sense pair; senses of word1 drive the outer loop.
    scored_pairs = (
        (sense_a.jcn_similarity(sense_b, brown_ic), sense_a, sense_b)
        for sense_a in candidates_first
        for sense_b in candidates_second
    )

    top_score = -1
    top_pair = None
    for score, sense_a, sense_b in scored_pairs:
        # Strict comparison keeps the earliest pair when scores tie.
        if score > top_score:
            top_score = score
            top_pair = (sense_a, sense_b)

    # No pair selected: hand the raw words back to the caller.
    if top_pair is None:
        return word1, word2, top_score

    return top_pair[0], top_pair[1], top_score

In [97]:
def disambiguateText(text, verbose_ = False):
    """Replace the nouns of ``text`` with the names of their chosen WordNet synsets.

    The text is tokenized and POS-tagged. Every token tagged ``NN`` or
    ``NNS`` that has at least one noun synset in WordNet is disambiguated
    against its neighbouring nouns with ``calculate_jcn_similarity``:
    each noun is paired with the next noun, and the sense coming from the
    pair with the higher similarity (previous pair vs. current pair) is
    kept. The last noun is paired with the one before it (or with itself
    when it is the only noun).

    Args:
        text: Input text to disambiguate.
        verbose_: When true, print the intermediate tagging, pairings and
            chosen senses.

    Returns:
        The tokens joined by single spaces, with every disambiguated noun
        replaced by its synset name (e.g. ``page.n.01``). Tokens that are
        not selected as nouns are left untouched.
    """
    # Tokenize and tag; pos_tags is aligned one-to-one with tokens.
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    if verbose_: print(pos_tags)

    # Remember the token position of every noun so that the substitution
    # below hits exactly the tagged occurrence, even when the same word
    # appears elsewhere in the text with another tag or more than once.
    noun_positions = [
        position
        for position, (word, tag) in enumerate(pos_tags)
        if tag in ('NN', 'NNS') and wn.synsets(word, pos=wn.NOUN)
    ]
    nouns = [tokens[position] for position in noun_positions]
    if verbose_: print('Nouns to Disambiguate: ', nouns)

    # Sense and score carried over from the previous (noun, next noun) pair.
    last_noun = None
    last_sim = -1
    toret_nouns = []

    for i in range(len(nouns)):
        if i < len(nouns)-1:
            if verbose_: print('Disambiguate: ',nouns[i],' vs ',nouns[i+1])
            first_noun, second_noun, similarity = calculate_jcn_similarity(nouns[i], nouns[i+1])
            if verbose_: print(first_noun, ' ', second_noun, ' ', similarity, ' ', last_sim)
            # Keep the sense from whichever neighbouring pair scored higher.
            if similarity >= last_sim:
                toret_nouns.append(first_noun)
            else:
                toret_nouns.append(last_noun)
            last_sim = similarity
            last_noun = second_noun
        else:
            # Last noun: there is no following noun, so pair it with the
            # preceding one (index -1 wraps to itself for a single noun).
            current_sense, previous_sense, similarity = calculate_jcn_similarity(nouns[i], nouns[i-1])
            toret_nouns.append(current_sense)
            if verbose_: print(current_sense, ' ', previous_sense)

    if verbose_:
        print(toret_nouns)
        for sense in toret_nouns:
            if isinstance(sense, str):
                print(sense)
            else:
                print(sense.name(),'\n',sense.definition())

    # Substitute each noun, by position, with its disambiguated form.
    # calculate_jcn_similarity returns plain strings when it found no sense.
    for position, sense in zip(noun_positions, toret_nouns):
        tokens[position] = sense if isinstance(sense, str) else sense.name()
    if verbose_: print(tokens)
    return ' '.join(tokens)

In [115]:
import pandas as pd
import re
from nltk.corpus import stopwords

# Read the input frame; the raw text is taken from its 'expandedText' column.
df = pd.read_json('expanded_data.json')
stop_words = set(stopwords.words('english'))

# Preprocessing pipeline, applied row by row:
#   1. lowercase each whitespace-separated word and drop English stop words,
#   2. replace every character that is neither a word character nor
#      whitespace with a space,
#   3. disambiguate the nouns of the cleaned text,
#   4. rewrite the '.n.' inside synset names as '_n_'.
disambiguatedText = (
    df['expandedText']
    .apply(lambda raw: ' '.join(token.lower() for token in raw.split() if token.lower() not in stop_words))
    .apply(lambda lowered: re.sub(r'[^\w\s]', ' ', lowered))
    .apply(disambiguateText)
    .apply(lambda tagged: re.sub(r'\.n\.', '_n_', tagged))
)
disambiguatedText

0       571 main page_n_03 computer_n_01 science_n_01 ...
1       object_n_01 oriented programming alumnus_n_01 ...
2       ece carbon_n_01 752 spring_n_04 1996 ece vitam...
3       last modified thursday 01 february_n_01 96 22 ...
4       eecs401 web_n_01 page_n_01 fall_n_05 96 welcom...
                              ...                        
6618    home_page_n_01 daqing lithium_n_01 welcome wor...
6619    last modified sunday 25 aug 96 22 35 51 gmt pe...
6620    home marla baker_n_01 marla carbon_n_01 capita...
6621    patrice caire patrice caire virtual environmen...
6622    eric sulfur_n_01 dwelling_n_01 page_n_01 eric ...
Name: expandedText, Length: 6623, dtype: object

In [121]:
# Write the frame to 'disambiguated_dataTrain.json'.
# NOTE(review): no visible cell stores `disambiguatedText` back into `df`, yet
# the displayed frame has a `text` column holding the disambiguated text --
# presumably a cell not shown here did the assignment. Confirm the notebook
# reproduces this file under Restart Kernel -> Run All.
df.to_json('disambiguated_dataTrain.json')

In [120]:
# Display the frame for visual inspection.
df

Unnamed: 0,id,label,text
0,aaexyuw,course,571 main page_n_03 computer_n_01 science_n_01 ...
1,abbdqt,course,object_n_01 oriented programming alumnus_n_01 ...
2,achmly,course,ece carbon_n_01 752 spring_n_04 1996 ece vitam...
3,aciio,course,last modified thursday 01 february_n_01 96 22 ...
4,ackfxrep,course,eecs401 web_n_01 page_n_01 fall_n_05 96 welcom...
...,...,...,...
6618,zxgxje,student,home_page_n_01 daqing lithium_n_01 welcome wor...
6619,zyidaxg,student,last modified sunday 25 aug 96 22 35 51 gmt pe...
6620,zyrphu,student,home marla baker_n_01 marla carbon_n_01 capita...
6621,zyvupbc,student,patrice caire patrice caire virtual environmen...
