In [155]:
import pandas as pd
import csv
from nltk.wsd import lesk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.metrics import jaccard_distance
from scipy.stats import pearsonr
from nltk.corpus import wordnet as wn
from nltk import pos_tag

In [156]:
trial_path = 'trial/STS.input.txt'
trial_gs_path = 'trial/STS.gs.txt'
trial_df = pd.read_csv(trial_path, sep='\t', lineterminator='\n', names=['sentence0','sentence1'], header=None, quoting=csv.QUOTE_NONE)
trial_gs = pd.read_csv(trial_gs_path, sep='\t', lineterminator='\n', names=['labels'], header=None, quoting=csv.QUOTE_NONE)
print(trial_df.shape, trial_gs.shape)

(6, 2) (6, 1)


In [157]:
def morphy_tag(nltk_tag):

    if nltk_tag.startswith('J'):
        return wn.ADJ
    elif nltk_tag.startswith('V'):
        return wn.VERB
    elif nltk_tag.startswith('N'):
        return wn.NOUN
    elif nltk_tag.startswith('R'):
        return wn.ADV
    else:
        return wn.NOUN

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = WhitespaceTokenizer()
    tokenized_text = w_tokenizer.tokenize(text)
    list_tags = pos_tag(tokenized_text)
    tags = {w:morphy_tag(tag) for w,tag in list_tags}
    return [lemmatizer.lemmatize(w, tags[w]) for w in tokenized_text]

def apply_lesk_to_text(text):    
    return [lesk(text, word) for word in text]

def preprocessing(data):
    # todo: better handling of na
    data = data.fillna('')
    for column in data.columns:
        print(column)
        # remove the digits and puntuation
        data[column] = data[column].str.replace('\d+', '')
        # convert to lowercase
        data[column] = data[column].str.replace('\W+', ' ')
        # replace continuous white spaces by a single one
        data[column] = data[column].str.replace('\s+', ' ')
        # words to lower
        data[column] = data[column].str.lower()
        # lematize
        data[column] = data[column].apply(lemmatize_text)
        # data[column] = data[column].str.split()
        # desambiguate 
        data[column] = data[column].apply(apply_lesk_to_text)

    return data

In [158]:
a = 'birdie is washing itself on the sink basine'
lemmatize_text(a)

['birdie', 'be', 'wash', 'itself', 'on', 'the', 'sink', 'basine']

In [159]:
trial_df = preprocessing(trial_df)
trial_df

sentence0
sentence1


Unnamed: 0,sentence0,sentence1
id1,"[None, Synset('bird.n.02'), Synset('exist.v.01...","[Synset('birdie.v.01'), Synset('exist.v.01'), ..."
id2,"[Synset('in.r.01'), Synset('whitethorn.n.01'),...","[None, Synset('u.s.01'), Synset('united_states..."
id3,"[Synset('whoremaster.n.01'), Synset('suppose.v...","[Synset('helium.n.01'), Synset('be.v.03'), Syn..."
id4,"[None, Synset('fly.v.12'), Synset('out.s.04'),...","[None, Synset('fly.v.13'), None, None, Synset(..."
id5,"[None, Synset('woman.n.02'), Synset('exist.v.0...","[None, Synset('young.n.08'), Synset('lady.n.03..."
id6,"[Synset('toilet.n.01'), Synset('plump.v.04'), ...","[Synset('sunrise.s.01'), Synset('at.n.02'), Sy..."


In [160]:
trial_df.loc['id1','sentence0']

[None,
 Synset('bird.n.02'),
 Synset('exist.v.01'),
 Synset('bath.n.05'),
 Synset('indium.n.01'),
 None,
 Synset('sinkhole.n.01')]

In [161]:
wn.synsets('was')[0]

Synset('washington.n.02')

In [162]:
trial_df.loc['id1','sentence1']

[Synset('birdie.v.01'),
 Synset('exist.v.01'),
 Synset('washout.n.02'),
 None,
 Synset('indium.n.01'),
 None,
 Synset('water.n.03'),
 Synset('washbasin.n.01')]

In [163]:
lesk(trial_df.loc['id1','sentence0'], 'bird')

Synset('shuttlecock.n.01')

In [170]:
def lexical_simmilarity(df):
    guess = pd.DataFrame()
    for i in df.index:
        guess.loc[i,'labels'] = 1 - jaccard_distance(set(df.loc[i,'sentence0']), set(df.loc[i,'sentence1']))
    return guess


guess_lex = lexical_simmilarity(trial_df)
guess_lex.head()

Unnamed: 0,labels
id1,0.3
id2,0.133333
id3,0.636364
id4,0.111111
id5,0.1


In [171]:
print(pearsonr(trial_gs['labels'], guess_lex['labels'])[0])

-0.42733300435366206


In [166]:
trial_gs

Unnamed: 0,labels
id1,0
id2,1
id3,2
id4,3
id5,4
id6,5


In [167]:
trial_df.to_csv('potato.csv')