In [1]:
from scipy.stats import pearsonr
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.metrics import jaccard_distance
import csv
from nltk import ne_chunk,pos_tag,word_tokenize
from nltk.chunk import tree2conllstr
from nltk.tree import Tree
from copy import deepcopy
from nltk.parse import CoreNLPParser

In [2]:
trial_path = 'data/trial/STS.input.txt'
trial_gs_path = 'data/trial/STS.gs.txt'
trial_df = pd.read_csv(trial_path, sep='\t', lineterminator='\n', names=['sentence0','sentence1'], header=None, quoting=csv.QUOTE_NONE)
trial_gs = pd.read_csv(trial_gs_path, sep='\t', lineterminator='\n', names=['labels'], header=None, quoting=csv.QUOTE_NONE)

In [3]:
def preprocessing(data, ne_parser_function):
    # To see the effect of the desambiguation we do a preprocess only with it. Generaly we will use a more complete 
    # preprocessing function. 
    data = data.fillna('')
    first = lambda x: [a[0] for a in x]
    for column in data.columns:
        data[column] = data[column].apply(ne_parser_function)
    return data

In [4]:
def lexical_simmilarity(df):
    guess = pd.DataFrame()
    for i in df.index:
        guess.loc[i,'labels'] = 1. - jaccard_distance(set(df.loc[i,'sentence0']), set(df.loc[i,'sentence1']))
    return guess

def analyzeResults(results):
    guess_lex = lexical_simmilarity(results)
    pearson    = pearsonr(trial_gs['labels'], guess_lex['labels'])[0]
    print(guess_lex)
    print(pearson)

In [9]:
def na_parser(tree):
    parsed_array = []
    for chunk in tree: 
        if type(chunk) == Tree:
            word = ' '.join(leaf[0] for leaf in chunk)
            parsed_array.append(word.lower())
        else: 
            parsed_array.append(chunk[0].lower())
    
    return [i for i in parsed_array if i.isalnum()]

In [6]:
parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
# java -mx4g -cp C:\stanford-corenlp-full-2018-10-05\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

def nlp_parser(tagged):
    parsed     = []
    last_tag   = None
    start_index = 0
    for index, node in enumerate(tagged):
        tag = node[1]
        if (tag == 'O' or tag != last_tag) and (start_index != index):
            token = ' '.join([pair[0].lower() for pair in tagged[start_index:index]])
            if token.isalnum():
                parsed.append(token)
            last_tag = tag
            start_index = index
    return parsed


In [10]:
results    = preprocessing(deepcopy(trial_df), lambda x : na_parser(ne_chunk(pos_tag(word_tokenize(x)), binary=True)))
analyzeResults(results)

       labels
id1  0.272727
id2  0.250000
id3  0.636364
id4  0.400000
id5  0.090909
id6  0.107143
-0.40498149435177305


In [8]:
results    = preprocessing(deepcopy(trial_df), lambda x : nlp_parser(parser.tag(word_tokenize(x))))
analyzeResults(results)

       labels
id1  0.272727
id2  0.181818
id3  0.636364
id4  0.400000
id5  0.090909
id6  0.107143
-0.34327956222578704
