In [114]:
from scipy.stats import pearsonr
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.metrics import jaccard_distance
import csv

## Statement
- Use the data set and task description of the Semantic Textual Similarity (STS) task from SemEval 2012.
- Implement several approaches to detect paraphrases using sentence similarity metrics.
    + Explore some lexical dimensions.
    + Explore the syntactic dimension alone.
    + Explore the combination of the two previous dimensions.
- Add new components of your choice (optional).
- Compare and comment on the results achieved by these approaches, both against each other and against the official results.
- Send files to raco in IHLT STS Project before the oral presentation:
    + Jupyter notebook: sts-[Student1]-[Student2].ipynb
    + Slides: sts-[Student1]-[Student2].pdf


In [115]:
# Locations of the MSRpar sentence pairs and of their gold-standard ("gs") scores.
train_path = 'data/train/STS.input.MSRpar.txt'
train_gs_path = 'data/train/STS.gs.MSRpar.txt'
test_path = 'data/test-gold/STS.input.MSRpar.txt'
test_gs_path = 'data/test-gold/STS.gs.MSRpar.txt'

# Sentence pairs: two tab-separated columns per line. Quoting is disabled so
# that quote characters inside a sentence are read as ordinary text.
train_df = pd.read_csv(
    train_path,
    sep='\t',
    lineterminator='\n',
    names=['sentence0', 'sentence1'],
    header=None,
    quoting=csv.QUOTE_NONE,
)

# Scores: a single column, exposed as 'labels'.
train_gs = pd.read_csv(
    train_gs_path,
    sep='\t',
    lineterminator='\n',
    names=['labels'],
    header=None,
)

# Both frames are expected to have the same number of rows.
print(train_df.shape, train_gs.shape)

(750, 2) (750, 1)


In [116]:
# Preview only the first rows instead of handing the whole frame to the renderer.
train_df.head(10)

Unnamed: 0,sentence0,sentence1
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl..."
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent..."
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....
5,"Amgen shares gained 93 cents, or 1.45 percent,...",Shares of Allergan were up 14 cents at $78.40 ...
6,U.S. prosecutors have arrested more than 130 i...,More than 130 people have been arrested and $1...
7,Chavez said investigators feel confident they'...,Albuquerque Mayor Martin Chavez said investiga...
8,Authorities said the scientist properly quaran...,The scientist also quarantined himself at home...
9,The support will come as a free software upgra...,The upgrade will be available as a free downlo...


In [117]:
def lemmatize_text(text):
    """Split ``text`` on whitespace and lemmatize every resulting token.

    Parameters
    ----------
    text : str
        Sentence to process.

    Returns
    -------
    list
        One lemma per whitespace-separated token, in the original order.
    """
    tokens = WhitespaceTokenizer().tokenize(text)
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        # No part-of-speech tag is passed, so the lemmatizer's default is used.
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

In [118]:
def preprocessing(data):
    """Normalize and tokenize every text column of a DataFrame.

    Each column is cleaned (digits removed, non-word characters replaced by a
    space, lower-cased) and then turned into a list of lemmas with
    ``lemmatize_text``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns all hold sentences (strings or missing values).

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns in which every cell is a
        list of lemmatized tokens. Missing values become empty lists. The
        frame passed in is not modified.
    """
    # todo: better handling of na
    # fillna returns a new frame, so the caller's data is left untouched
    data = data.fillna('')
    for column in data.columns:
        print(column)
        text = data[column]
        # remove the digits
        # regex=True is mandatory: since pandas 2.0 str.replace treats the
        # pattern as a literal string unless told otherwise
        text = text.str.replace(r'\d+', '', regex=True)
        # replace punctuation (any run of non-word characters) by a space
        text = text.str.replace(r'\W+', ' ', regex=True)
        # replace continuous white spaces by a single one
        text = text.str.replace(r'\s+', ' ', regex=True)
        # words to lower
        text = text.str.lower()
        # lemmatize
        data[column] = text.apply(lemmatize_text)
    return data

In [119]:
# Replace the raw sentences with their lemma lists. Not idempotent: the result
# overwrites train_df, so reload the data before running this cell again.
train_df = train_df.pipe(preprocessing)
train_df.head(5)

sentence0
sentence1


Unnamed: 0,sentence0,sentence1
0,"[but, other, source, close, to, the, sale, sai...","[but, other, source, close, to, the, sale, sai..."
1,"[micron, ha, declared, it, first, quarterly, p...","[micron, s, number, also, marked, the, first, ..."
2,"[the, fine, are, part, of, failed, republican,...","[perry, said, he, back, the, senate, s, effort..."
3,"[the, american, anglican, council, which, repr...","[the, american, anglican, council, which, repr..."
4,"[the, tech, loaded, nasdaq, composite, rose, p...","[the, technology, laced, nasdaq, composite, in..."


In [120]:
def lexical_simmilarity(df):
    """Score every sentence pair by the Jaccard similarity of its token sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns ``sentence0`` and ``sentence1`` whose cells are
        iterables of tokens.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``df`` with a single float column ``labels``
        holding ``1 - jaccard_distance`` of the two token sets: 1.0 for
        identical sets, 0.0 for disjoint ones. An empty ``df`` yields an
        empty frame that still has the ``labels`` column.
    """
    def pair_similarity(tokens0, tokens1):
        set0, set1 = set(tokens0), set(tokens1)
        if not set0 and not set1:
            # jaccard_distance divides by the size of the union and would
            # raise ZeroDivisionError here; two empty sentences are treated
            # as identical, following the usual convention J(empty, empty) = 1.
            return 1.0
        return 1 - jaccard_distance(set0, set1)

    # Compute all scores first and build the frame once, instead of enlarging
    # it one row at a time through .loc.
    scores = [
        pair_similarity(tokens0, tokens1)
        for tokens0, tokens1 in zip(df['sentence0'], df['sentence1'])
    ]
    return pd.DataFrame({'labels': scores}, index=df.index, dtype=float)

# Jaccard-based similarity score for every training pair
guess_lex = train_df.pipe(lexical_simmilarity)
guess_lex.head(5)

Unnamed: 0,labels
0,0.533333
1,0.388889
2,0.333333
3,0.607143
4,0.227273


In [121]:
# Pearson correlation between the gold-standard scores and the lexical similarity.
# `refs` is not defined in any visible cell, so the gold scores loaded into
# train_gs are used directly; this keeps the cell runnable on a fresh kernel.
print(pearsonr(train_gs['labels'], guess_lex['labels'])[0])

0.46821213024625113
