# Semantic Textual Similarity in SemEval 2012

In [136]:
import os
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import csv

import numpy as np
from scipy.stats import pearsonr

from spellchecker import SpellChecker

import nltk, string
from nltk import pos_tag
from nltk.metrics import jaccard_distance
from nltk.corpus import wordnet_ic
from nltk.corpus import wordnet as wn
from nltk.tag import PerceptronTagger
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords

import gensim
from gensim import corpora

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.metrics.pairwise import manhattan_distances as md
from sklearn.metrics.pairwise import euclidean_distances as ed
from sklearn.metrics import jaccard_similarity_score as jsc
from sklearn.neighbors import DistanceMetric
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor

brown_ic = wordnet_ic.ic('ic-brown.dat')

# We will use the same scaler over all the notebook
scaler = StandardScaler()


## Load and concatenate the datasets:

In [137]:
train_path = '../data/train/'
test_path = '../data/test-gold/'

def load_and_concat(data_path):
    files = os.listdir(data_path)
    all_data = pd.DataFrame(columns=['sentence0','sentence1'])
    all_labels = pd.DataFrame(columns=['labels'])
    for file in files: 
        path = data_path + file
        if 'input' in file:
            print(path)
            fd = pd.read_csv(path, sep='\t', lineterminator='\n', names=['sentence0','sentence1'], header=None, quoting=csv.QUOTE_NONE)
            all_data = all_data.append(fd)
            fd = pd.read_csv(path.replace('input','gs'), sep='\t', lineterminator='\n', names=['labels'], header=None, quoting=csv.QUOTE_NONE)
            all_labels = all_labels.append(fd,ignore_index=True)
    return all_data.reset_index(drop=True), all_labels.reset_index(drop=True)

# We save appart the test befor preprocessing it to look at the badly classified sentences later
original_test, test_gs = load_and_concat(test_path)


train_df, train_gs = load_and_concat(train_path)
test_df, test_gs = load_and_concat(test_path)

train_df.shape, train_gs.shape,test_df.shape, test_gs.shape

../data/test-gold/STS.input.MSRpar.txt
../data/test-gold/STS.input.MSRvid.txt
../data/test-gold/STS.input.SMTeuroparl.txt
../data/test-gold/STS.input.surprise.SMTnews.txt
../data/test-gold/STS.input.surprise.OnWN.txt
../data/train/STS.input.MSRpar.txt
../data/train/STS.input.MSRvid.txt
../data/train/STS.input.SMTeuroparl.txt
../data/test-gold/STS.input.MSRpar.txt
../data/test-gold/STS.input.MSRvid.txt
../data/test-gold/STS.input.SMTeuroparl.txt
../data/test-gold/STS.input.surprise.SMTnews.txt
../data/test-gold/STS.input.surprise.OnWN.txt


((2234, 2), (2234, 1), (3108, 2), (3108, 1))

## Apply a spell checker to train and test
As this process is very slow we saved the datasets after

In [138]:
def auto_spell(text):
    """
    This functions apply the spell checker to a sentence, ignoring the propper nouns. 
    (Which are detected using a pretrained Perceptron pos tagger)
    """
    spell = SpellChecker()
    misspelled = spell.unknown(text.split())
    tagger = PerceptronTagger()
    tagged_text = tagger.tag(text)
    corrected_text = ''
    for i, word in enumerate(text.split()):
        if word in misspelled:
            tag = tagged_text[i][1]
            if tag != 'NNP':
                word = spell.correction(word)
        corrected_text +=word +' '
    return corrected_text.strip()



def correct_dataset(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(auto_spell)
    return dataset

# corrected_train = correct_dataset(train_df)
# corrected_train.to_csv('corrected_train.csv')
# corrected_test = correct_dataset(test_df)
# corrected_test.to_csv('corrected_test.csv')

train_df = pd.read_csv('corrected_train.csv', index_col=0)
test_df = pd.read_csv('corrected_test.csv', index_col=0)

## Before the preprocessing we extract some features 


### Helper functions

In [139]:
def penn_to_wn(tag):
    """ Convert between a Penn Treebank tag to a simplified Wordnet tag """
    if tag.startswith('N'):
        return 'n'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('R'):
        return 'r'
    return 'n'


def lemmatize_text(text):
    """Lemmatizes the text using the perceptron Pos Tagger"""
    tagger = PerceptronTagger()
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = WhitespaceTokenizer()
    s_tokenized = w_tokenizer.tokenize(text)
    s_tagged = tagger.tag(s_tokenized)
    return [lemmatizer.lemmatize(w[0], penn_to_wn(w[1])) for w in s_tagged]


def tagged_to_synset(word, tag):
    """Returns the synset of the word depending on its pos tag"""
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None
    
    
def sentence_lenght(s):
    return len(s.split())


def count_symbols(s):
    count = lambda l1, l2: sum([1 for x in l1 if x in l2])
    return count(s, set(string.punctuation))


def count_nouns(s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    NN_s0 = [values[0] for values in s0_tags if values[1] == 'NN']
    return len(NN_s0)


def count_verbs(s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    V_s0 = [values[0] for values in s0_tags if values[1] == 'VBP']
    return len(V_s0)


def count_digits(s):
    numbers = sum(c.isdigit() for c in s)
    return numbers

def remove_stop_words(s0):
    new_vector = ' '.join(word for word in s0.split() if word.lower() not in list(stopwords.words('english')))
    return new_vector

def _get_word_synonyms(word):
    word_synonyms = []
    for synset in wn.synsets(word):
        for lemma in synset.lemma_names():
            word_synonyms.append(lemma)
    return word_synonyms

### Feature extraction

In [140]:
def comon_stop_word_proportion(s0, s1):
    stopwords_s0 = [word.lower() for word in s0.split() if word.lower() in list(stopwords.words('english'))]
    stopwords_s1 = [word.lower() for word in s1.split() if word.lower() in list(stopwords.words('english'))]
    common = len(set(stopwords_s0) & set(stopwords_s1))
    if min(len(stopwords_s0), len(stopwords_s1)) > 0:
        return common / min(len(stopwords_s0), len(stopwords_s1))
    return 0


def calculate_jaccard(s0, s1):
    lemms_0 = [a for a in lemmatize_text(s0) if a]
    lemms_1 = [a for a in lemmatize_text(s1) if a]
    jaccard_simmilarity = (1 - jaccard_distance(set(lemms_0), set(lemms_1)))
    return jaccard_simmilarity

def count_shared_words(s0, s1):
    list3 = list(set(lemmatize_text(s0.lower())) & set(lemmatize_text(s1.lower())))
    return len(list3)


def count_common_propper_nouns(s0, s1):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    s1_tags = tagger.tag(s1.split())
    NNP_s0 = [values[0] for values in s0_tags if values[1] == 'NNP']
    NNP_s1 = [values[0] for values in s1_tags if values[1] == 'NNP']
    return len(set(NNP_s0) & set(NNP_s1))


def synonim_proportion(s0, s1):
    syn_count = 0
    for a in s0.split():
        for b in s1.split():
            if a == b:
                are_syns = 1
            else:
                are_syns = len(set(_get_word_synonyms(a)) & set(_get_word_synonyms(b))) > 0
            syn_count += are_syns
    max_len = min([len(s0.split()), len(s1.split())])
    return syn_count / max_len


def sentence_similarity(sentence1, sentence2, similarity):
    """ compute the sentence similarity using Wordnet """
    # Tokenize and tag
    sentence1 = pos_tag(lemmatize_text(sentence1))
    sentence2 = pos_tag(lemmatize_text(sentence2))

    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]

    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

    score, count = 0.0, 0

    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        similarities = [similarity(synset, ss) for ss in synsets2 if similarity(synset, ss)]
        try:
            best_score = max(similarities)
        except:
            best_score = 0
        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1
    # Average the values
    try:
        score /= count
    except:
        score = 0
    return score


def sentence_similarity_information_content(sentence1, sentence2, similarity):
    ''' compute the sentence similairty using information content from wordnet '''
    # Tokenize and tag
    sentence1 = pos_tag(lemmatize_text(sentence1))
    sentence2 = pos_tag(lemmatize_text(sentence2))
    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]
    score, count = 0.0, 0
    ppdb_score, align_cnt = 0, 0
    # For each word in the first sentence
    for synset in synsets1:
        L = []
        for ss in synsets2:
            try:
                L.append(similarity(synset, ss, brown_ic))
            except:
                continue
        if L:
            best_score = max(L)
            score += best_score
            count += 1
    # Average the values
    if count > 0: score /= count
    return score

def feature_extractor(dataset):
    features = pd.DataFrame(columns=['sentence_0_lengh', 'sentence_1_lengh',
                                     'number_of_nouns_s0', 'number_of_nouns_s1',
                                     'number_of_verbs_s0', 'number_of_verbs_s1',
                                     'number_of_symbols_s0', 'number_of_symbols_s1',
                                     'number_of_digits_s0', 'number_of_digits_1',
                                     'synonim_proportion', 'quantity_of_shared_words',
                                     'proper_nouns_shared', 'jaccard_distance', 'path_similarity',
                                     'wup_similarity', 'comon_stop_word_proportion', 'resnik_similarity',
                                     'jcn_similarity','lin_similarity'])
    # We count the stop word proportion before remove the stopwords
    for index, row in dataset.iterrows():
        s0 = row['sentence0']
        s1 = row['sentence1']
        features.loc[index, 'comon_stop_word_proportion'] = comon_stop_word_proportion(s0, s1)
    
    # We remove the stopwords to extract better features
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(remove_stop_words)

    for index, row in dataset.iterrows():
        s0 = row['sentence0']
        s1 = row['sentence1']
        features.loc[index, 'jaccard_distance'] = calculate_jaccard(s0, s1)
        features.loc[index, 'resnik_similarity'] = sentence_similarity_information_content(s0, s1, wn.res_similarity)
        features.loc[index, 'jcn_similarity'] = sentence_similarity_information_content(s0, s1, wn.jcn_similarity)
        features.loc[index, 'lin_similarity'] = sentence_similarity_information_content(s0, s1, wn.lin_similarity)
        features.loc[index, 'path_similarity'] = sentence_similarity(s0, s1, wn.path_similarity)
        features.loc[index, 'wup_similarity'] = sentence_similarity(s0, s1, wn.wup_similarity)
        features.loc[index, 'proper_nouns_shared'] = count_common_propper_nouns(s0, s1)
        features.loc[index, 'quantity_of_shared_words'] = count_shared_words(s0, s1)
        features.loc[index, 'synonim_proportion'] = synonim_proportion(s0, s1)
        features.loc[index, 'sentence_0_lengh'] = sentence_lenght(s0)
        features.loc[index, 'sentence_1_lengh'] = sentence_lenght(s1)
        features.loc[index, 'number_of_nouns_s0'] = count_nouns(s0)
        features.loc[index, 'number_of_nouns_s1'] = count_nouns(s1)
        features.loc[index, 'number_of_verbs_s0'] = count_verbs(s0)
        features.loc[index, 'number_of_verbs_s1'] = count_verbs(s1)
        features.loc[index, 'number_of_symbols_s0'] = count_symbols(s0)
        features.loc[index, 'number_of_symbols_s1'] = count_symbols(s1)
        features.loc[index, 'number_of_digits_s0'] = count_digits(s0)
        features.loc[index, 'number_of_digits_1'] = count_digits(s1)
    # We scalate resnik similarity to avoid overflow problems on the classifiers
    features['jcn_similarity'] = scaler.fit_transform(features[['jcn_similarity']].values)
    features['resnik_similarity'] = scaler.fit_transform(features[['resnik_similarity']].values)
    return features


In [141]:
# This step may take a while
train_features = feature_extractor(train_df)
test_features = feature_extractor(test_df)

In [142]:
train_features

Unnamed: 0,sentence_0_lengh,sentence_1_lengh,number_of_nouns_s0,number_of_nouns_s1,number_of_verbs_s0,number_of_verbs_s1,number_of_symbols_s0,number_of_symbols_s1,number_of_digits_s0,number_of_digits_1,synonim_proportion,quantity_of_shared_words,proper_nouns_shared,jaccard_distance,path_similarity,wup_similarity,comon_stop_word_proportion,resnik_similarity,jcn_similarity,lin_similarity
0,16,12,2,2,0,0,0,0,0,0,0.75,9,1,0.473684,0.58545,0.681905,0.636364,-0.0,0.0,0.595079
1,7,11,1,1,0,0,0,1,0,0,0.714286,5,0,0.384615,0.690476,0.770833,0.333333,-0.0,0.0,0.8406
2,9,10,3,1,1,1,0,1,0,0,0.777778,5,0,0.357143,0.682292,0.732955,0.333333,-0.0,0.0,0.687289
3,12,17,4,5,0,0,0,0,0,0,1.16667,11,0,0.611111,0.907407,0.949495,1,-0.0,0.0,0.870491
4,12,11,4,2,0,0,4,7,13,14,0.545455,3,0,0.0952381,0.298169,0.432659,0.666667,0.0,-0.0,0.424778
5,11,11,4,4,0,0,2,1,10,7,0.727273,3,0,0.1,0.439129,0.605975,0.5,-0.0,-0.0,0.50872
6,13,17,4,6,0,0,2,3,5,5,0.538462,7,0,0.304348,0.434484,0.649232,0.555556,-0.0,-0.0,0.569663
7,12,13,1,3,1,1,1,0,0,0,0.916667,8,0,0.388889,0.725253,0.79798,0.8,-0.0,0.0,0.585809
8,11,10,1,1,0,0,2,0,2,0,0.7,5,1,0.4,0.597403,0.610902,0.8,0.0,0.0,0.666667
9,11,10,4,3,0,0,0,1,0,4,0.6,5,0,0.333333,0.610399,0.703704,0.714286,-0.0,0.0,0.554685


In [143]:
# We scalate the features
train_features_std = scaler.fit_transform(train_features)
test_features_std = scaler.fit_transform(test_features)

## Simple Preprocessing 

In [144]:
def preprocessing(data, return_array=False):
    data = data.fillna('')
    for column in data.columns:
        print(column)
        # remove the digits and puntuation
        # data[column] = data[column].str.replace('\d+', '')
        # remove stopwords
        data[column] = data[column].apply(remove_stop_words)
        # convert to lowercase
        data[column] = data[column].str.replace('\W+', ' ')
        # replace continuous white spaces by a single one
        data[column] = data[column].str.replace('\s+', ' ')
        # words to lower
        data[column] = data[column].str.lower()
        # lematize
        data[column] = data[column].apply(lemmatize_text)
        if not return_array:
            data[column] = data[column].str.join(' ')
    return data

train_df = preprocessing(train_df,return_array=False)
test_df = preprocessing(test_df,return_array=False)

sentence0
sentence1
sentence0
sentence1


In [145]:
train_df.head()

Unnamed: 0,sentence0,sentence1
0,source close sale say vivendi keep door open b...,source close sale say vivendi keep door open b...
1,micron declare first quarterly profit three year,micron s number also mark first quarterly prof...
2,fine part fail republican effort force entice ...,perry say back senate s effort include fine fo...
3,american anglican council represent episcopali...,american anglican council represent episcopali...
4,tech load nasdaq composite rise 20 96 point 15...,technology lace nasdaq composite index ixic cl...


In [146]:
test_df.head()

Unnamed: 0,sentence0,sentence1
0,problem likely mean corrective change shuttle ...,say problem need correct space shuttle fleet c...
1,technology lace nasdaq composite index ixic in...,broad standard door 500 index spx inch 3 point...
2,let s huge black eye say publisher arthur chs ...,let s huge black eye arthur sulzberger newspap...
3,sec chairman william donaldson say building co...,think three s build confidence cop beat
4,vivendi share close 1 9 percent 15 80 euro par...,new work vivendi share 1 4 percent 51829


## Explore some lexical dimensions.

### Jaccard Distance

In [147]:
def lexical_simmilarity(df):
    guess = pd.DataFrame()
    for i in df.index:
        guess.loc[i, 'labels'] = 1 - jaccard_distance(set(df.loc[i, 'sentence0']), set(df.loc[i, 'sentence1']))
    return guess


guess_lex_train = lexical_simmilarity(train_df)
guess_lex_test = lexical_simmilarity(test_df)

print('train pearson: ', pearsonr(guess_lex_train['labels'], train_gs['labels'])[0])
print('test pearson: ', pearsonr(guess_lex_test['labels'], test_gs['labels'])[0])

train pearson:  0.7424601681998196
test pearson:  0.5976646227151003


In [148]:
def calculate_all_sims(dataset,symilarity_measure):
    results = []
    for index, row in dataset.iterrows():
        s0 = row['sentence0']
        s1 = row['sentence1']
        results.append(sentence_similarity(s0,s1, symilarity_measure))
    return results

def calculate_all_sims_ic(dataset,symilarity_measure):
    results = []
    for index, row in dataset.iterrows():
        s0 = row['sentence0']
        s1 = row['sentence1']
        results.append(sentence_similarity_information_content(s0,s1, symilarity_measure))
    return results


### Path

In [149]:
guess_lex_train = calculate_all_sims(train_df,wn.path_similarity)
guess_lex_test = calculate_all_sims(test_df,wn.path_similarity)

print('train pearson: ', pearsonr(guess_lex_train, train_gs['labels'])[0])
print('test pearson: ', pearsonr(guess_lex_test, test_gs['labels'])[0])

train pearson:  0.5633017937153743
test pearson:  0.5284083437429895


### Wu Palmer

In [150]:
guess_lex_train = calculate_all_sims(train_df,wn.wup_similarity)
guess_lex_test = calculate_all_sims(test_df,wn.wup_similarity)

print('train pearson: ', pearsonr(guess_lex_train, train_gs['labels'])[0])
print('test pearson: ', pearsonr(guess_lex_test, test_gs['labels'])[0])

train pearson:  0.43967804489375517
test pearson:  0.40097435468823356


### Lin

In [151]:
guess_lex_train = calculate_all_sims_ic(train_df,wn.lin_similarity)
guess_lex_test = calculate_all_sims_ic(test_df,wn.lin_similarity)

print('train pearson: ', pearsonr(guess_lex_train, train_gs['labels'])[0])
print('test pearson: ', pearsonr(guess_lex_test, test_gs['labels'])[0])

train pearson:  0.44921340347276706
test pearson:  0.39752191638094025


### Resnik Similarity

In [152]:
guess_lex_train = calculate_all_sims_ic(train_df,wn.res_similarity)
guess_lex_test = calculate_all_sims_ic(test_df,wn.res_similarity)

print('train pearson: ', pearsonr(guess_lex_train, train_gs['labels'])[0])
print('test pearson: ', pearsonr(guess_lex_test, test_gs['labels'])[0])

train pearson:  0.0
test pearson:  0.0


##  Explore the syntactic dimension alone.

In the preprocessing

## Explore the combination of both previous.

In the preprocessing and in the feature extractor

## Bag of Words 

In [153]:
def train_dictionary(df):
    
    sentences_tokenized = df.sentence0.tolist() + df.sentence1.tolist()
    
    dictionary = corpora.Dictionary(sentences_tokenized)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    dictionary.compactify()
    
    return dictionary
    
def get_vectors(df, dictionary):
    
    sentence0_vec = [dictionary.doc2bow(text) for text in df.sentence0.tolist()]
    sentence1_vec = [dictionary.doc2bow(text) for text in df.sentence1.tolist()]
    
    sentence0_csc = gensim.matutils.corpus2csc(sentence0_vec, num_terms=len(dictionary.token2id))
    sentence1_csc = gensim.matutils.corpus2csc(sentence1_vec, num_terms=len(dictionary.token2id))
    
    return sentence0_csc.transpose(),sentence1_csc.transpose()

tokenized_train = preprocessing(train_df, return_array = True)
dictionary = train_dictionary(tokenized_train)
print ("No of words in the dictionary = %s" %len(dictionary))

tokenized_test = preprocessing(test_df, return_array = True)

q1_csc, q2_csc = get_vectors(tokenized_train, dictionary)
q1_csc_test, q2_csc_test = get_vectors(tokenized_test, dictionary)

print (q1_csc.shape)
print (q1_csc_test.shape)

train_bog = np.concatenate((q1_csc.todense(), q2_csc.todense()), axis=1)
test_bog = np.concatenate((q1_csc_test.todense(), q2_csc_test.todense()), axis=1)


sentence0
sentence1
No of words in the dictionary = 1783
sentence0
sentence1
(2234, 1783)
(3108, 1783)


## Add the feature information to the bag of words representation

In [154]:
train_bog_extended = pd.concat([pd.DataFrame(train_bog),train_features],axis=1)
test_bog_extended = pd.concat([pd.DataFrame(test_bog),test_features],axis=1)

train_bog_extended_std = pd.concat([pd.DataFrame(train_bog),pd.DataFrame(train_features_std,columns=train_features.columns)],axis=1)
test_bog_extended_std = pd.concat([pd.DataFrame(test_bog),pd.DataFrame(test_features_std,columns=test_features.columns)],axis=1)

## NN

In [155]:
model_nn = MLPRegressor(hidden_layer_sizes=(100,100),validation_fraction=0.3, alpha=0.3,warm_start=False,max_iter=1000)
model_nn.fit(train_bog_extended_std,train_gs['labels'])

test_predicted = model_nn.predict(test_bog_extended_std)
print('test pearson: ', pearsonr(test_predicted, test_gs['labels'])[0])

test pearson:  0.5444314437398093


## Random Forest Regressor

In [158]:
rfr = RandomForestRegressor(n_jobs=-1,n_estimators=1000)
rfr.fit(train_bog_extended,train_gs['labels'])

test_predicted = rfr.predict(test_bog_extended)
print('test pearson: ', pearsonr(test_predicted, test_gs['labels'])[0])

test pearson:  0.7178333373880817
