In [1]:
from scipy.stats import pearsonr
import os
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.metrics import jaccard_distance
from sklearn.neural_network import MLPRegressor
import csv

## Statement
- Use data set and description of task Semantic Textual Similarity in SemEval 2012.
- Implement some approaches to detect paraphrase using sentence similarity metrics.
    + Explore some lexical dimensions. (Only word)
    + Explore the syntactic dimension alone. (Word respect to sentence)
    + Explore the combination of both previous.
- Add new components at your choice (optional).
- Compare and comment the results achieved by these approaches among them and among the official results.
- Send files to raco in IHLT STS Project before the oral presentation:
    + Jupyter notebook: sts-[Student1]-[Student2].ipynb
    + Slides: sts-[Student1]-[Student2].pdf


In [2]:
train_path = 'data/train/'
test_path = 'data/test-gold/'

def load_and_concat(data_path):
    files = os.listdir(data_path)
    all_data = pd.DataFrame(columns=['sentence0','sentence1'])
    all_labels = pd.DataFrame(columns=['labels'])
    for file in files: 
        path = data_path + file
        if 'input' in file:
            print(path)
            fd = pd.read_csv(path, sep='\t', lineterminator='\n', names=['sentence0','sentence1'], header=None, quoting=csv.QUOTE_NONE)
            all_data = all_data.append(fd)
            fd = pd.read_csv(path.replace('input','gs'), sep='\t', lineterminator='\n', names=['labels'], header=None, quoting=csv.QUOTE_NONE)
            all_labels = all_labels.append(fd)
    return all_data, all_labels 

train_df, train_gs = load_and_concat(train_path)
test_df, test_gs = load_and_concat(test_path)

train_df.shape, train_gs.shape,test_df.shape, test_gs.shape

data/train/STS.input.MSRpar.txt
data/train/STS.input.MSRvid.txt
data/train/STS.input.SMTeuroparl.txt
data/test-gold/STS.input.MSRpar.txt
data/test-gold/STS.input.MSRvid.txt
data/test-gold/STS.input.SMTeuroparl.txt
data/test-gold/STS.input.surprise.SMTnews.txt
data/test-gold/STS.input.surprise.OnWN.txt


((2234, 2), (2234, 1), (3108, 2), (3108, 1))

In [27]:
train_df.head()

Unnamed: 0,sentence0,sentence1
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl..."
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent..."
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....


In [3]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = WhitespaceTokenizer()
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

def preprocessing(data, return_array = False):
    # todo: better handling of na
    data = data.fillna('')
    for column in data.columns:
        print(column)
        # remove the digits and puntuation
        data[column] = data[column].str.replace('\d+', '')
        # convert to lowercase
        data[column] = data[column].str.replace('\W+', ' ')
        # replace continuous white spaces by a single one
        data[column] = data[column].str.replace('\s+', ' ')
        # words to lower
        data[column] =data[column].str.lower()
        # lematize
        data[column] = data[column].apply(lemmatize_text)
        if not return_array:
            data[column] = data[column].str.join(' ')
    return data

In [4]:
train_df = preprocessing(train_df)
train_df.head()

sentence0
sentence1


Unnamed: 0,sentence0,sentence1
0,but other source close to the sale said vivend...,but other source close to the sale said vivend...
1,micron ha declared it first quarterly profit f...,micron s number also marked the first quarterl...
2,the fine are part of failed republican effort ...,perry said he back the senate s effort includi...
3,the american anglican council which represents...,the american anglican council which represents...
4,the tech loaded nasdaq composite rose point to...,the technology laced nasdaq composite index ix...


In [5]:
test_df = preprocessing(test_df)
test_df.head()

sentence0
sentence1


Unnamed: 0,sentence0,sentence1
0,the problem likely will mean corrective change...,he said the problem need to be corrected befor...
1,the technology laced nasdaq composite index ix...,the broad standard poor s index spx inched up ...
2,it s a huge black eye said publisher arthur oc...,it s a huge black eye arthur sulzberger the ne...
3,sec chairman william donaldson said there is a...,i think there s a building confidence that the...
4,vivendi share closed percent at euro in paris ...,in new york vivendi share were percent down at


### Lexical 

In [11]:
def lexical_simmilarity(df):
    guess = pd.DataFrame()
    for i in df.index:
        guess.loc[i,'labels'] = 1 - jaccard_distance(set(df.loc[i,'sentence0']), set(df.loc[i,'sentence1']))
    return guess

guess_lex = lexical_simmilarity(train_df)
guess_lex.head()

Unnamed: 0,labels
0,0.533333
1,0.388889
2,0.333333
3,0.607143
4,0.227273


## TfidVectorizer

In [6]:
import nltk, string

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]


def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))


tfv = TfidfVectorizer(max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(train_df['sentence1']) + list(train_df['sentence0']) )

def return_simil(a,b):
    simil = tfv.transform([a,b])
    return ((simil * simil.T).A)[0,1]

def calculate_all_sims(df):
    results = []
    for i in df.values:
        results.append(return_simil(i[0], i[1]))
    return results


all_sims = calculate_all_sims(train_df)

print('train pearson: ', pearsonr(all_sims, train_gs['labels'])[0])

test_sims = calculate_all_sims(test_df)
print('test pearson:', pearsonr(test_sims, test_gs['labels'])[0])

train pearson:  0.500911988729081
test pearson: 0.5427591274375705


## Neural Networks

In [7]:
def test_model(model,xtrain,xtest):
    train_predicted =  model.predict(xtrain)
    test_predicted =   model.predict(xtest)
    print('train pearson: ', pearsonr(train_predicted, train_gs['labels'])[0])
    print('test pearson: ', pearsonr(test_predicted, test_gs['labels'])[0])

In [8]:
merged_sentences = train_df['sentence0'] + train_df['sentence1']
merged_test = test_df['sentence0'] + test_df['sentence1']

vectorizer = TfidfVectorizer(max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
merged_train = vectorizer.fit_transform(merged_sentences)
merged_test = vectorizer.transform(merged_test)

In [10]:
model_nn = MLPRegressor(hidden_layer_sizes=(2,),validation_fraction=0.3, alpha=0.3,warm_start=False)
model_nn.fit(merged_train,train_gs)

  y = column_or_1d(y, warn=True)


MLPRegressor(activation='relu', alpha=0.3, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(2,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.3,
       verbose=False, warm_start=False)

In [11]:
test_model(model_nn,merged_train,merged_test)

train pearson:  0.9382510869439981
test pearson:  0.3226937926750264


In [12]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [46]:
parameters = {'alpha': 10.0 ** -np.arange(0, 5), 'max_iter':[200],
              'hidden_layer_sizes':np.arange(1, 5),'solver': ['lbfgs','adam']}
nn_cv = GridSearchCV(MLPRegressor(), parameters, n_jobs=-1)
nn_cv.fit(merged_train,train_gs.values.ravel())
test_model(nn_cv,merged_train,merged_test)





GridSearchCV(cv=None, error_score='raise',
       estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'solver': ['lbfgs', 'adam'], 'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04]), 'hidden_layer_sizes': array([1, 2, 3, 4]), 'max_iter': [200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [47]:
test_model(nn_cv,merged_train,merged_test)

train pearson:  0.9954127015006783
test pearson:  0.5332380904341681


## Random Forest

In [14]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_jobs=-1)
rfr.fit(merged_train,train_gs.values.ravel())
test_model(rfr,merged_train,merged_test)

train pearson:  0.9509229213640846
test pearson:  0.5236885493222956


## SVM

In [15]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(merged_train,train_gs.values.ravel())
test_model(rfr,merged_train,merged_test)

train pearson:  0.9509229213640846
test pearson:  0.5236885493222957


## Same with another representation of the data

In [50]:
def transform_data(train,test):
    vectorizer = TfidfVectorizer(max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    vectorizer.fit(list(train['sentence1']) + list(train['sentence0']) )

    values_0 = vectorizer.transform(train['sentence0']).todense()
    values_1= vectorizer.transform(train['sentence1']).todense()
    train_for_model = np.concatenate((values_0, values_0), axis=1)
    
    values_0 = vectorizer.transform(test['sentence0']).todense()
    values_1= vectorizer.transform(test['sentence1']).todense()
    test_for_model = np.concatenate((values_0, values_0), axis=1)
    
    return train_for_model,test_for_model

new_train,new_test = transform_data(train_df,test_df)
new_train.sum()

19871.64922737047

In [51]:
model_nn.fit(new_train,train_gs)
test_model(model_nn,new_train,new_test)

train pearson:  0.9387026533266285
test pearson:  0.2732521284836814


In [None]:
nn_cv = GridSearchCV(MLPRegressor(), parameters, n_jobs=-1)
nn_cv.fit(new_train,train_gs.values.ravel())
test_model(nn_cv,new_train,new_test)

Exception ignored in: <object repr() failed>
Error in sys.excepthook:
Traceback (most recent call last):


Original exception was:
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc
KeyboardInterrupt
Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc
KeyboardInterrupt
Exception ignored in: <function WeakValueDictionary.__init__.<locals>.remove at 0x7f6f9c5d7d08>
Traceback (most recent call last):
  File "/usr/lib/python3.5/weakref.py", line 108, in remove
    def remove(wr, selfref=ref(self)):
KeyboardInterrupt
