In [461]:
from scipy.stats import pearsonr
import os
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
import nltk, string
import re
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.metrics.pairwise import manhattan_distances as md
from sklearn.metrics.pairwise import euclidean_distances as ed
from sklearn.metrics import jaccard_similarity_score as jsc
from sklearn.neighbors import DistanceMetric
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import numpy as np
from nltk.metrics import jaccard_distance
from sklearn.neural_network import MLPRegressor
from nltk.corpus import wordnet as wn
from nltk.tag import PerceptronTagger
from nltk.metrics import jaccard_distance 
from spellchecker import SpellChecker
from nltk import word_tokenize, pos_tag
from nltk.metrics import jaccard_distance

import csv
import warnings
warnings.filterwarnings('ignore')

## Statement
- Use data set and description of task Semantic Textual Similarity in SemEval 2012.
- Implement some approaches to detect paraphrase using sentence similarity metrics.
    + Explore some lexical dimensions. (Only word)
    + Explore the syntactic dimension alone. (Word respect to sentence)
    + Explore the combination of both previous.
- Add new components at your choice (optional).
- Compare and comment the results achieved by these approaches among them and among the official results.
- Send files to raco in IHLT STS Project before the oral presentation:
    + Jupyter notebook: sts-[Student1]-[Student2].ipynb
    + Slides: sts-[Student1]-[Student2].pdf


In [499]:
train_path = '../data/train/'
test_path = '../data/test-gold/'

def load_and_concat(data_path):
    files = os.listdir(data_path)
    all_data = pd.DataFrame(columns=['sentence0','sentence1'])
    all_labels = pd.DataFrame(columns=['labels'])
    for file in files: 
        path = data_path + file
        if 'input' in file:
            print(path)
            fd = pd.read_csv(path, sep='\t', lineterminator='\n', names=['sentence0','sentence1'], header=None, quoting=csv.QUOTE_NONE)
            all_data = all_data.append(fd)
            fd = pd.read_csv(path.replace('input','gs'), sep='\t', lineterminator='\n', names=['labels'], header=None, quoting=csv.QUOTE_NONE)
            all_labels = all_labels.append(fd,ignore_index=True)
    return all_data.reset_index(drop=True), all_labels.reset_index(drop=True)

original_test, test_gs = load_and_concat(test_path)


train_df, train_gs = load_and_concat(train_path)
test_df, test_gs = load_and_concat(test_path)

train_df.shape, train_gs.shape,test_df.shape, test_gs.shape

../data/test-gold/STS.input.MSRpar.txt
../data/test-gold/STS.input.MSRvid.txt
../data/test-gold/STS.input.SMTeuroparl.txt
../data/test-gold/STS.input.surprise.SMTnews.txt
../data/test-gold/STS.input.surprise.OnWN.txt
../data/train/STS.input.MSRpar.txt
../data/train/STS.input.MSRvid.txt
../data/train/STS.input.SMTeuroparl.txt
../data/test-gold/STS.input.MSRpar.txt
../data/test-gold/STS.input.MSRvid.txt
../data/test-gold/STS.input.SMTeuroparl.txt
../data/test-gold/STS.input.surprise.SMTnews.txt
../data/test-gold/STS.input.surprise.OnWN.txt


((2234, 2), (2234, 1), (3108, 2), (3108, 1))

In [493]:
train_df.head()

Unnamed: 0,sentence0,sentence1
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl..."
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent..."
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....


In [500]:
def correct_dataset(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(auto_spell)
    return dataset

corrected_train = correct_dataset(train_df)
corrected_train.to_csv('corrected_train.csv')
corrected_test = correct_dataset(test_df)
corrected_test.to_csv('corrected_test.csv')

In [501]:
train_df = corrected_train
test_df = corrected_test

In [502]:
def auto_spell(text):
    spell = SpellChecker()
    misspelled = spell.unknown(text.split())
    tagger = PerceptronTagger()
    tagged_text = tagger.tag(text)
#     print(misspelled)
    corrected_text = ''
    for i, word in enumerate(text.split()):
        if word in misspelled:
            tag = tagged_text[i][1]
            if tag != 'NNP':
                word = spell.correction(word)
        corrected_text +=word +' '
    return corrected_text.strip()

def sentence_lenght(s):
    return len(s.split())

def count_symbols(s):
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    return count(s,set(string.punctuation))

def count_shared_words(s0,s1):
    list3 = list(set(lemmatize_text(s0.lower()))&set(lemmatize_text(s1.lower())))
    return len(list3)

def count_digits(s):
    numbers = sum(c.isdigit() for c in s)
    return numbers

def synonim_words(a,b):
    return len(set(_get_word_synonyms(a))&set(_get_word_synonyms(b))) > 0
                                                                      
def count_synonims(s0,s1):
    sinonim = 0
    for a in s0.split():
        for b in s1.split():
            sinonim += synonim_words(a,b)
    return sinonim

def count_common_propper_nouns(s0,s1):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    s1_tags = tagger.tag(s1.split())
    NNP_s0 = [values[0] for values in s0_tags if values[1] =='NNP']
    NNP_s1 = [values[0] for values in s1_tags if values[1] =='NNP']
    return len(set(NNP_s0)&set(NNP_s1))

def count_nouns(s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    NN_s0 = [values[0] for values in s0_tags if values[1] =='NN']
    return len(NN_s0)

def count_verbs(s0):
    tagger = PerceptronTagger()
    s0_tags = tagger.tag(s0.split())
    V_s0 = [values[0] for values in s0_tags if values[1] =='VBP']
    return len(V_s0)

def remove_stop_words(s0):
    new_vector = ' '.join(word for word in s0.split() if word.lower() not in list(stopwords.words('english')))
    return new_vector

def lemmatize_text(text):
    tagger = PerceptronTagger()
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = WhitespaceTokenizer()
    s_tokenized = w_tokenizer.tokenize(text)
    s_tagged = tagger.tag(s_tokenized)
    return [lemmatizer.lemmatize(w[0],penn_to_wn(w[1])) for w in s_tagged]

def calculate_jaccard(s0,s1):
    lemms_0 = [a for  a in lemmatize_text(s0) if a]
    lemms_1 = [a for  a in lemmatize_text(s1) if a]
    
    jaccard_simmilarity = (1 - jaccard_distance(set(lemms_0), set(lemms_1)))
    return jaccard_simmilarity

def _get_word_synonyms(word):
    word_synonyms = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemma_names():
            word_synonyms.append(lemma)
    return word_synonyms

def synonim_proportion(s0,s1):
    syn_count = 0
    for a in s0.split():
        synonims_a = _get_word_synonyms(a)
        for b in s1.split():
            synonims_b = _get_word_synonyms(b)
            if a ==b:
                are_syns = 1
            else:
                are_syns = len(set(_get_word_synonyms(a))&set(_get_word_synonyms(b))) > 0
#             print(a,b,are_syns)
            syn_count += are_syns
    max_len = min([len(s0.split()),len(s1.split())])
#     print(syn_count, max_len)
    return syn_count/max_len


def penn_to_wn(tag):
    """ Convert between a Penn Treebank tag to a simplified Wordnet tag """
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
    return 'n'
 
def tagged_to_synset(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None
    
def sentence_similarity(sentence1, sentence2,similarity=wn.path_similarity):
    """ compute the sentence similarity using Wordnet """
    # Tokenize and tag
    sentence1 = pos_tag(lemmatize_text(sentence1))
    sentence2 = pos_tag(lemmatize_text(sentence2))
 
    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
 
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

    score, count = 0.0, 0
 
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        similarities = [similarity(synset,ss) for ss in synsets2 if similarity(synset,ss) ]
        try:
            best_score = max(similarities)
        except:
            best_score = 0
        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1
 
    # Average the values
    try:
        score /= count
    except:
        score = 0
    return score

def feature_extractor(dataset):
    for column in dataset.columns:
        dataset[column] = dataset[column].apply(remove_stop_words)
        dataset[column] = dataset[column].str.replace('\d+', '')

#         dataset[column] = dataset[column].apply(auto_spell)
    features = pd.DataFrame(columns=['sentence_0_lengh','sentence_1_lengh',
                                    'number_of_nouns_s0', 'number_of_nouns_s1',
                                    'number_of_verbs_s0', 'number_of_verbs_s1',
                                    'number_of_symbols_s0','number_of_symbols_s1',
                                   'number_of_digits_s0','number_of_digits_1',
                                   'synonim_proportion','quantity_of_shared_words', 
                                    'proper_nouns_shared','jaccard_distance','path_similarity',
                                    'wup_similarity'])
    for index, row in dataset.iterrows():
        s0 = row['sentence0']
        s1 = row['sentence1']
        features.loc[index,'jaccard_distance'] = calculate_jaccard(s0,s1)
        features.loc[index,'path_similarity'] = sentence_similarity(s0,s1,wn.path_similarity)
        features.loc[index,'wup_similarity'] = sentence_similarity(s0,s1,wn.wup_similarity)
        features.loc[index,'proper_nouns_shared'] = count_common_propper_nouns(s0,s1)
        features.loc[index,'quantity_of_shared_words'] = count_shared_words(s0,s1)
        features.loc[index,'synonim_proportion'] = synonim_proportion(s0,s1)
        features.loc[index,'sentence_0_lengh'] = sentence_lenght(s0)
        features.loc[index,'sentence_1_lengh'] = sentence_lenght(s1)
        features.loc[index,'number_of_nouns_s0'] = count_nouns(s0)
        features.loc[index,'number_of_nouns_s1'] = count_nouns(s1)
        features.loc[index,'number_of_verbs_s0'] = count_verbs(s0)
        features.loc[index,'number_of_verbs_s1'] = count_verbs(s1)
        features.loc[index,'number_of_symbols_s0'] = count_symbols(s0)
        features.loc[index,'number_of_symbols_s1'] = count_symbols(s1)
        features.loc[index,'number_of_digits_s0'] = count_digits(s0)
        features.loc[index,'number_of_digits_1'] = count_digits(s1)
    return features    

In [503]:
train_features = feature_extractor(train_df)

In [504]:
train_features.head()

Unnamed: 0,sentence_0_lengh,sentence_1_lengh,number_of_nouns_s0,number_of_nouns_s1,number_of_verbs_s0,number_of_verbs_s1,number_of_symbols_s0,number_of_symbols_s1,number_of_digits_s0,number_of_digits_1,synonim_proportion,quantity_of_shared_words,proper_nouns_shared,jaccard_distance,path_similarity,wup_similarity
0,16,12,2,2,0,0,0,0,0,0,0.75,9,1,0.473684,0.58545,0.681905
1,7,11,1,1,0,0,0,1,0,0,0.714286,5,0,0.384615,0.690476,0.770833
2,9,10,3,1,1,1,0,1,0,0,0.777778,5,0,0.357143,0.682292,0.732955
3,12,17,4,5,0,0,0,0,0,0,1.16667,11,0,0.611111,0.907407,0.949495
4,11,11,4,2,0,0,4,7,0,0,1.09091,4,0,0.1875,0.331349,0.4155


In [505]:
test_features = feature_extractor(test_df)

In [506]:
test_features.shape

(3108, 16)

In [507]:
def lemmatize_text(text):
    tagger = PerceptronTagger()
    lemmatizer = WordNetLemmatizer()
    w_tokenizer = WhitespaceTokenizer()
    s_tokenized = w_tokenizer.tokenize(text)
    s_tagged = tagger.tag(s_tokenized)
    return [lemmatizer.lemmatize(w[0],penn_to_wn(w[1])) for w in s_tagged]


def preprocessing(data, return_array = False):
    # todo: better handling of na
    data = data.fillna('')
    for column in data.columns:
        print(column)
        # remove the digits and puntuation
#         data[column] = data[column].str.replace('\d+', '')
        # convert to lowercase
        data[column] = data[column].str.replace('\W+', ' ')
        # replace continuous white spaces by a single one
        data[column] = data[column].str.replace('\s+', ' ')
        # words to lower
        data[column] = data[column].str.lower()
        # spell corrector 
        # data[column] = data[column].apply(auto_spell)
        # lematize
        data[column] = data[column].apply(lemmatize_text)
        if not return_array:
            data[column] = data[column].str.join(' ')
    return data

In [508]:
train_df = preprocessing(train_df)
test_df = preprocessing(test_df)

sentence0
sentence1
sentence0
sentence1


In [509]:
train_df.head()

Unnamed: 0,sentence0,sentence1
0,source close sale say vivendi keep door open b...,source close sale say vivendi keep door open b...
1,micron declare first quarterly profit three year,micron s number also mark first quarterly prof...
2,fine part fail republican effort force entice ...,perry say back senate s effort include fine fo...
3,american anglican council represent episcopali...,american anglican council represent episcopali...
4,tech load nasdaq composite rise point end high...,technology lace nasdaq composite index ixic cl...


In [510]:
test_df.head()

Unnamed: 0,sentence0,sentence1
0,problem likely mean corrective change shuttle ...,say problem need correct space shuttle fleet c...
1,technology lace nasdaq composite index ixic in...,broad standard door index spx inched point per...
2,let s huge black eye say publisher arthur chs ...,let s huge black eye arthur sulzberger newspap...
3,sec chairman william donaldson say building co...,think three s build confidence cop beat
4,vivendi share close percent euro paris fall pe...,new work vivendi share percent


### Lexical 

In [511]:
def lexical_simmilarity(df):
    guess = pd.DataFrame()
    for i in df.index:
        guess.loc[i,'labels'] = 1 - jaccard_distance(set(df.loc[i,'sentence0']), set(df.loc[i,'sentence1']))
    return guess



guess_lex_train = lexical_simmilarity(train_df)
guess_lex_test = lexical_simmilarity(test_df)

print('train pearson: ', pearsonr(guess_lex_train['labels'], train_gs['labels'])[0])
print('test pearson: ', pearsonr(guess_lex_test['labels'], test_gs['labels'])[0])

train pearson:  0.7232529860134881
test pearson:  0.5724291362690254


## TfidVectorizer

In [512]:
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]


def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))


tfv = TfidfVectorizer(max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(train_df['sentence1']) + list(train_df['sentence0']) )

def return_simil(a,b):
    simil = tfv.transform([a,b])
    return ((simil * simil.T).A)[0,1]

def calculate_all_sims(df):
    results = []
    for i in df.values:
        results.append(return_simil(i[0], i[1]))
    return results


all_sims = calculate_all_sims(train_df)
test_sims = calculate_all_sims(test_df)

print('train pearson: ', pearsonr(all_sims, train_gs['labels'])[0])
print('test pearson:', pearsonr(test_sims, test_gs['labels'])[0])

train pearson:  0.5274474313709707
test pearson: 0.5896542170443816


## Merged train with TfidVectorizer

In [513]:
merged_sentences = train_df['sentence0'] + train_df['sentence1']
merged_test = test_df['sentence0'] + test_df['sentence1']

vectorizer = TfidfVectorizer(max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
merged_train = vectorizer.fit_transform(merged_sentences)
merged_test = vectorizer.transform(merged_test)

## Bag of Words

In [514]:
def train_dictionary(df):
    
    sentences_tokenized = df.sentence0.tolist() + df.sentence1.tolist()
    
    dictionary = corpora.Dictionary(sentences_tokenized)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    dictionary.compactify()
    
    return dictionary
    
def get_vectors(df, dictionary):
    
    sentence0_vec = [dictionary.doc2bow(text) for text in df.sentence0.tolist()]
    sentence1_vec = [dictionary.doc2bow(text) for text in df.sentence1.tolist()]
    
    sentence0_csc = gensim.matutils.corpus2csc(sentence0_vec, num_terms=len(dictionary.token2id))
    sentence1_csc = gensim.matutils.corpus2csc(sentence1_vec, num_terms=len(dictionary.token2id))
    
    return sentence0_csc.transpose(),sentence1_csc.transpose()

tokenized_train = preprocessing(train_df, return_array = True)
dictionary = train_dictionary(tokenized_train)
print ("No of words in the dictionary = %s" %len(dictionary))

tokenized_test = preprocessing(test_df, return_array = True)

q1_csc, q2_csc = get_vectors(tokenized_train, dictionary)
q1_csc_test, q2_csc_test = get_vectors(tokenized_test, dictionary)

print (q1_csc.shape)
print (q1_csc_test.shape)

train_bog = np.concatenate((q1_csc.todense(), q2_csc.todense()), axis=1)
test_bog = np.concatenate((q1_csc_test.todense(), q2_csc_test.todense()), axis=1)

sentence0
sentence1
No of words in the dictionary = 1727
sentence0
sentence1
(2234, 1727)
(3108, 1727)


## Bag of words extended

In [515]:
train_bog_extended = pd.concat([pd.DataFrame(train_bog),train_features],axis=1)
test_bog_extended = pd.concat([pd.DataFrame(test_bog),test_features],axis=1)

## Trying models

In [516]:
def test_model(model,xtrain,xtest):
    train_predicted =  model.predict(xtrain)
    test_predicted =   model.predict(xtest)
    print('train pearson: ', pearsonr(train_predicted, train_gs['labels'])[0])
    print('test pearson: ', pearsonr(test_predicted, test_gs['labels'])[0])

def train_and_test_model(model, train,test,model_name='model'):
    model.fit(train,train_gs)
    test_model(model,train,test)
    if model_name == 'rfr':
        print_feature_importance(rfr,train)

    

def print_feature_importance(rfr,train):
    importances=rfr.feature_importances_ ## get the feature importance
    # print("Original ",np.argsort(importances))
    indices = np.argsort(importances)[::-1]
    try:
        feat_labels = train.columns
    except:
        return
    for f in range(10):
        print("%2d) %-*s %f" % (f+1,30,feat_labels[indices[f]],
                                        importances[indices[f]]))
        
def run_with_all_datasets(model,model_name):
    print(model_name)
    print('Only Features')
    train_and_test_model(model,train_features,test_features,model_name)
#     print('Only TifVectorizer')
#     train_and_test_model(model,merged_train,merged_test,model_name)
    print('Only Bag of Words')
#     train_and_test_model(model,train_bog,test_bog,model_name)
    print('Bag of Words + features')
    train_and_test_model(model,train_bog_extended,test_bog_extended,model_name)

def show_worst_test(predicted, k=5):
    print('Worst results in voting:')
    err = np.abs(predicted - test_gs['labels'])
    idx = np.argpartition(err, -k)[-k:]
    for i in idx:
        print(test_df.loc[i,'sentence0'],'\n',test_df.loc[i,'sentence1'] ,'\noriginal',test_gs.loc[i,'labels'],'predicted', predicted[i])
        print(original_test.loc[i,'sentence0'],'\n',original_test.loc[i,'sentence1'] )
        print(test_features.loc[i,['jaccard_distance','path_similarity',
                                    'wup_similarity']])
        print('-------------------------------------------')

## Neural Networks

In [517]:
model_nn = MLPRegressor(hidden_layer_sizes=(2,),validation_fraction=0.3, alpha=0.3,warm_start=False,max_iter=1000)
# run_with_all_datasets(model_nn,'Neural networks')

## NN CV

In [518]:
parameters = {'alpha': 10.0 ** -np.arange(0, 5), 'max_iter':[1000],
              'hidden_layer_sizes':np.arange(1, 8),'solver': ['lbfgs','adam'],'warm_start': [False]}
nn_cv = GridSearchCV(MLPRegressor(), parameters, n_jobs=-1)
# run_with_all_datasets(nn_cv,'nn cv')

## Random Forest

In [519]:
rfr = RandomForestRegressor(n_jobs=-1,n_estimators=1000)
# run_with_all_datasets(rfr,'rfr')

In [520]:
rfr.fit(train_bog_extended, train_gs['labels'])
print_feature_importance(rfr,train_bog_extended)

 1) quantity_of_shared_words       0.492144
 2) synonim_proportion             0.058598
 3) 43                             0.033117
 4) jaccard_distance               0.028599
 5) path_similarity                0.027187
 6) sentence_1_lengh               0.024874
 7) wup_similarity                 0.021678
 8) sentence_0_lengh               0.012108
 9) number_of_nouns_s1             0.011942
10) number_of_symbols_s0           0.011475


In [521]:
test_predicted = rfr.predict(test_bog_extended)
print('test pearson: ', pearsonr(test_predicted, test_gs['labels'])[0])

test pearson:  0.7213255236883195


In [522]:
show_worst_test(test_predicted)

Worst results in voting:
act control property 
 state fact owner 
original 4.25 predicted 0.00025
The act of having and controlling property. 
 the state or fact of being an owner.
jaccard_distance           0
path_similarity     0.109007
wup_similarity      0.327561
Name: 2517, dtype: object
-------------------------------------------
sex 
 sexual intercourse 
original 5.0 predicted 0.07465799999999999
 have sex with 
 have sexual intercourse with.
jaccard_distance            0
path_similarity     0.0769231
wup_similarity       0.142857
Name: 2664, dtype: object
-------------------------------------------
resist 
 act opposition 
original 4.5 predicted 0.072365
be against, resist 
 act against or in opposition to.
jaccard_distance    0
path_similarity     0
wup_similarity      0
Name: 2586, dtype: object
-------------------------------------------
concern affair 
 situation event think 
original 4.5 predicted 0.04339700000000003
a concern or affair 
 some situation or event that is th

In [None]:
sentence0 = 'kangroo eat something' 
sentence1 = 'kangaroo eat'
sentence_similarity(sentence1, sentence2,similarity=wn.path_similarity)

## SVM

In [None]:
svr = SVR()
run_with_all_datasets(svr,'svr')

## Try using all the distances

In [None]:
minkowski_dis = DistanceMetric.get_metric('minkowski')
mms_scale_man = MinMaxScaler()
mms_scale_euc = MinMaxScaler()
mms_scale_mink = MinMaxScaler()

def get_similarity_values(q1_csc, q2_csc):
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    minkowsk_dis = []
    
    for i,j in zip(q1_csc, q2_csc):
        sim = cs(i,j)
        cosine_sim.append(sim[0][0])
        sim = md(i,j)
        manhattan_dis.append(sim[0][0])
        sim = ed(i,j)
        eucledian_dis.append(sim[0][0])
        i_ = i.toarray()
        j_ = j.toarray()
        try:
            sim = jsc(i_,j_)
            jaccard_dis.append(sim)
        except:
            jaccard_dis.append(0)
            
        sim = minkowski_dis.pairwise(i_,j_)
        minkowsk_dis.append(sim[0][0])
    
    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis    


# cosine_sim = get_cosine_similarity(q1_csc, q2_csc)
cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis = get_similarity_values(q1_csc, q2_csc)

In [None]:
from sklearn.metrics import log_loss

def calculate_logloss(y_true, y_pred):
    loss_cal = log_loss(y_true, y_pred)
    return loss_cal

y_pred_cos, y_pred_man, y_pred_euc, y_pred_jac, y_pred_mink = get_similarity_values(q1_csc_test, q2_csc_test)
predictions = [y_pred_cos, y_pred_man, y_pred_euc, y_pred_jac, y_pred_mink]
for test_predicted in predictions:
    print('test pearson: ', pearsonr(test_predicted, test_gs['labels'])[0])



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

X_train = pd.DataFrame({"cos" : cosine_sim, "man" : manhattan_dis, "euc" : eucledian_dis, "jac" : jaccard_dis, "min" : minkowsk_dis})

X_test = pd.DataFrame({"cos" : y_pred_cos, "man" : y_pred_man, "euc" : y_pred_euc, "jac" : y_pred_jac, "min" : y_pred_mink})

rfr = RandomForestRegressor()
rfr.fit(X_train,train_gs.values.ravel())

svr = SVR()
svr.fit(X_train,train_gs.values.ravel())

y_rfr_predicted = rfr.predict(X_test)
y_svr_predicted = svr.predict(X_test)

print('test pearson: ', pearsonr(y_rfr_predicted, test_gs['labels'])[0])
print('test pearson: ', pearsonr(y_svr_predicted, test_gs['labels'])[0])
