In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import nltk
from nltk.corpus import wordnet as wn
from nltk import word_tokenize

import re




In [2]:
import nltk
from nltk import word_tokenize
import pandas as pd
#from nltk.stem import PortStemmer
#ps = PortStemmer()

def tokenize(q1, q2):
    """
        q1 and q2 are sentences/questions. Function returns a list of tokens for both.
    """
    return word_tokenize(q1), word_tokenize(q2)


def posTag(q1, q2):
    """
        q1 and q2 are lists. Function returns a list of POS tagged tokens for both.
    """
    return nltk.pos_tag(q1), nltk.pos_tag(q2)


def stemmer(tag_q1, tag_q2):
    """
        tag_q = tagged lists. Function returns a stemmed list.
    """

    stem_q1 = []
    stem_q2 = []

    for token in tag_q1:
        stem_q1.append(stem(token))

    for token in tag_q2:
        stem_q2.append(stem(token))

    return stem_q1, stem_q2

In [3]:
class Lesk(object):

    def __init__(self, sentence):
        self.sentence = sentence
        self.meanings = {}
        for word in sentence:
            self.meanings[word] = ''

    def getSenses(self, word):
        # print word
        return wn.synsets(word.lower())

    def getGloss(self, senses):

        gloss = {}

        for sense in senses:
            gloss[sense.name()] = []

        for sense in senses:
            gloss[sense.name()] += word_tokenize(sense.definition())

        return gloss

    def getAll(self, word):
        senses = self.getSenses(word)

        if senses == []:
            return {word.lower(): senses}

        return self.getGloss(senses)

    def Score(self, set1, set2):
        # Base
        overlap = 0

        # Step
        for word in set1:
            if word in set2:
                overlap += 1

        return overlap

    def overlapScore(self, word1, word2):

        gloss_set1 = self.getAll(word1)
        if self.meanings[word2] == '':
            gloss_set2 = self.getAll(word2)
        else:
            # print 'here'
            gloss_set2 = self.getGloss([wn.synset(self.meanings[word2])])

        # print gloss_set2

        score = {}
        for i in gloss_set1.keys():
            score[i] = 0
            for j in gloss_set2.keys():
                score[i] += self.Score(gloss_set1[i], gloss_set2[j])

        bestSense = None
        max_score = 0
        for i in gloss_set1.keys():
            if score[i] > max_score:
                max_score = score[i]
                bestSense = i

        return bestSense, max_score

    def lesk(self, word, sentence):
        maxOverlap = 0
        context = sentence
        word_sense = []
        meaning = {}

        senses = self.getSenses(word)

        for sense in senses:
            meaning[sense.name()] = 0

        for word_context in context:
            if not word == word_context:
                score = self.overlapScore(word, word_context)
                if score[0] == None:
                    continue
                meaning[score[0]] += score[1]

        if senses == []:
            return word, None, None

        self.meanings[word] = max(meaning.keys(), key=lambda x: meaning[x])
        return word, self.meanings[word], wn.synset(self.meanings[word]).definition()

In [4]:
import math
import numpy as np
from scipy import spatial
from nltk.corpus import wordnet as wn
from nltk.metrics import edit_distance

def path(set1, set2):
    return wn.path_similarity(set1, set2)


def wup(set1, set2):
    return wn.wup_similarity(set1, set2)


def edit(word1, word2):
    if float(edit_distance(word1, word2)) == 0.0:
        return 0.0
    return 1.0 / float(edit_distance(word1, word2))
def computePath(q1, q2):

    R = np.zeros((len(q1), len(q2)))

    for i in range(len(q1)):
        for j in range(len(q2)):
            if q1[i][1] == None or q2[j][1] == None:
                sim = edit(q1[i][0], q2[j][0])
            else:
                sim = path(wn.synset(q1[i][1]), wn.synset(q2[j][1]))

            if sim == None:
                sim = edit(q1[i][0], q2[j][0])

            R[i, j] = sim
    #print("Pathwise similarity matrix.")
    #print(R)

    return R
def computeWup(q1, q2):
 

    R = np.zeros((len(q1), len(q2)))

    for i in range(len(q1)):
        for j in range(len(q2)):
            if q1[i][1] == None or q2[j][1] == None:
                sim = edit(q1[i][0], q2[j][0])
            else:
                sim = wup(wn.synset(q1[i][1]), wn.synset(q2[j][1]))

            if sim == None:
                sim = edit(q1[i][0], q2[j][0])

            R[i, j] = sim

   # print(R)

    return R

In [5]:
def overallSim(q1, q2, R):

    sum_X = 0.0
    sum_Y = 0.0

    for i in range(len(q1)):
        max_i = 0.0
        for j in range(len(q2)):
            if R[i, j] > max_i:
                max_i = R[i, j]
        sum_X += max_i

    for i in range(len(q1)):
        max_j = 0.0
        for j in range(len(q2)):
            if R[i, j] > max_j:
                max_j = R[i, j]
        sum_Y += max_j
        
    if (float(len(q1)) + float(len(q2))) == 0.0:
        return 0.0
        
    overall = (sum_X + sum_Y) / (2 * (float(len(q1)) + float(len(q2))))

    return overall

In [6]:
def semanticSimilarity(q1, q2):

    tokens_q1, tokens_q2 = tokenize(q1, q2)
   # stem_q1, stem_q2 = stemmer(tokens_q1, tokens_q2)
    tag_q1, tag_q2 = posTag(tokens_q1, tokens_q2)

    sentence1 = []
    for i, word in enumerate(tag_q1):
        if 'NN' in word[1] or 'JJ' in word[1] or 'VB' in word[1]:
            sentence1.append(word[0])

    sense1 = Lesk(sentence1)
    sentence1Means = []
    for word in sentence1:
        sentence1Means.append(sense1.lesk(word, sentence1))

    sentence2 = []
    for i, word in enumerate(tag_q2):
        if 'NN' in word[1] or 'JJ' in word[1] or 'VB' in word[1]:
            sentence2.append(word[0])

    sense2 = Lesk(sentence2)
    sentence2Means = []
    for word in sentence2:
        sentence2Means.append(sense2.lesk(word, sentence2))
    # for i, word in enumerate(sentence1Means):
    #     print sentence1Means[i][0], sentence2Means[i][0]

    R1 = computePath(sentence1Means, sentence2Means)
    R2 = computeWup(sentence1Means, sentence2Means)

    R = (R1 + R2) / 2

    # print R

    return overallSim(sentence1Means, sentence2Means, R)
import nltk
STOP_WORDS = nltk.corpus.stopwords.words()
def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")

    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)

    sentence = " ".join(sentence)
    return sentence

In [7]:
from sklearn.metrics import log_loss
import pandas as pd
X_train = pd.read_csv('F:\\sts-en-test-gs-2014\\deep-siamese-text-similarity-master\\train_snli1.csv',delimiter='\t')
col1= list(X_train["Sentence1."])
col2=list(X_train["Sentence2."])
print("\n Sentence1.")
print("\n",col1)
print("\n Sentence2.")
print("\n",col2)
#print(X_train)
#X_train = X_train.dropna(how="any")

#y = X_train['is_duplicate']
#col1 = list(X_train['Sentence.'])
print('Exported Cleaned train Data, no need for cleaning')
y_pred = []
#count = 0
print('Calculating similarity for the training data, please wait.')
for i in range(len(col2)):
    str1 = str(col1[i]) 
    #for j in range(len(col2)):
    str2 = str(col2[i])
    sim = semanticSimilarity(str1, str2)
    print("sen1["+str(i)+"]-->"+str1+","+" sen2["+str(i)+"]-->"+str2+","+"  simmilarity:"+str(sim))#+str(row[6]))
    y_pred.append(sim)
print(y_pred)
#max = y_pred[0]
#for j in range(len(y_pred)):
 #   if(y_pred[j]>= max)
  #  max = 

#printf("the most similar sentences are :")
#for i in range (len(col2)):
    
#output = pd.DataFrame(list(zip(X_train['id'].tolist(), y_pred)), columns=['id', 'similarity'])
#output.to_csv('semantic_train.csv', index=False)
#print("Log Loss Score:")
#print(log_loss(np.array(y_pred)))

'''for col in range(len(col1)):
    X_train[col] = X_train[col].apply(clean_sentence)

print(X_train)
'''
'''y_pred = []
count = 0
print('Calculating similarity for the training data, please wait.')
for row in X_train.itertuples():
     print(row)
    q1 = str(row[4])
    q2 = str(row[5])

    sim = semanticSimilarity(q1, q2)
    count += 1
    if count % 10000 == 0:
        print(str(count)+", "+str(sim)+", "+str(row[6]))
    y_pred.append(sim)
    
output = pd.DataFrame(list(zip(X_train['id'].tolist(), y_pred)), columns=['id', 'similarity'])
output.to_csv('semantic_train.csv', index=False)

print("Log Loss Score:")
print(log_loss(np.array(y_pred)))
'''



 Sentence1.

 ['A man appears to be closing a gate next to a nail-gloss ad.', 'A man appears to be closing a gate next to a nail-gloss ad.', 'A man in a blue shirt locks a blue shutter next to a nail polish advertisement.', 'A man in a blue shirt locks a blue shutter next to a nail polish advertisement.', 'A man raising or lowering a blue and red garage door next to a nail polish poster.', 'A man raising or lowering a blue and red garage door next to a nail polish poster.', 'The children in blue are singing for a group of people.', 'The children in blue are singing for a group of people.', 'Several adults are tending their children outside on a sunny day.', 'Several adults are tending their children outside on a sunny day.', 'A group of children, boys and girls, dressed in blue choir robes are standing in front of an audience.', 'A group of children, boys and girls, dressed in blue choir robes are standing in front of an audience.', 'A group of young children wearing blue robes are st

sen1[19]-->A group of young children wearing blue robes are standing in front of a crowd with their hands interlaced., sen2[19]-->Children are holding hands.,  simmilarity:0.25186011904761907
sen1[20]-->A group of young children wearing blue robes are standing in front of a crowd with their hands interlaced., sen2[20]-->A group of children stand in front of a crowd,  simmilarity:0.24172901300928984
sen1[21]-->A group of young children wearing blue robes are standing in front of a crowd with their hands interlaced., sen2[21]-->Children are swimming.,  simmilarity:0.27976190476190477
sen1[22]-->Two men standing outside wearing jackets and caps., sen2[22]-->Two men are outdoors.,  simmilarity:0.24576118326118326
sen1[23]-->Two men standing outside wearing jackets and caps., sen2[23]-->Two men are swimming.,  simmilarity:0.26262626262626265
sen1[24]-->Two men dressed jackets and hats are standing on a terrace, sen2[24]-->2 well dressed men are outside.,  simmilarity:0.27606585333858064
sen

'y_pred = []\ncount = 0\nprint(\'Calculating similarity for the training data, please wait.\')\nfor row in X_train.itertuples():\n     print(row)\n    q1 = str(row[4])\n    q2 = str(row[5])\n\n    sim = semanticSimilarity(q1, q2)\n    count += 1\n    if count % 10000 == 0:\n        print(str(count)+", "+str(sim)+", "+str(row[6]))\n    y_pred.append(sim)\n    \noutput = pd.DataFrame(list(zip(X_train[\'id\'].tolist(), y_pred)), columns=[\'id\', \'similarity\'])\noutput.to_csv(\'semantic_train.csv\', index=False)\n\nprint("Log Loss Score:")\nprint(log_loss(np.array(y_pred)))\n'