# create bow-sentence embeddings

using the word vectors trained in previous

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

In [2]:
# read data and format (strip punct, lower, # token replace)
rawdata = pd.read_csv("data/merged_corpus_phrases_new.csv", header=None)
rawdata.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,5523.0,0,amex_01a.xml,1,caller,my name is B C. and i would like to plan a trip.,내 이름은 B. C. 나는 여행을 계획하고 싶습니다.,stateIntent,none
1,5524.0,1,amex_01a.xml,2,agent,and the date you need to leave?,떠나야하는 날짜?,reqInfo,date
2,5525.0,2,amex_01a.xml,3,caller,well um i have to work a little bit backwards ...,"음, 여기서 좀 거꾸로 돌아와야 해.",answer-stateConstraint,location
3,5526.0,3,amex_01a.xml,4,agent,ok.,예.,acknowledge,none
4,5527.0,4,amex_01a.xml,5,caller,i want to leave on the first flight out of ORD...,나는 ORD에서 첫 비행을 떠나고 싶다. 월요일 아침에.,refer,time-day


In [3]:
messages = rawdata[5].tolist()
def stripit(s):
    s = s.lower()
    for p in ['.', ',', '?', '-']:
        s = s.replace(p, '')
    for n in ['1','2','3','4','5','6','7','8','9','0']:
        s = s.replace(n, '#')
    return s
messages = [stripit(s) for s in messages]
sent_toks = [s.split() for s in messages]

In [4]:
speechacts = rawdata[7].tolist()
np.save("dbases/speech_acts.npy", speechacts)
topics = rawdata[8].tolist()
np.save("dbases/topics.npy", topics)

In [5]:
vectors = np.load("dbases/w2v_word_vectors.npy")
vocab = list(np.load("dbases/w2v_word_tokens.npy"))
vectors.shape

(2095, 200)

In [6]:
def bowvect(toks, vocab, vectors):
    idxes = []
    for t in toks:
        if t in vocab:
            idxes.append(vocab.index(t))
    tokvects = vectors[idxes]
    if tokvects.shape[0] == 0:
        tokvects = np.zeros(vectors.shape[1])
    sentvect = np.mean(tokvects, axis=0)
    return sentvect

In [7]:
bowvect(["london", "euston"], vocab, vectors).shape

(200,)

# sent vects and save

In [8]:
sent_vects = []
for s in sent_toks:
    sent_vects.append(bowvect(s, vocab, vectors))

In [9]:
def sent_sim(s1, s2, vocab, vectors):
    v1 = bowvect(stripit(s1), vocab, vectors)
    v2 = bowvect(stripit(s2), vocab, vectors)
    similarity = 1 - cosine(v1, v2)
    return similarity

In [10]:
sent_sim("to london euston on october 3rd", "to london on september 1st", vocab, vectors)

0.91372931003570557

In [11]:
sent_sim("to london euston on october #rd", "hello sandra speaking", vocab, vectors)

0.55625003576278687

In [12]:
# save

In [13]:
np.save("dbases/w2v_sent_vectors.npy", sent_vects)
np.save("dbases/w2v_sent_tokens.npy", messages)

# checking class

In [14]:
class SentSimClass:
    
    def __init__(self, wvocab, wvectors, sentences):
        self.vocab = wvocab
        self.vectors = wvectors
        self.sentences = sentences
        self.sent_toks = [self._stripit(s) for s in sentences]
        self.sent_vects = [self._bowvect(s) for s in sent_toks]
        
    def _stripit(self, s):
        s = s.lower()
        for p in ['.', ',', '?', '-']:
            s = s.replace(p, '')
        for n in ['1','2','3','4','5','6','7','8','9','0']:
            s = s.replace(n, '#')
        return s
    
    def _bowvect(self, toks):
        idxes = []
        for t in toks:
            if t in self.vocab:
                idxes.append(self.vocab.index(t))
        tokvects = self.vectors[idxes]
        if tokvects.shape[0] == 0:
            tokvects = np.zeros(self.vectors[0].shape[0])
        sentvect = np.mean(tokvects, axis=0)
        return sentvect
    
    def sent_sim(self, s1, s2):
        v1 = self._bowvect(self._stripit(s1))
        v2 = self._bowvect(self._stripit(s2))
        similarity = 1 - cosine(v1, v2)
        return similarity
    
    def bestsent(self, sent, n=10):
        similarities = [self.sent_sim(sent, toks) for toks in self.sent_toks]
        tuples = list(zip(similarities, self.sentences))
        tuples = sorted(tuples, key = lambda x: x[0], reverse=True)
        return tuples[:n]
        

In [15]:
clio = SentSimClass(vocab, vectors, messages)

In [16]:
clio.bestsent('are there any flights to chicago')

[(0.98653578758239746,
  'i i hesitate uh for him particularly to get anything uh because he does change a lot '),
 (0.98597294092178345, "certainly oh ok great that's fine "),
 (0.98476600646972656, 'ok i he wants this changed also '),
 (0.98158979415893555,
  "i already have and everything in chicago ok i i had what's what's the total cost on that then "),
 (0.98130744695663452,
  "so if we do that i'm going to have to send it off to the airlines for faring and i can't guarantee how fast they'll come back with a fare "),
 (0.98115324974060059, 'uh what are we changing this to '),
 (0.98064440488815308, 'what city are they travelling to '),
 (0.98032605648040771,
  "now i'm just going to check to see what's the cheapest fare available with the first class ticket ok then "),
 (0.98006981611251831, 'ge are there any direct flights at all '),
 (0.97971230745315552, "i'm just checking it now er yeah nan that's ")]