In [1]:
import os
import time
import sys
import re
from subprocess import call
import numpy as np
from nltk import TweetTokenizer
from nltk.tokenize import StanfordTokenizer

In [2]:

# Path to the fastText/sent2vec executable, resolved against the notebook's
# working directory.
FASTTEXT_EXEC_PATH = os.path.abspath("./fasttext")

# Root of the Stanford POS-tagger distribution. The default is the location
# used when this notebook was written; set the SENT2VEC_SNLP_PATH environment
# variable to point at a different install without editing the notebook.
BASE_SNLP_PATH = os.environ.get(
    "SENT2VEC_SNLP_PATH",
    "/home/cmps143a1/workspace/sent2vec/stanford/stanford-postagger-2016-10-31",
)
SNLP_TAGGER_JAR = os.path.join(BASE_SNLP_PATH, "stanford-postagger.jar")

# sent2vec model files, one per (corpus, n-gram) combination, resolved against
# the notebook's working directory.
MODEL_WIKI_UNIGRAMS = os.path.abspath("./model.bin")
MODEL_WIKI_BIGRAMS = os.path.abspath("./sent2vec_wiki_bigrams")
MODEL_TWITTER_UNIGRAMS = os.path.abspath('./sent2vec_twitter_unigrams')
MODEL_TWITTER_BIGRAMS = os.path.abspath('./sent2vec_twitter_bigrams')


In [3]:
def tokenize(tknzr, sentence, to_lower=True):
    """Tokenize a sentence and normalise URLs and user mentions.

    Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentence: a string to be tokenized
        - to_lower: lowercasing or not

    Returns the tokens joined by single spaces, with every URL replaced by
    '<url>' and every @mention replaced by '<user>'.
    """
    sentence = sentence.strip()
    sentence = ' '.join(format_token(tok) for tok in tknzr.tokenize(sentence))
    if to_lower:
        sentence = sentence.lower()
    # Raw strings keep the regex escapes (\. \s \@) from being parsed as
    # (invalid) Python string escapes; the patterns themselves are unchanged.
    sentence = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '<url>', sentence)  # replace urls by <url>
    sentence = re.sub(r'(\@[^\s]+)', '<user>', sentence)  # replace @user268 by <user>
    return sentence

def format_token(token):
    """Map a bracket placeholder token back to its literal bracket character.

    The placeholders '-LRB-', '-RRB-', '-RSB-', '-LSB-', '-LCB-' and '-RCB-'
    become '(', ')', ']', '[', '{' and '}' respectively; any other token is
    returned unchanged.
    """
    bracket_aliases = (
        ('-LRB-', '('),
        ('-RRB-', ')'),
        ('-RSB-', ']'),
        ('-LSB-', '['),
        ('-LCB-', '{'),
        ('-RCB-', '}'),
    )
    for placeholder, literal in bracket_aliases:
        if token == placeholder:
            return literal
    return token

def tokenize_sentences(tknzr, sentences, to_lower=True):
    """Run `tokenize` over every sentence and collect the results in a list.

    Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
        - sentences: a list of sentences
        - to_lower: lowercasing or not
    """
    tokenized = []
    for raw_sentence in sentences:
        tokenized.append(tokenize(tknzr, raw_sentence, to_lower))
    return tokenized

In [7]:
def get_embeddings_for_preprocessed_sentences(sentences, model_path, fasttext_exec_path):
    """Embed already-preprocessed sentences with the fasttext executable.

    The sentences are written to a temporary file, piped through
    `<fasttext_exec_path> print-vectors <model_path>`, and the vectors the
    executable prints are parsed back in. Both temporary files are removed
    even when a step fails.

    Arguments:
        - sentences: a list of preprocessed sentences
        - model_path: a path to the sent2vec .bin model
        - fasttext_exec_path: a path to the fasttext executable

    Returns a numpy array built from one embedding per input sentence.
    Raises RuntimeError when the executable exits with a non-zero status.
    """
    import tempfile

    test_path = None
    embeddings_path = None
    try:
        # mkstemp yields unique names, so concurrent calls cannot collide the
        # way timestamp-derived names could.
        handle, test_path = tempfile.mkstemp(suffix='_fasttext.test.txt')
        os.close(handle)
        handle, embeddings_path = tempfile.mkstemp(suffix='_fasttext.embeddings.txt')
        os.close(handle)

        dump_text_to_disk(test_path, sentences)
        # An argument list with explicit stdin/stdout avoids the shell, so
        # paths containing spaces or shell metacharacters are passed verbatim.
        with open(test_path, 'r') as in_stream, open(embeddings_path, 'w') as out_stream:
            return_code = call([fasttext_exec_path, 'print-vectors', model_path],
                               stdin=in_stream, stdout=out_stream)
        if return_code != 0:
            raise RuntimeError('fasttext print-vectors exited with status '
                               + str(return_code))
        embeddings = read_embeddings(embeddings_path)
    finally:
        for leftover in (test_path, embeddings_path):
            if leftover is not None and os.path.exists(leftover):
                os.remove(leftover)
    assert len(sentences) == len(embeddings), \
        'expected one embedding per sentence'
    return np.array(embeddings)

def read_embeddings(embeddings_path):
    """Parse a file of whitespace-separated vectors, one vector per line.

    Arguments:
        - embeddings_path: path to the embeddings

    Returns a list with one list of floats per line of the file (an empty
    list for a blank line, so line count and result length always agree).
    Raises ValueError when a field is not a valid float literal.
    """
    with open(embeddings_path, 'r') as in_stream:
        # float() parses each field directly; the file content is never
        # evaluated as Python code.
        return [[float(value) for value in line.split()] for line in in_stream]

def dump_text_to_disk(file_path, X, Y=None):
    """Write sentences to a text file, one per line.

    Each line is the sentence followed by a space and a newline. When labels
    are supplied, each line is additionally prefixed with '__label__<label> '.

    Arguments:
        - file_path: where to dump the data
        - X: list of sentences to dump
        - Y: labels, if any
    """
    with open(file_path, 'w') as sink:
        if Y is None:
            sink.writelines(sentence + ' \n' for sentence in X)
        else:
            sink.writelines(
                '__label__' + str(label) + ' ' + sentence + ' \n'
                for sentence, label in zip(X, Y)
            )

def get_sentence_embeddings(sentences, ngram='bigrams', model='concat_wiki_twitter'):
    """ Returns a numpy matrix of embeddings for one of the published models.

    For the twitter model the sentences are tokenized with NLTK's
    TweetTokenizer. For the wiki model the sentences are passed to fasttext
    as given: the Stanford tokenization step is currently disabled (see the
    commented-out lines below).

    Arguments:
        - ngram: 'unigrams' or 'bigrams' (any value other than 'unigrams'
          selects the bigram model)
        - model: 'wiki', 'twitter', or 'concat_wiki_twitter'
        - sentences: a list of raw sentences ['Once upon a time', 'This is another sentence.', ...]

    Raises ValueError when `model` is not one of the three supported names.
    """
    # Validate first so a typo fails immediately with a catchable error,
    # before any model is run.
    if model not in ('wiki', 'twitter', 'concat_wiki_twitter'):
        raise ValueError("model must be 'wiki', 'twitter' or "
                         "'concat_wiki_twitter', got " + repr(model))

    use_unigrams = (ngram == 'unigrams')
    wiki_embeddings = None
    twitter_embbedings = None
    if model == "wiki" or model == 'concat_wiki_twitter':
#         tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
#         s = ' <delimiter> '.join(sentences) #just a trick to make things faster
#         tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
#         tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
        tokenized_sentences_SNLP = sentences
        wiki_model_path = MODEL_WIKI_UNIGRAMS if use_unigrams else MODEL_WIKI_BIGRAMS
        wiki_embeddings = get_embeddings_for_preprocessed_sentences(
            tokenized_sentences_SNLP, wiki_model_path, FASTTEXT_EXEC_PATH)
    if model == "twitter" or model == 'concat_wiki_twitter':
        tknzr = TweetTokenizer()
        tokenized_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
        twitter_model_path = MODEL_TWITTER_UNIGRAMS if use_unigrams else MODEL_TWITTER_BIGRAMS
        twitter_embbedings = get_embeddings_for_preprocessed_sentences(
            tokenized_sentences_NLTK_tweets, twitter_model_path, FASTTEXT_EXEC_PATH)

    if model == "twitter":
        return twitter_embbedings
    if model == "wiki":
        return wiki_embeddings
    # Only 'concat_wiki_twitter' remains: place the wiki and twitter vectors
    # of each sentence side by side.
    return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)

In [8]:
import time

# Smoke test: embed two short sentences with the wiki unigram model, then
# report the wall-clock time taken and the shape of the returned matrix.
sentences = [
    'Once upon a time.',
    'And now for something completely different.',
]

s = time.time()
my_embeddings = get_sentence_embeddings(sentences, model='wiki', ngram='unigrams')
print(time.time() - s)
print(my_embeddings.shape)

4.0213518142700195
(2, 600)


In [10]:
import nltk
import sys
import pickle

def absoluteFilePaths(directory):
    """Lazily yield the absolute path of every file found under `directory`.

    The tree is traversed with os.walk, so files in nested sub-directories
    are included.
    """
    for root, _subdirs, names in os.walk(directory):
        yield from (os.path.abspath(os.path.join(root, name)) for name in names)

# Upper bound on how many documents are embedded in one run.
MAX_DOCS = 1000

# Read every training document of the selected blog topic(s) into memory.
blog = ['storm']
docs = []
for mypath in blog:
    train_dir = 'blogs_topic_sorted/blogs_topic_sorted/' + mypath + '/train'
    for f in absoluteFilePaths(train_dir):
        with open(f, 'r') as ff:
            docs.append(ff.read())

# Embed each document sentence by sentence: one matrix per document.
embeddings = []
for i, doc in enumerate(docs[0:MAX_DOCS]):
    sents = list(nltk.sent_tokenize(doc))
    embeddings.append(get_sentence_embeddings(sents, ngram='unigrams', model='wiki'))
    print(str(i) + ":"+str(len(docs)))
    sys.stdout.flush()

# The with-block closes (and thereby flushes) the file once the dump is done.
with open("stormembeddings", "wb") as out_file:
    pickle.dump(embeddings, out_file)
print("done")
sys.stdout.flush()

0:1234
1:1234
2:1234
3:1234
4:1234
5:1234
6:1234
7:1234
8:1234
9:1234
10:1234
11:1234
12:1234
13:1234
14:1234
15:1234
16:1234
17:1234
18:1234
19:1234
20:1234
21:1234
22:1234
23:1234
24:1234
25:1234
26:1234
27:1234
28:1234
29:1234
30:1234
31:1234
32:1234
33:1234
34:1234
35:1234
36:1234
37:1234
38:1234
39:1234
40:1234
41:1234
42:1234
43:1234
44:1234
45:1234
46:1234
47:1234
48:1234
49:1234
50:1234
51:1234
52:1234
53:1234
54:1234
55:1234
56:1234
57:1234
58:1234
59:1234
60:1234
61:1234
62:1234
63:1234
64:1234
65:1234
66:1234
67:1234
68:1234
69:1234
70:1234
71:1234
72:1234
73:1234
74:1234
75:1234
76:1234
77:1234
78:1234
79:1234
80:1234
81:1234
82:1234
83:1234
84:1234
85:1234
86:1234
87:1234
88:1234
89:1234
90:1234
91:1234
92:1234
93:1234
94:1234
95:1234
96:1234
97:1234
98:1234
99:1234
100:1234
101:1234
102:1234
103:1234
104:1234
105:1234
106:1234
107:1234
108:1234
109:1234
110:1234
111:1234
112:1234
113:1234
114:1234
115:1234
116:1234
117:1234
118:1234
119:1234
120:1234
121:1234
122:1234
123

923:1234
924:1234
925:1234
926:1234
927:1234
928:1234
929:1234
930:1234
931:1234
932:1234
933:1234
934:1234
935:1234
936:1234
937:1234
938:1234
939:1234
940:1234
941:1234
942:1234
943:1234
944:1234
945:1234
946:1234
947:1234
948:1234
949:1234
950:1234
951:1234
952:1234
953:1234
954:1234
955:1234
956:1234
957:1234
958:1234
959:1234
960:1234
961:1234
962:1234
963:1234
964:1234
965:1234
966:1234
967:1234
968:1234
969:1234
970:1234
971:1234
972:1234
973:1234
974:1234
975:1234
976:1234
977:1234
978:1234
979:1234
980:1234
981:1234
982:1234
983:1234
984:1234
985:1234
986:1234
987:1234
988:1234
989:1234
990:1234
991:1234
992:1234
993:1234
994:1234
995:1234
996:1234
997:1234
998:1234
999:1234


NameError: name 'pickle' is not defined

In [35]:
import pickle
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# produced itself.
with open("stormembeddings", "rb") as in_file:
    embeddings = pickle.load(in_file)
# embeddings = embeddings[0:5]

In [36]:
import numpy as np
def createrand(embeddings, seed=None):
    """Draw a random (document, sentence) pick for every adjacent sentence pair.

    A document with k sentences has max(k - 1, 0) adjacent pairs. For each
    pair, one document index is drawn uniformly from the non-empty documents,
    and then one sentence index is drawn uniformly within that document.

    Arguments:
        - embeddings: a sequence of documents, each a sequence of sentence
          vectors (only the lengths are used here)
        - seed: optional seed; when given, a private RandomState makes the
          draw reproducible, otherwise numpy's global generator is used

    Returns (doc_no, jagged):
        - doc_no: flat integer array with one drawn document index per pair,
          in document order
        - jagged: one list per document, holding the drawn sentence index for
          each of that document's pairs
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Empty documents have no pairs; clamping stops them from contributing -1
    # to the total and desynchronising doc_no from the loop below.
    pair_counts = [max(len(doc) - 1, 0) for doc in embeddings]
    size = sum(pair_counts)

    # Only documents with at least one sentence can supply a sampled sentence.
    donors = np.array([idx for idx, doc in enumerate(embeddings) if len(doc) > 0],
                      dtype=int)
    if size == 0:
        doc_no = np.empty(0, dtype=int)
    else:
        doc_no = donors[rng.randint(0, len(donors), size=size)]

    jagged = []
    cursor = 0  # position in doc_no of the pair currently being filled
    for n_pairs in pair_counts:
        picks = []
        for _ in range(n_pairs):
            picks.append(rng.randint(0, len(embeddings[doc_no[cursor]]), size=1)[0])
            cursor += 1
        jagged.append(picks)
    return doc_no, jagged

In [37]:
import numpy as np
# Build the classifier dataset. Every adjacent sentence pair of a document is
# a positive example (label 1); the same first sentence paired with the
# randomly drawn sentence from createrand is a negative example (label 0).
doc_no, jagged = createrand(embeddings)
X = []
Y = []
size = 0  # running pair index into doc_no
for doc_idx, doc in enumerate(embeddings):
    for sent_idx in range(len(doc) - 1):
        #positive
        X.append(np.append(doc[sent_idx], doc[sent_idx + 1]))
        Y.append(1)
        #negative
        #for this sent get any arbitrary sent in corpus
        a = doc_no[size]
        b = jagged[doc_idx][sent_idx]
        X.append(np.append(doc[sent_idx], embeddings[a][b]))
        Y.append(0)
        size += 1

import pickle
# The with-block closes (and thereby flushes) the file once the dump is done.
with open("svm.data", "wb") as out_file:
    pickle.dump((X, Y), out_file)
print("done")

done


In [None]:
from sklearn import svm
import pickle
import sys  # used below for the flush; imported here so the cell runs on its own

# NOTE: unpickling can execute arbitrary code; only load files this notebook
# produced itself.
with open("svm.data", "rb") as data_file:
    X, Y = pickle.load(data_file)
clf = svm.SVC()

print("fitting")
sys.stdout.flush()
clf.fit(X, Y)
# The with-block closes (and thereby flushes) the file once the dump is done.
with open("svm.model1111", "wb") as model_file:
    pickle.dump(clf, model_file)
print("done")

In [1]:
from sklearn import svm
import numpy as np
import sys
import pickle
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# produced itself.
with open("svm.data", "rb") as data_file:
    X, Y = pickle.load(data_file)
# Convert the pickled Python lists to arrays so they can be indexed with an
# array of sample positions.
X = np.array(X)
Y = np.array(Y)
with open("svm.model1111", "rb") as model_file:
    clf = pickle.load(model_file)

def getAccuracy(X, Y, clf, n_samples=1000):
    """Print the prediction/target counts of `clf` on a random sample.

    `n_samples` rows are drawn with replacement from (X, Y), `clf.predict` is
    run on them, and the four counts of (prediction is 0 or not) x (target is
    0 or not) are written to stdout, followed by "done".

    Arguments:
        - X: feature rows, indexable along the first axis
        - Y: target labels aligned with X
        - clf: an object exposing predict(X)
        - n_samples: how many rows to sample

    Returns None; the counts are only printed.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    rand_index = np.random.choice(len(X), size=n_samples)
    sample_targets = Y[rand_index]
    ypred = np.asarray(clf.predict(X[rand_index]))

    pred_is_zero = (ypred == 0)
    target_is_zero = (sample_targets == 0)
    count1 = int(np.count_nonzero(pred_is_zero & target_is_zero))
    count2 = int(np.count_nonzero(pred_is_zero & ~target_is_zero))
    count3 = int(np.count_nonzero(~pred_is_zero & target_is_zero))
    count4 = int(np.count_nonzero(~pred_is_zero & ~target_is_zero))

    sys.stdout.write("pred = 0, tar = 0: " + str(count1) + "\n")
    sys.stdout.write("pred = 0, tar = 1: " + str(count2) + "\n")
    sys.stdout.write("pred = 1, tar = 0: " + str(count3) + "\n")
    sys.stdout.write("pred = 1, tar = 1: " + str(count4) + "\n")
    sys.stdout.write("done\n")
    sys.stdout.flush()

# NOTE(review): X and Y here come from "svm.data", the same file the model in
# "svm.model1111" was fitted on in the earlier cell, so these counts describe
# fit on the training data rather than held-out performance.
getAccuracy(X, Y, clf)


pred = 0, tar = 0: 305
pred = 0, tar = 1: 186
pred = 1, tar = 0: 188
pred = 1, tar = 1: 321
done
