In [53]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Text processing of data
"""

import numpy as np
import pandas as pd
import os

import pickle
import nltk

from fnc_baseline.utils.score import report_score, LABELS, score_submission
from fnc_baseline.utils.dataset import DataSet

import codecs
import sys
#reload(sys) # for text processing
#sys.setdefaultencoding('utf8') # for text processing

# ======== Load data =======
base_path = '/Users/Monu/NLP/Stance/code'
def read_data(): 
    
    # Extracting data
    dataset = DataSet(path = base_path + '/data')
    stances = dataset.stances
    articles = dataset.articles
    
    # Data to lists
    h, b, y = [],[],[]
    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body ID']])
    y = np.asarray(y, dtype = np.int64)
    #print(h)
    #print(b)
    #print(y)
    return h, b, y
#read_data()

In [54]:
######################################################################################
#Another way for loading embedding

def load_embedding_from_disks(glove_filename, with_indexes=True):
    """
    Read a GloVe txt file. If `with_indexes=True`, we return a tuple of two dictionnaries
    `(word_to_index_dict, index_to_embedding_array)`, otherwise we return only a direct 
    `word_to_embedding_dict` dictionnary mapping from a string to a numpy array.
    """
    if with_indexes:
        word_to_index_dict = dict()
        index_to_embedding_array = []
    else:
        word_to_embedding_dict = dict()

    
    with open(glove_filename, 'r') as glove_file:
        for (i, line) in enumerate(glove_file):
            
            split = line.split(' ')
            
            word = split[0]
            
            representation = split[1:]
            representation = np.array(
                [float(val) for val in representation]
            )
            
            if with_indexes:
                word_to_index_dict[word] = i
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation

    _WORD_NOT_FOUND = [0.0]* len(representation)  # Empty representation for unknown words.
    if with_indexes:
        _LAST_INDEX = i + 1
        word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(index_to_embedding_array + [_WORD_NOT_FOUND])
        return word_to_index_dict, index_to_embedding_array
    else:
        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND)
        return word_to_embedding_dict
    
'''def loadGloVe(filename):
    vocab = []
    embd = []
    file = open(filename,'r')
    for line in file.readlines():
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append(row[1:])
    print('Loaded GloVe!')
    file.close()
    return vocab,embd'''
#########################################################################################
        
    
# ----- Loading Glove embeddings ----
def loadGloVe(filename):
    #print(filename)
    # Getting embedding dimension
    file0 = open(filename,'r')
    #file0 = codecs.open(filename, 'r', 'utf8', 'ignore')
    line = file0.readline()
    emb_dim = len(line.strip().split(' ')) - 1
    file0.close()

    # First row of embedding matrix is 0 for zero padding
    vocab = ['<pad>'] #By Manisha - Using this
    embd = [[0.0] * emb_dim] #By Manisha - Using this
    #vocab = []
    #embd = []
    #model = {}
    # Reading embedding matrix
    file = open(filename,'r')
    file = codecs.open(filename, 'r', 'utf8', 'ignore')
    for line in file.readlines():
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append(row[1:])
        #model[vocab] = embd
        #embd.append(map(float,row[1:]))
    print('Loaded GloVe!')
    file.close()
    
    return vocab,embd

In [55]:


# ------ Clean quote signs ---------
def clean_data(sentences):
    '''
    Delete quote signs
        - Rational: quote signs mix with the parsing
        - Con: quote signs are meaningul --> distanciation from a statement
    '''
    new_sentences = []
    for sentence in sentences:
        new_sentences.append(sentence.replace("'","").replace('"',''))
    return new_sentences

# ---- Build vocab dictionary from embedding matrix -----
def build_vocDict(vocab):
    voc_dict = {}
    for i in range(len(vocab)):
        #print(vocab[i])
        voc_dict[vocab[i]] = i
    return voc_dict

In [56]:

# -------- words to ids only -------

def words2ids(sentences, voc_dict, option = 'simple'):
    '''
    Inputs: 
        - sentences: list of sentences as string
        - embedding_vocab: list of vocab words in the order of the rows of embedding_matrix
    Ouptut: 
        - new_sentences_ids: list of sentences as successive word indexes
    Processing: delete word which do no appear in vocabulary
        - Alternative: replace missing words by the mean
    '''
    new_sentences_ids = []
    j = 0
    for sentence in sentences:
        j+=1
        if j % 5000 == 0:
            print ('sentence',str(j))
        sentence_ids = []
        if option == 'nltk':
            sentence = sentence.decode('utf8', 'ignore')
            # print('sentence', sentence)
            word_list = tokenize(sentence)
            print('word_list', word_list)
        elif option == 'simple':
            word_list = sentence.split(" ")
        
        for word in word_list:
            if word.lower() in voc_dict: # Only add word if in dictionary
                word_index = voc_dict[word.lower()]
                sentence_ids.append(word_index)
                
        new_sentences_ids.append(sentence_ids)
        #print ("added",j)
    return new_sentences_ids


# -------- words to ids and vectors -------
def words2ids_vects(sentences, voc_dict, embedding_matrix, option = 'simple'):   ###Check this.. :( - Manisha)
    '''
    Inputs: 
        - sentences: list of sentences as string
        - embedding_vocab: list of vocab words in the order of the rows of embedding_matrix
        - embedding_matrix
    Ouptut: 
        - new_sentences_ids: list of sentences as successive word indexes
        - new_sentences_vects: list of sentences as successive word vectors
    Processing: delete word which do no appear in vocabulary
        - Alternative: replace missing words by the mean
    '''
    #print(voc_dict)
    new_sentences_ids = []
    new_sentences_vects = []
    j = 0
    newsentences = clean_data(sentences)
    for sentence in newsentences:
        j+=1
        if j % 5000 == 0:
            print ('sentence',str(j))
        sentence_ids = []
        sentence_vects = []
        if option == 'nltk':
            #sentence = sentence.decode('utf8', 'ignore')
            # print('sentence', sentence)
            word_list = tokenize(sentence)
            #print('word_list', word_list)
        elif option == 'simple':
            word_list = sentence.split(" ")
            #print('word_list', word_list)
        for word in word_list:
            word = word.decode("utf-8")
            #print(word)
            #print(voc_dict[word])
            #print(voc_dict[word.lower()])
            if word.lower() in voc_dict: # Only add word if in dictionary                
                word_index = voc_dict[word.lower()]
                #print(word_index)
                #print(embedding_matrix[word_index])
                sentence_ids.append(word_index)
                sentence_vects.append(embedding_matrix[word_index])
                
        new_sentences_ids.append(sentence_ids)
        #print ("added", j)
        #print(sentence_vects)
        new_sentences_vects.append(sentence_vects)
    print(new_sentences_vects)
    return new_sentences_ids, new_sentences_vects

In [57]:
def tokenize(sequence):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokens = [token.replace("``", '"').replace("''", '"') for token in nltk.word_tokenize(sequence)]
    #print(tokens)
    # return tokens
    return map(lambda x:x.encode('utf-8', errors = 'ignore'), tokens)

# ---------- Averaging vectors for headline and truncated body ---------


def concatConvert_np(h_list, b_list):
    '''
    1. Concatenate headlines and bodies
    2. Convert list data to numpy zero padded data
    3. Also outputs sequences lengths as np vector
    '''
    
    # Concatenate
    n_sentences = len(h_list)
    h_b_list = []
    seqlen = []
    for i in range(n_sentences):
        h_b_list.append(h_list[i] + b_list[i])
        seqlen.append(len(h_b_list[i]))
        
    max_len = max(seqlen)
    
    # Convert to numpy with zero padding. No truncating
    h_b_np = np.zeros((n_sentences, max_len))
    for i in range(n_sentences):
        h_b_np[i,:seqlen[i]] = h_b_list[i]
    
    return h_b_list, h_b_np, np.array(seqlen)

def distinctConvert_np(h_list, b_list):
    '''
    1. Convert list data to numpy zero padded data, 2 distinct matrices for headlines and bodies 
    2. Also outputs sequences lengths as np vector
    '''
    # Compute sequences lengths
    n_sentences = len(h_list)
    h_seqlen = []
    b_seqlen = []
    for i in range(n_sentences):
        h_seqlen.append(len(h_list[i]))
        b_seqlen.append(len(b_list[i]))
        
    h_max_len = max(h_seqlen)
    b_max_len = max(b_seqlen)
    
    # Convert to numpy
    h_np = np.zeros((n_sentences, h_max_len))
    b_np = np.zeros((n_sentences, b_max_len))
    for i in range(n_sentences):
        h_np[i,:h_seqlen[i]] = h_list[i]
        b_np[i,:b_seqlen[i]] = b_list[i]
        
    return h_np, np.array(h_seqlen), b_np, np.array(b_seqlen)


In [58]:


## Updated BY Manisha
def save_data_pickle(outfilename, 
                    embedding_type = 'twitter.27B.50d',
                    parserOption = 'nltk'):
    cwd = os.getcwd()
    if embedding_type == 'twitter.27B.50d':
        #filename_embeddings = cwd + '/../../glove/glove.twitter.27B.50d.txt'
        filename_embeddings = base_path + '/glove/glove.twitter.27B.50d.txt'
    else: 
        #filename_embeddings = cwd + '/../../glove/glove.6B.50d.txt'
        filename_embeddings = base_path + '/glove/glove.6B.50d.txt'

    # filename_embeddings = cwd + filename_embeddings

    # GloVe embeddings
    vocab, embd = loadGloVe(filename_embeddings)
    vocab_size = len(vocab)
    embedding_dim = len(embd[0])
    embedding = np.asarray(embd)
    #embedding = np.asarray(embd, dtype = object)

    # Get vocab dict
    voc_dict = build_vocDict(vocab)
    #print(voc_dict['luungan'])
    # Read and process data
    h, b, y = read_data() #read_data(cwd + '/../../') # headline / bodies/ labels
    
    h_ids, h_vects = words2ids_vects(h, voc_dict, embd, parserOption)
    #Manishacomment 
    '''b_ids, b_vects = words2ids_vects(b, voc_dict, embd, parserOption)
    #print(h_vects)
    # Concatenated headline_bodies zero padded np matrices; seq. lengths as np vector
    h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids)
    h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids)

    data_dict = {'h_ids':h_ids, 'b_ids':b_ids, 'y':y}
    with open(outfilename, 'wb') as fp:
        pickle.dump(data_dict, fp)'''
    return vocab, embd

## Updated BY Manisha
def get_data(config, 
            filename_embeddings = '/glove/glove.twitter.27B.50d.txt',
            pickle_path = '/glove/twitter50d_h_ids_b_ids_pickle.p',
            concat = True):
    # np.random.seed(41)
    # Base path
    #cwd = os.getcwd()
    load_path = base_path + pickle_path
    #vocab, embd = save_data_pickle(load_path) #By Manisha - Comment this ones its loaded
    
    # filename_embeddings = cwd + '/../../glove/glove.6B.50d.txt'

    filename_embeddings = base_path + filename_embeddings
    
    # GloVe embeddings
    vocab, embd = loadGloVe(filename_embeddings)
    #print(vocab)
    #print(embd)
    vocab_size = len(vocab)
    embedding_dim = len(embd[0])
    #print(embd.dtype)
    #embedding = np.asarray(embd, dtype = np.float64)
    embedding = np.asarray(embd)
    #print(vocab)
    #print(embd)
    #print(embedding)

    # Get vocab dict
    voc_dict = build_vocDict(vocab)
    #print(voc_dict)
    # Read and process data
    #h, b, y = read_data() # headline / bodies/ labels
    
    print('Loading Pickle')
    #load_path = pickle_path
    with open (load_path, 'rb') as fp:
        data_dict = pickle.load(fp)
    #print(data_dict)
    h_ids = data_dict['h_ids']
    b_ids = data_dict['b_ids']
    y = data_dict['y']
    #print(h_ids)
    #print(b_ids)
    #print(y)
    print('finished loading Pickle')
    
    # Concatenated headline_bodies zero padded np matrices; seq. lengths as np vector
    # h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids)
    # h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids)

    if concat:
        h_b_ids, h_b_np, seqlen = concatConvert_np(h_ids, b_ids)
        output_dict = {'y':y,
                       'h_b_np':h_b_np, 
                       'seqlen':seqlen}
    else:
        h_np, h_seqlen, b_np, b_seqlen = distinctConvert_np(h_ids, b_ids)
        # Find and delete empty
        ind_empty = []
        for i in range(np.shape(h_np)[0]):
            if ((h_seqlen[i] == 0) or (b_seqlen[i] == 0)):
                ind_empty.append(i)
        #print('Empty sequences: ', ind_empty)
        if (len(ind_empty) > 0):
            y = np.delete(y, ind_empty)
            h_np = np.delete(h_np, ind_empty, 0)
            b_np = np.delete(b_np, ind_empty, 0)
            h_seqlen = np.delete(h_seqlen, ind_empty)
            b_seqlen = np.delete(b_seqlen, ind_empty)
        output_dict = {'y':y,
                       'h_np':h_np, 
                       'b_np':b_np, 
                       'h_seqlen':h_seqlen,
                       'b_seqlen':b_seqlen}
    
    #Have to check this
    config.embed_size = embedding_dim
    config.pretrained_embeddings = embedding
    config.vocab_size = vocab_size
    return config, output_dict

In [59]:
if __name__ == '__main__':
    pickle_path = '/glove/twitter50d_h_ids_b_ids_pickle.p'
    load_path = base_path + pickle_path
    # config, data_dict = get_data(1028, 
            #filename_embeddings = '/glove/glove.twitter.27B.50d.txt',
           # pickle_path = '/glove/twitter50d_h_ids_b_ids_pickle.p',
            #concat = False)
    vocab, embd = save_data_pickle(load_path)


Loaded GloVe!
Reading dataset
Total stances: 49972
Total bodies: 1683
word_list <map object at 0x000002BD0D6A3EF0>
Police
2332
find
471
mass
6866
graves
31069
with
59
at
67
least
1230
15


KeyError: '15'