## This notebook adds part-of-speech (POS) tag features, concatenated with the word embeddings, as input to a CNN.
It is a proof of concept only.

In [1]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))

In [2]:
import numpy as np
import pandas as pd
import gensim
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D 
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer

Using TensorFlow backend.


## load data

### load train file

In [7]:
# load in our restaurant tsv - the tsv col - word is our string.
# NOTE: `path` is an absolute, machine-specific directory. It must end with a
# slash because file names are appended to it by plain string concatenation.
path= '/media/peter/BigExternal/Study/Vu/Thesis/code/testing/aspect_extraction/data/2014_rest/'
file_train = 'restaurants_train.tsv'
# Tab-separated file; later cells read its 'word', 'label' and 'xpos' columns.
df_train = pd.read_csv(path+file_train, sep = '\t')

### load test data

In [8]:
# load in test data 2014 - gold
# Reuses `path` defined in the training-data cell, so that cell must run first.
file_test = 'restaurants_gold.tsv'
# Tab-separated file; later cells read its 'word', 'label' and 'xpos' columns.
df_test = pd.read_csv(path+file_test, sep = '\t')

## Embeddings

### load general embeddings

In [9]:
# General-purpose word vectors, read from a binary word2vec-format file
# (absolute, machine-specific path).
# NOTE(review): the file name suggests 300-dimensional vectors - confirm.
path_to_w2v = '/media/peter/BigExternal/Study/Vu/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'
w2v_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(path_to_w2v,binary=True)

### domain_embeddings: hsu's fastText model

In [10]:
# these are hsu's 100dim
# Domain (restaurant) embeddings, loaded as a full fastText model from a
# Facebook-format binary file (absolute, machine-specific path).
path_to_xu_model = '/media/peter/BigExternal/Study/Vu/Thesis/code/testing/aspect_extraction/DE-CNN-master/data/embedding/restaurant_emb.vec.bin'
xu_model = gensim.models.fasttext.load_facebook_model(path_to_xu_model) 

## Functions to substitute tokens with their embeddings

In [11]:
# substitute each word with its embedding vector
# substitute a zero vector for unknown words
def embed_swap (word_embedding_model, dataframe_series):
    '''Look up an embedding vector for every token of a series.

    Parameters
    ----------
    word_embedding_model :
        Embedding model supporting ``token in model`` and ``model[token]``.
        The width of the zero vector used for unknown tokens is read from the
        model's ``vector_size`` attribute (or ``model.wv.vector_size``). If
        neither exists, it falls back to 300, the previously hard-coded width.
    dataframe_series :
        Iterable of tokens (e.g. a pandas Series of words).

    Returns
    -------
    list
        One vector per token, in the same order as the input. Known tokens
        get ``model[token]``; unknown tokens get a list of zeros with the
        model's dimensionality, so every row has the same length.
    '''
    # Derive the width from the model: a fixed 300 produces ragged rows when
    # the model has a different dimensionality (e.g. 100-dim domain vectors).
    dim = getattr(word_embedding_model, 'vector_size', None)
    if dim is None:
        dim = getattr(getattr(word_embedding_model, 'wv', None), 'vector_size', 300)

    embeddings = []
    for token in dataframe_series:
        if token in word_embedding_model:
            vector = word_embedding_model[token]
        else:
            vector = [0] * dim
        embeddings.append(vector)
    return embeddings

### function to make labels categorical

In [12]:
#labels to categorical
def labels_to_categorical (dataframe_series):
    '''Encode a series of labels as integer class ids.

    Ids are assigned in sorted label order, so the mapping is deterministic:
    the same set of labels always yields the same ids, across calls and
    across interpreter runs. Iterating a plain ``set`` does not guarantee that.

    Parameters
    ----------
    dataframe_series :
        pandas Series (or any iterable) of labels.

    Returns
    -------
    Y_label : numpy.ndarray
        Integer id of every label, in input order.
    label2Idx : dict
        Lookup table ``{label: id}``.
    '''
    if hasattr(dataframe_series, 'tolist'):
        labels = dataframe_series.tolist()
    else:
        labels = list(dataframe_series)

    try:
        ordered_labels = sorted(set(labels))
    except TypeError:
        # labels of mixed, mutually incomparable types: order by their text form
        ordered_labels = sorted(set(labels), key=str)

    label2Idx = {label: idx for idx, label in enumerate(ordered_labels)}
    Y_label = np.asarray([label2Idx[label] for label in labels], dtype=np.int64)
    return Y_label, label2Idx

# Syntax

In [13]:
def dicts_n_labels(df, features):
    '''
    Split a token-level dataframe into feature records and labels.

    df: dataframe containing the columns named in `features` and a 'label' column
    features: user-supplied list of column names to keep as features
    returns a pair (records, labels): one {feature: value} dict per row, ready
    for vectorization, and the 'label' column as a list in the same row order
    '''
    feature_records = df[features].to_dict('records')
    gold_labels = list(df['label'])
    return feature_records, gold_labels

In [14]:
def dict_vectorizer_embed(training_dict, gold_dict):
    '''
    Vectorize training and test feature records with one shared DictVectorizer.

    training_dict: feature records of the training data (the vectorizer is fitted on these)
    gold_dict: feature records of the test data (transformed with the fitted vectorizer)
    Returns the vectorized training and test data as produced by the vectorizer,
    followed by the same two converted with .toarray()
    '''
    vectorizer = DictVectorizer()
    # fit on the training records only, then reuse that fit for the test records
    training_vec = vectorizer.fit_transform(training_dict)
    test_vec = vectorizer.transform(gold_dict)
    return training_vec, test_vec, training_vec.toarray(), test_vec.toarray()

In [15]:
def concat_arrays(feature_array, embedding_list, domain_embeds):
    '''
    Join, per word, the general embedding, the domain embedding and the
    feature vector (in that order) into one flat list.

    feature_array: 2-D array with one feature row per word; its first
    dimension determines how many words are processed
    embedding_list: general embedding vectors, indexable per word
    domain_embeds: domain embedding vectors, indexable per word
    returns a list with one concatenated representation per word
    '''
    row_count = feature_array.shape[0]
    return [
        list(embedding_list[row]) + list(domain_embeds[row]) + list(feature_array[row])
        for row in range(row_count)
    ]

## adding additional pos information

In [17]:
# Only the 'xpos' column is used as an additional feature.
features = ['xpos']

training_dict, labels = dicts_n_labels(df_train, features)
# NOTE: `labels` is reassigned here, so after this cell it holds the labels of
# df_test only; the training labels returned above are discarded.
gold_dict, labels = dicts_n_labels(df_test, features)

In [18]:
def training_with_syntax (general_embed_model, custom_embed_model, dataframe_series, label_series, training_dict, gold_dict, gold_series, gold_label, split_index=23763):
    '''
    Build the train / validation / test tensors for the CNN.

    Every token is represented by its general embedding, its domain embedding
    and its vectorized feature record, concatenated in that order. Each data
    set is wrapped as a single sequence of shape (1, tokens, features).

    Parameters
    ----------
    general_embed_model, custom_embed_model :
        Embedding models passed to `embed_swap` (general and domain vectors).
    dataframe_series, label_series :
        Training tokens (pandas Series) and their labels.
    training_dict, gold_dict :
        Feature records of the training and test data, as expected by
        `dict_vectorizer_embed`.
    gold_series, gold_label :
        Test tokens (pandas Series) and their labels.
    split_index : int, optional
        Token position at which the training sequence is cut: tokens before
        it form the training set, the rest the validation set. Defaults to
        23763, the value that was previously hard-coded.

    Returns
    -------
    X_train, y_train, X_valid, y_valid, X_te, test_labels, label_index
        `y_*` are one-hot encoded. `test_labels` are integer ids. Both
        `test_labels` and `label_index` use the mapping built from the
        TRAINING labels, so ids predicted by the model can be decoded with
        `label_index`. A gold label never seen in training is encoded as -1.
    '''
    def _single_batch(rows):
        # wrap a (tokens, features) matrix as one sequence: (1, tokens, features)
        matrix = np.array(rows)
        return np.reshape(matrix, (1, matrix.shape[0], matrix.shape[1]))

    # ---- inputs: training / validation ----
    embeddings = embed_swap (general_embed_model, dataframe_series)
    # missing tokens are replaced by the string 'na' before the domain lookup
    domain_embed = embed_swap (custom_embed_model, dataframe_series.fillna('na'))
    _, _, training_array, test_array = dict_vectorizer_embed(training_dict, gold_dict)
    X_tr = _single_batch(concat_arrays(training_array, embeddings, domain_embed))
    X_train = X_tr[:, :split_index, :]
    X_valid = X_tr[:, split_index:, :]

    print('shape: x_valid :', X_valid.shape)
    print('shape: X_train :', X_train.shape)

    # ---- targets: training / validation ----
    train_labels, label_index = labels_to_categorical(label_series)
    y_all = keras.utils.to_categorical(train_labels, num_classes=len(label_index))
    y_all = np.reshape(y_all, (1, y_all.shape[0], y_all.shape[1]))
    y_valid = y_all[:, split_index:, :]
    y_train = y_all[:, :split_index, :]

    print('shape: y_valid :', y_valid.shape)
    print('shape: y_train :', y_train.shape)

    # ---- inputs and labels: test ----
    test_embeddings = embed_swap (general_embed_model, gold_series)
    domain_test = embed_swap (custom_embed_model, gold_series.fillna('na'))
    X_te = _single_batch(concat_arrays(test_array, test_embeddings, domain_test))

    # Encode the gold labels with the training mapping. Building a second,
    # independent mapping from the gold labels could assign different ids to
    # the same label than the ones the model was trained on.
    test_labels = np.asarray([label_index.get(label, -1) for label in gold_label])

    print('shape: X_test :', X_te.shape)
    return X_train, y_train, X_valid, y_valid, X_te, test_labels, label_index


In [20]:
# Build the train / validation / test inputs from both embedding models and
# the POS feature records of the training and gold data.
(X_train, y_train, X_valid, y_valid,
 X_test, test_labels, label_index) = training_with_syntax(
    w2v_embedding_model,
    xu_model,
    df_train['word'],
    df_train['label'],
    training_dict,
    gold_dict,
    df_test['word'],
    df_test['label'],
)

  if token in word_embedding_model:
  vector = word_embedding_model[token]


shape: x_valid : (1, 23763, 446)
shape: X_train : (1, 23763, 446)
shape: y_valid : (1, 23763, 3)
shape: y_train : (1, 23763, 3)
shape: X_test : (1, 12752, 446)


# CNN model

In [21]:
# Hyperparameters for the CNN defined in the next cell.
batch_size = 128
# per-token feature width: the last axis of X_train
embedding_dims = X_train.shape[2]
# window size shared by every Conv1D layer
kernel_size = 5
epochs = 200

In [22]:
# matching hsu
# Stack of 1-D convolutions followed by a per-position softmax classifier.

model = Sequential()
# input_shape=(None, embedding_dims): the sequence length is left unspecified;
# padding='same' is used on every convolution.
model.add(Conv1D(filters = 128, kernel_size = kernel_size,padding='same', activation='relu',strides=1,input_shape = (None, embedding_dims)))
model.add(Dropout(0.55,))
model.add(Conv1D(filters = 128, kernel_size = kernel_size, padding='same', activation='relu'))
# NOTE(review): the two 256-filter convolutions below specify no activation,
# unlike the two 128-filter ones above - confirm this is intended.
model.add(Conv1D(filters = 256, kernel_size = kernel_size, padding='same'))
model.add(Conv1D(filters = 256, kernel_size = kernel_size, padding='same'))
# model.add(Conv1D(filters = 256, kernel_size = kernel_size, padding='same'))
# 3 output units with softmax: one score per label class.
model.add(Dense(units=3, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
# verbose = False silences the per-epoch progress output.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data = (X_valid, y_valid), verbose = False)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, None, 128)         285568    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         82048     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 256)         164096    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 256)         327936    
_________________________________________________________________
dense_1 (Dense)              (None, None, 3)           771       
Total params: 860,419
Trainable params: 860,419
Non-trainable params: 0
________________________________________________

<keras.callbacks.callbacks.History at 0x7f7a5728adf0>

# Predict

Run this 5 times and take the mean of the resulting scores.

In [25]:
# `Sequential.predict_classes` has been removed from recent Keras/TensorFlow
# releases. Taking the argmax over the class axis of `predict` yields the same
# class ids and works on old and new versions alike.
output_list = np.argmax(model.predict(X_test), axis=-1)
#change df_gold where relevant
gold_series = df_test.label.tolist()
# X_test holds a single sequence, so its predictions are batch element 0
system_predictions = output_list[0]
d = {'gold':gold_series,'predicted':system_predictions}
result = pd.DataFrame(d)
# invert {label: id} to {id: label} to turn predicted ids back into labels
inv_map = {v: k for k, v in label_index.items()}
predictions = result.predicted.map(inv_map)
# NOTE: adds a 'predicted' column to df_test in place
df_test['predicted'] = predictions

In [27]:
# Three-class report. `labels=` pins which class each row refers to, so the
# display names in `target_names` cannot silently get out of step with the
# implicit sorted order of the labels found in the data.
target_names = ['B','I','O']
print(classification_report(result.gold, df_test.predicted, labels=target_names, target_names=target_names))
# Two-class report: fold 'I' into 'B' on both the gold and the predicted side.
# NOTE: adds 'gold_bi' and 'predicted_bi' columns to df_test in place.
df_test['gold_bi']= result.gold.replace({'I': 'B'})
df_test['predicted_bi']= df_test.predicted.replace({'I': 'B'})
target_names = ['B','O']
print(classification_report(df_test.gold_bi, df_test.predicted_bi, labels=target_names, target_names=target_names))

              precision    recall  f1-score   support

           B       0.85      0.88      0.86      1132
           I       0.89      0.56      0.69       571
           O       0.98      0.99      0.98     11049

    accuracy                           0.96     12752
   macro avg       0.90      0.81      0.85     12752
weighted avg       0.96      0.96      0.96     12752

              precision    recall  f1-score   support

           B       0.93      0.84      0.88      1703
           O       0.98      0.99      0.98     11049

    accuracy                           0.97     12752
   macro avg       0.95      0.91      0.93     12752
weighted avg       0.97      0.97      0.97     12752

