In [82]:
import json
from sklearn.preprocessing import MultiLabelBinarizer
from numpy.random import shuffle
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, \
Dropout, Activation, Input, Flatten, Concatenate
from keras.models import Model

import matplotlib.pyplot as plt
import pandas as pd

MAX_NB_WORDS = 7500
MAX_DOC_LEN = 100
EMBEDDING_DIM = 100


def find_avg_wordlen(text):
    print("Average length of sentences is 90")
    a = np.asarray(text)
    c = np.array([])
    for i in a:
        b = len(i.split())
        c = np.append(c, b)
    print(np.mean(c))


def build_tokenizer(text):
    # documents are quite long in the dataset

    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    voc = tokenizer.word_index
    # convert each document to a list of word index as a sequence
    sequences = tokenizer.texts_to_sequences(text)
    # get the mapping between words to word index

    # pad all sequences into the same length (the longest)
    padded_sequences = pad_sequences(sequences, \
                                     maxlen=MAX_DOC_LEN, \
                                     padding='post', truncating='post')

    # print(padded_sequences[6])
    return padded_sequences

def build_model():

    # define input layer, where a sentence represented as
    # 1 dimension array with integers
    main_input = Input(shape=(MAX_DOC_LEN,), dtype='int32', name='main_input')

    # define the embedding layer
    # input_dim is the size of all words +1
    # where 1 is for the padding symbol
    # output_dim is the word vector dimension
    # input_length is the max. length of a document
    # input to embedding layer is the "main_input" layer
    embed_1 = Embedding(input_dim=MAX_NB_WORDS + 1,
                        output_dim=EMBEDDING_DIM,
                        input_length=MAX_DOC_LEN,
                        name='embedding')(main_input)

    conv1d_1 = Conv1D(filters=32, kernel_size=1,
                      name='conv_unigram',
                      activation='relu')(embed_1)

    pool_1 = MaxPooling1D(MAX_DOC_LEN - 1 + 1, \
                          name='pool_unigram')(conv1d_1)

    flat_1 = Flatten(name='flat_unigram')(pool_1)

    conv1d_2 = Conv1D(filters=32, kernel_size=2, \
                      name='conv_bigram', \
                      activation='relu')(embed_1)
    pool_2 = MaxPooling1D(MAX_DOC_LEN - 2 + 1, name='pool_bigram')(conv1d_2)
    flat_2 = Flatten(name='flat_bigram')(pool_2)

    conv1d_3 = Conv1D(filters=32, kernel_size=3, \
                      name='conv_trigram', activation='relu')(embed_1)
    pool_3 = MaxPooling1D(MAX_DOC_LEN - 3 + 1, name='pool_trigram')(conv1d_3)
    flat_3 = Flatten(name='flat_trigram')(pool_3)

    z = Concatenate(name='concate')([flat_1, flat_2, flat_3])

    # Create a dropout layer
    # In each iteration only 50% units are turned on
    drop_1 = Dropout(rate=0.5, name='dropout')(z)

    # Create a dense layer
    dense_1 = Dense(96, activation='relu')(drop_1)
#     drop_1 = Dropout(rate=0.5)(dense_1)
#     dense_1 = Dense(100, activation='relu')(drop_1)
#     dense_1 = Dense(100, activation='relu')(dense_1)
    
#     dense_1 = Dense(128, activation='relu')(dense_1)
    # Create the output layer
    preds = Dense(5, activation='softmax', name='output')(dense_1)

    # create the model with input layer
    # and the output layer
    model = Model(inputs=main_input, outputs=preds)

    return model

if __name__ == "__main__":
    df = pd.read_csv("labeled_il_reviews_large.csv", header = 0)
    # print(semEval_df.head())

    text = df['text'].values
    tags = df['tags'].values
    print(len(text))
    labels = []

    for x in tags:
        tag = x.split(", ")   # tags are separated by comma and space. Eg, Cleanliness, Food
        labels.append(tag)

    data = list(zip(*(text, labels)))

    text = [x[0] for x in data]
    find_avg_wordlen(text)
    mlb = MultiLabelBinarizer()
    Y = mlb.fit_transform(labels)
    print(mlb.classes_)
    print(np.sum(Y, axis=0))

    padded_sequences = build_tokenizer(text)

    # X_train, X_test, y_train, y_test = train_test_split(\
    #                         padded_sequences, Y,\
    #                         test_size=0.3, random_state=1)
    
    X_train = padded_sequences[:320]
    X_val = padded_sequences[320:370]
    X_test = padded_sequences[370:]
    
    y_train = Y[:320]
    y_val = Y[320:370]
    y_test = Y[370:]
    
    cnn_model = build_model()
    print(cnn_model.summary())

    cnn_model.compile(loss="binary_crossentropy",
                      optimizer="adam",
                      metrics=["accuracy"])
    BATCH_SIZE = 64
    NUM_EPOCHES = 50

    # fit the model and save fitting history to "training"
    training = cnn_model.fit(X_train, y_train, \
                         batch_size=BATCH_SIZE, \
                         epochs=NUM_EPOCHES, \
                         validation_data=[X_val, y_val])
    
    pred=cnn_model.predict(X_test)
    # evaluate the model
    scores = cnn_model.evaluate(X_test, y_test, verbose=0)
    print("%s: %.2f%%" % (cnn_model.metrics_names[1], scores[1]*100))
    
#     df = pd.DataFrame.from_dict(training.history)
#     df.columns = ["train_acc", "train_loss", \
#                   "val_acc", "val_loss"]
#     df.index.name = 'epoch'
#     print(df)

#     # plot training history
#     fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 3));

#     df[["train_acc", "val_acc"]].plot(ax=axes[0]);
#     df[["train_loss", "val_loss"]].plot(ax=axes[1]);
#     plt.show();

413
Average length of sentences is 90
91.57869249394673
['Ambience' 'Cleanliness' 'Food' 'None' 'Service']
[ 65  45 365  17 212]
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 100)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 100)     750100      main_input[0][0]                 
__________________________________________________________________________________________________
conv_unigram (Conv1D)           (None, 100, 32)      3232        embedding[0][0]                  
__________________________________________________________________________________________________
conv_bigram (Conv1D)            (None, 99, 32)       6432        embedding[0][0

Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
acc: 79.53%


In [125]:
import numpy as np
p = [[[9.6844585e-04, 3.6410633e-02, 1.6466923e-03, 9.5035657e-03, 3.2031160e-02,
   4.1820608e-02, 2.0942863e-03, 2.9957650e-04, 3.8437967e-03, 8.0282564e-07,
   7.2959522e-03, 5.0763118e-01, 5.2764505e-02, 5.1746462e-03, 3.1874825e-03,
   2.2540364e-01, 6.6833437e-04, 2.0476615e-04, 3.2938635e-03, 3.3450838e-02,
   9.9030335e-04, 8.0543815e-04, 4.4468492e-05, 2.9199455e-02, 1.9802197e-04,
   3.2074266e-05, 4.2012878e-05, 6.2628539e-04, 3.6239010e-04, 4.7610620e-06]]]
p = p[0, 0,:]
p = np.power(p, (1/0.5))
p = p/np.sum(p)
p = p[0,0,:]
prob = np.random.multinomial(1, p, 1)
print(p)
print(prob)
print(np.argmax(prob))

TypeError: list indices must be integers or slices, not tuple

In [126]:
temperature = 0.5
a = [[[9.6844585e-04, 3.6410633e-02, 1.6466923e-03, 9.5035657e-03, 3.2031160e-02,
   4.1820608e-02, 2.0942863e-03, 2.9957650e-04, 3.8437967e-03, 8.0282564e-07,
   7.2959522e-03, 5.0763118e-01, 5.2764505e-02, 5.1746462e-03, 3.1874825e-03,
   2.2540364e-01, 6.6833437e-04, 2.0476615e-04, 3.2938635e-03, 3.3450838e-02,
   9.9030335e-04, 8.0543815e-04, 4.4468492e-05, 2.9199455e-02, 1.9802197e-04,
   3.2074266e-05, 4.2012878e-05, 6.2628539e-04, 3.6239010e-04, 4.7610620e-06]]]
a = np.log(a) / temperature 
a = a[0,0,:]
dist = np.exp(a)/np.sum(np.exp(a)) 
choices = range(len(a)) 
np.random.choice(choices, p=dist)

11