In [46]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.backend as K
from nltk import word_tokenize
from collections import Counter
import re, string

#from transformers import *
#import tokenizers
from IPython.core.display import display, HTML
print('TensorFlow',tf.__version__)
#nltk.download('punkt')

TensorFlow 1.14.0


In [417]:
# Load the training set; missing values become empty strings so that the
# string methods used later (.lower(), .split()) never meet a NaN.
train = pd.read_csv('../input/train.csv').fillna('')
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [418]:
# Load the test set the same way as the training set (NaN -> '').
test = pd.read_csv('../input/test.csv').fillna('')
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [629]:
def load_data(data):
    """
    Tokenize raw texts into lists of lower-cased tokens.

    Every text is lower-cased and split on single space characters.
    Punctuation is NOT removed (it stays attached to its token), and two
    consecutive spaces produce an empty-string token.

    Parameters
    ----------
    data : iterable of str
        The raw texts, typically a DataFrame column.

    Returns
    -------
    pd.Series of list of str
        One token list per input text. When `data` is a Series its index is
        kept, so assigning the result back to the originating DataFrame
        aligns row by row even if that frame was filtered or re-indexed.
        For any other iterable a default 0..n-1 index is used.
    """
    tokens = [text.lower().split(' ') for text in data]
    # Without this, a Series with a non-default index would be realigned by
    # label on assignment and silently produce NaN rows.
    index = data.index if isinstance(data, pd.Series) else None
    return pd.Series(tokens, index=index)
def remove_neutral(data):
    """Return the rows of `data` whose 'sentiment' value is not 'neutral'."""
    is_polar = data['sentiment'].ne('neutral')
    return data.loc[is_polar]

In [591]:
# Add a 'tokens' column (lower-cased, space-split text) to both frames.
train['tokens'] = load_data(train['text'])
test['tokens'] = load_data(test['text'])

In [592]:
# Reserved tokens: END is used as the padding symbol, UNK stands in for any
# token that is not part of the vocabulary.
END = '<END>'
UNK = '<UNK>'
def gen_vocab(dataset, min_token_ct=0):
    """
    Build a vocabulary from tokenized sentences, e.g.
    [["this", "set", "1"],
     ["this", "is", "another", "set"],
     ]

    Tokens seen fewer than `min_token_ct` times are dropped. The remaining
    tokens are ordered by descending frequency (ties keep first-seen order)
    and the two reserved tokens are appended at the END of the list, so
    UNK gets id len(vocab) - 2 and END gets id len(vocab) - 1.

    A literal '<UNK>' or '<END>' occurring in the data is not counted as a
    regular token; otherwise it would appear twice in `vocab` and leave
    `rev_vocab` with fewer entries than `vocab`.

    Parameters
    ----------
    dataset : iterable of iterable of str
    min_token_ct : int
        Minimum number of occurrences for a token to be kept.

    Returns
    -------
    vocab : list of str
        Position in the list is the token id.
    rev_vocab : dict of str -> int
        Inverse mapping, token -> id.
    """
    token_ct = Counter(token for row in dataset for token in row)
    reserved = (UNK, END)
    vocab = [token for token, count in token_ct.most_common()
             if count >= min_token_ct and token not in reserved]
    vocab.extend(reserved)
    rev_vocab = {token: token_id for token_id, token in enumerate(vocab)}

    return vocab, rev_vocab

In [593]:
def load_embedding(filename, vocab=None, random_state=None):
    """
    Load an embedding table from CSV, optionally restricted to a vocabulary.

    The file is read with `pd.read_csv`. Tokens are looked up as column
    labels, so every column holds one token's vector and the number of rows
    is the embedding dimension.

    Parameters
    ----------
    filename : str or file-like
        Anything `pd.read_csv` accepts.
    vocab : sequence of str, optional
        When given and non-empty, the result has exactly one row per entry of
        `vocab`, in that order. Tokens missing from the file receive a random
        vector (see below). When None or empty the table is returned as read.
    random_state : int, optional
        Seed for the random vectors of missing tokens. With the default
        (None) NumPy's global random state is used.

    Returns
    -------
    pd.DataFrame
        Without `vocab`: the raw table (columns = tokens).
        With `vocab`: shape (len(vocab), embedding_dim), indexed by token.
    """
    embedding = pd.read_csv(filename)
    # `len()` instead of truthiness so that NumPy arrays / pandas objects are
    # accepted as vocab too; an empty vocab still means "no restriction".
    if vocab is None or len(vocab) == 0:
        return embedding

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    embedding_dim = embedding.shape[0]
    # Square root of the mean squared column norm of the loaded vectors.
    target_norm = np.sqrt((embedding ** 2).sum(axis=0).mean())

    rows = []
    for token in vocab:
        known = embedding.get(token)
        if known is not None:
            rows.append(known.values)
            continue
        # Random direction, rescaled so that its norm is uniform in
        # [0, 2 * target_norm), i.e. target_norm in expectation.
        direction = rng.rand(embedding_dim) - 0.5
        scale = 2 * rng.rand() * target_norm / np.sqrt((direction ** 2).sum())
        rows.append(direction * scale)

    return pd.DataFrame(rows, index=vocab)

In [594]:
def to_embedding(X):
    """
    Replace every vocabulary id in X by its embedding vector.

    input:
    X: 2-D sequence of vocabulary ids, (n_sample, sent_len); the sentence
       length is taken from the first row
    return:
    np.array(n_sample, sent_len, glove.shape[1]) whose entry [i, j] is the
    row of the module-level `glove` matrix selected by X[i][j]
    """
    n_sample, sent_len = len(X), len(X[0])
    vectors = np.zeros((n_sample, sent_len, glove.shape[1]))
    for i, j in np.ndindex(n_sample, sent_len):
        vectors[i, j, :] = glove[X[i][j]]
    return vectors

In [595]:
# Vocabulary of training tokens seen at least 4 times; anything rarer is
# mapped to UNK when sentences are converted to ids (see to_label_X).
vocab, re_vocab = gen_vocab(train['tokens'], 4)

In [596]:
# Embedding table restricted to `vocab`: one row per vocabulary entry, in
# vocabulary order.
glove = load_embedding('../input/glove_6B_100d_top100k.csv', vocab=vocab)
# NOTE(review): this is not the last expression of the cell, so the preview
# is never displayed.
glove.T.head()
# Keep only the raw NumPy matrix; from here on `glove` is indexed by
# vocabulary id.
glove = glove.values

In [630]:
# Sequence length handed to pad_sequences (maxlen) and to the Embedding layer
# (input_length).
MAX_LEN = 96
#sentiment_tar = {'positive': 1, 'negative': -1, 'neutral': 0}
# Binary class ids. 'neutral' has no entry, so to_label_Y raises KeyError for
# neutral rows; they have to be filtered out first (see remove_neutral).
sentiment_tar = {'positive': 1, 'negative': 0}

In [631]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical 
import random

def to_label_X(token):
    """Return the vocabulary id of `token`, or the id of UNK if it is unknown."""
    unk_id = re_vocab[UNK]
    return re_vocab.get(token, unk_id)
    
def to_label_Y(sentiment):
    """Return the class id of a sentiment name; KeyError if it is not in `sentiment_tar`."""
    class_id = sentiment_tar[sentiment]
    return class_id
    
def gen_sample_RNN(data, batch_size=100, one_hot=True, shuffle=True):
    """
    Endlessly yield (X, Y) mini-batches for the sentiment classifier.

    inputs:
    data: DataFrame with a 'tokens' column (list of str per row) and a
          'sentiment' column whose values are keys of `sentiment_tar`
    batch_size: int, rows per batch; -1 puts the whole frame into one batch
    one_hot: boolean, one-hot encode the labels
    shuffle: boolean, draw a new row order at the start of every pass over
             `data` (the original code only mentioned this in a comment and
             never actually shuffled)
    output (per yielded batch of n rows):
    X: np.array(n, MAX_LEN) of vocabulary ids, post-padded with the id of END
    Y: np.array(n, len(sentiment_tar)) if one_hot, else a list of n class ids

    The last batch of a pass may hold fewer than `batch_size` rows. It is
    yielded instead of being dropped, which also means the generator can no
    longer spin forever without yielding when batch_size > len(data).

    raises:
    ValueError (on the first `next()`) if `data` has no rows.
    """
    if len(data) == 0:
        raise ValueError('gen_sample_RNN() needs at least one row of data')
    if batch_size == -1:
        batch_size = len(data)
    pad_id = to_label_X(END)

    def _as_batch(token_ids, labels):
        # Turn the accumulated Python lists into the arrays Keras expects.
        X = pad_sequences(sequences=token_ids, maxlen=MAX_LEN,
                          padding='post', value=pad_id)
        Y = to_categorical(labels, num_classes=len(sentiment_tar)) if one_hot else labels
        return X, Y

    while True:
        # A fresh permutation per pass so batches differ between epochs.
        epoch = data.sample(frac=1) if shuffle else data
        token_ids, labels = [], []
        for tokens, sentiment in zip(epoch['tokens'], epoch['sentiment']):
            token_ids.append([to_label_X(t) for t in tokens])
            labels.append(to_label_Y(sentiment))
            if len(token_ids) >= batch_size:
                yield _as_batch(token_ids, labels)
                token_ids, labels = [], []
        if token_ids:
            # Left-over rows of this pass.
            yield _as_batch(token_ids, labels)

In [632]:
# Validation batch: every non-neutral row of `test` as one (X, Y) pair
# (batch_size=-1 means "all rows in a single batch").
X_dev_RNN, y_dev_RNN = next(gen_sample_RNN(remove_neutral(test), batch_size=-1, one_hot=True))

In [674]:
from keras.layers import Dense, LSTM, Activation, TimeDistributed, Embedding, Conv1D, GlobalMaxPooling1D, Bidirectional
from keras.models import Sequential
from keras.layers import Dropout

# Binary sentiment classifier over padded id sequences.
# NOTE(review): despite the name, no recurrent layer is active - the LSTM
# variants below are commented out and the model is convolutional.
RNN_pred_model = Sequential()
#RNN_pred_model.add(LSTM(128, input_shape=(MAX_LEN, glove.shape[1]), return_sequences=True))
# Embedding initialised from the `glove` matrix; per the summary output all
# parameters are trainable, so these weights are fine-tuned during fitting.
RNN_pred_model.add(Embedding(input_dim=len(vocab), weights=[glove],
                             input_length=MAX_LEN, output_dim=glove.shape[1]))
RNN_pred_model.add(Dropout(0.4))
#RNN_pred_model.add(Bidirectional(LSTM(128, input_shape=(MAX_LEN, glove.shape[1]), return_sequences=True)))
#RNN_pred_model.add(Bidirectional(LSTM(128, input_shape=(MAX_LEN, glove.shape[1]), return_sequences=True)))
# Dense on a 3-D input acts on the last axis, i.e. position by position
# (output stays (None, 96, 100) in the summary).
RNN_pred_model.add(Dense(100, activation="relu"))
# 300 filters over windows of 4 consecutive positions -> (None, 93, 300).
RNN_pred_model.add(Conv1D(filters=300, kernel_size=4, activation="relu")) 
# Max over all positions: one 300-d feature vector per sequence.
RNN_pred_model.add(GlobalMaxPooling1D())
# One softmax output per entry of sentiment_tar.
RNN_pred_model.add(Dense(len(sentiment_tar), activation='softmax'))
RNN_pred_model.summary()

Model: "sequential_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_22 (Embedding)     (None, 96, 100)           632400    
_________________________________________________________________
dropout_12 (Dropout)         (None, 96, 100)           0         
_________________________________________________________________
dense_49 (Dense)             (None, 96, 100)           10100     
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 93, 300)           120300    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 300)               0         
_________________________________________________________________
dense_50 (Dense)             (None, 2)                 602       
Total params: 763,402
Trainable params: 763,402
Non-trainable params: 0
_______________________________________________

In [675]:
from keras.optimizers import Adam
adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)

RNN_pred_model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'])
batch_size = 64
# The generator is fed only the non-neutral rows, so the epoch length has to
# be derived from those rows (not from len(train)), and it must be an integer
# number of batches rather than the float that `/` produces.
train_polar = remove_neutral(train)
steps_per_epoch = int(np.ceil(len(train_polar) / batch_size))
RNN_pred_model.fit_generator(
    gen_sample_RNN(train_polar, batch_size=batch_size, one_hot=True),
    validation_data=(X_dev_RNN, y_dev_RNN),
    epochs=10, steps_per_epoch=steps_per_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x150c20400>

In [682]:
def seqtosent(model, text):
    """
    Score one token sequence with the sentiment classifier.

    The tokens are mapped to vocabulary ids, post-padded with the id of END
    up to MAX_LEN and passed to `model.predict` as a batch of one.

    Returns the second component of the model output for that sequence,
    i.e. the output for class id 1 ('positive' in `sentiment_tar`).
    """
    token_ids = [to_label_X(tok) for tok in text]
    padded = pad_sequences(sequences=[token_ids], maxlen=MAX_LEN,
                           padding='post', value=to_label_X(END))
    class_probs = model.predict(padded)[0]
    # Dotting with [0, 1] selects the class-1 output.
    return np.dot([0, 1], class_probs)

def select_text(model, text, sentiment, max_tokens=14):
    """
    Pick the contiguous token span that best expresses `sentiment`.

    Parameters
    ----------
    model : classifier accepted by `seqtosent`
    text : list of str
        Tokens of one tweet.
    sentiment : str
        'neutral' or a key of `sentiment_tar`.
    max_tokens : int
        Texts longer than this are returned whole; the search below needs
        n * (n + 1) / 2 model calls for n tokens.

    Returns
    -------
    list of str
        The whole `text` when the sentiment is neutral, the text is empty or
        longer than `max_tokens`, or the model's prediction for the full text
        disagrees with `sentiment`. Otherwise the span with the highest
        class-1 score (class id 1) or the lowest class-1 score (any other
        class id); on ties the earliest start, then the shortest span wins.
    """
    if sentiment == 'neutral':
        return text
    n_tokens = len(text)
    y = to_label_Y(sentiment)
    # Cheap checks first: no model call is needed to reject these.
    if n_tokens == 0 or n_tokens > max_tokens:
        return text
    # The product is negative when score and label lie on opposite sides of 0.5.
    if (seqtosent(model, text) - 0.5) * (y - 0.5) < 0:
        return text

    scores = {}
    for start in range(n_tokens):
        # Lengths running past the end of the text would only re-score the
        # same truncated slice, so stop at the last token.
        for length in range(1, n_tokens - start + 1):
            scores[(start, length)] = seqtosent(model, text[start:start + length])

    pick = max if y == 1 else min
    start, length = pick(scores, key=scores.get)
    return text[start:start + length]

In [683]:
def jaccard(str1, str2):
    """
    Word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared words divided by the number of distinct words
    overall. Two strings without any words score 1.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    if not words_a and not words_b:
        return 1
    shared = words_a & words_b
    return float(len(shared)) / len(words_a | words_b)

In [687]:
def calc_jaccard_baseline(data):
    """
    Mean Jaccard score of the trivial "select the whole text" prediction.

    For every row of `data` the tokens in 'tokens' are joined with spaces and
    compared against 'selected_text' with `jaccard`; the average over all
    rows is returned.
    """
    total = 0
    for truth, tokens in zip(data['selected_text'], data['tokens']):
        total += jaccard(truth, ' '.join(tokens))
    return total / len(data)

def calc_jaccard(data, model=None):
    """
    Mean Jaccard score of the spans chosen by `select_text`.

    Parameters
    ----------
    data : DataFrame with 'tokens', 'sentiment' and 'selected_text' columns
    model : classifier passed on to `select_text`, optional
        Defaults to the notebook-level `RNN_pred_model`; pass a model
        explicitly to evaluate a different one.

    Returns
    -------
    float
        Average over all rows of jaccard(selected_text, predicted span),
        where the predicted tokens are joined with single spaces.
    """
    if model is None:
        model = RNN_pred_model
    total = 0
    rows = zip(data['tokens'], data['sentiment'], data['selected_text'])
    for tokens, sentiment, truth in rows:
        predicted = ' '.join(select_text(model, tokens, sentiment))
        total += jaccard(truth, predicted)
    return total / len(data)

In [688]:
# Mean Jaccard over the whole training frame (select_text returns neutral
# rows unchanged).
# NOTE(review): `train` contains the rows the classifier was fitted on, so
# this is an in-sample figure.
calc_jaccard(train)

0.592234932898034

In [689]:
# Whole-text baseline restricted to the neutral rows of the training frame.
calc_jaccard_baseline(train[train['sentiment']=='neutral'])

0.9764467881939682

In [690]:
# Span selection scored on the positive/negative training rows only.
calc_jaccard(remove_neutral(train))

0.3311784391511351