In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os,sys

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from keras.models import Model
from keras.layers import Input,Embedding,Dense
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam

from sklearn.metrics import roc_auc_score,log_loss

import keras.backend as K
# English stop words from NLTK's stopwords corpus; passed to clean_text() so these tokens are dropped.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
stop_list = stopwords.words('english')

In [None]:
# some configuration
# maxlen handed to pad_sequences and the length of the model's Input
MAX_SEQUENCE_LENGTH = 280
# num_words given to the Tokenizer and upper bound on the embedding matrix rows
MAX_VOCAB_SIZE = 40000
# width of each row of the embedding matrix; the GloVe file loaded below is the 300d one
EMBEDDING_DIM = 300
# passed to model.fit as validation_split
VALIDATION_SPLIT = 0.3
# passed to model.fit as batch_size / epochs
BATCH_SIZE = 16
EPOCHS = 4


In [None]:
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
# Open the file as UTF-8 explicitly so that loading does not depend on the
# platform's default locale encoding.
with open(r'../input/glove6b/glove.6B.300d.txt', encoding='utf-8') as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    if not values:
      # skip blank lines instead of failing on values[0]
      continue
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

In [None]:
def clean_text(text,stop_list):
    '''
    Normalise a piece of text before it is tokenised.

    Lower-cases the text, deletes digits, replaces every run of
    non-alphanumeric characters with a single space, tokenises the result
    with word_tokenize and removes the tokens contained in stop_list.

    :param text: input text string; a non-string value (e.g. None or NaN for
                 a missing entry) is returned unchanged so the caller can
                 deal with it, for instance with fillna
    :param stop_list: collection of stop words to remove (compared against
                      the lower-cased tokens)
    :return: cleaned text, remaining tokens joined by single spaces
    '''
    if not isinstance(text, str):
        # missing values arrive as None/float NaN and have no .lower()
        return text
    text = text.lower()
    # raw string: "\d" in a normal string literal is an invalid escape sequence
    text = re.sub(r"\d", "", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    if not text.strip():
        # nothing left to tokenise
        return ""
    # set membership is O(1); a list would be scanned once per token
    stop_words = stop_list if isinstance(stop_list, (set, frozenset)) else set(stop_list)
    return " ".join(w for w in word_tokenize(text) if w not in stop_words)

In [None]:
# prepare text samples and their labels
print('Loading in comments...')

train = pd.read_csv(r"../input/mh-sentiment-analysis/train.csv")
# one indicator column per distinct Sentiment value, joined back onto the frame
target_dummies = pd.get_dummies(train.Sentiment)
train = train.join(target_dummies,how='outer')
train = train.drop(['ID','author','Sentiment'],axis=1)
# Fill missing reviews BEFORE cleaning: clean_text calls string methods on its
# input, so a fillna placed after the apply could never take effect.
train['Review'] = train['Review'].fillna("DUMMY_VALUE").apply(lambda x: clean_text(x,stop_list))
sentences = train["Review"].values
# indicator columns used as the model targets, in output order
possible_labels = [0, 1, 2]
targets = train[possible_labels].values

In [None]:
# convert the sentences (strings) into integers
# Fit the vocabulary on the training sentences, then map each sentence to a
# list of integer word ids. The same tokenizer object is used again further
# down for the test reviews.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# get word -> integer mapping
# (taken from the fitted tokenizer; reused below to fill the embedding matrix)
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))


# pad sequences so that we get a N x T matrix
# (T is MAX_SEQUENCE_LENGTH; `data` is what model.fit is trained on)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

In [None]:
# prepare embedding matrix
print('Filling pre-trained embeddings...')
# one row per word id, capped at MAX_VOCAB_SIZE rows
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, idx in word2idx.items():
    if idx >= MAX_VOCAB_SIZE:
        # id lies outside the capped vocabulary: there is no row for it
        continue
    pretrained = word2vec.get(token)
    if pretrained is None:
        # no pre-trained vector for this word: its row stays all zeros
        continue
    embedding_matrix[idx] = pretrained

In [None]:
# wrap the pre-trained vectors in an Embedding layer
# trainable=False keeps the embeddings fixed while the rest of the model trains
embedding_layer = Embedding(
  input_dim=num_words,
  output_dim=EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False,
)

In [None]:
print('Building model...')

# Bidirectional LSTM over the frozen embeddings, max-pooled over the time axis,
# followed by a dense layer with one output per entry of possible_labels.
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
# The targets are one-hot rows built with get_dummies (exactly one sentiment
# per review), i.e. the classes are mutually exclusive. Softmax with
# categorical cross-entropy makes the outputs a probability distribution over
# the classes and makes 'accuracy' mean categorical accuracy instead of
# per-output binary accuracy.
output = Dense(len(possible_labels), activation="softmax")(x)

model = Model(input_, output)
model.compile(
  loss='categorical_crossentropy',
  # `learning_rate` replaces the deprecated `lr` argument
  optimizer=Adam(learning_rate=0.001),
  metrics=['accuracy'],
)


In [None]:
print('Training model...')
# Train on the padded sequences against the one-hot targets; the returned
# History object `r` is what the plotting cell reads.
# NOTE(review): per the Keras docs, validation_split takes the validation
# samples from the end of the arrays before shuffling — confirm train.csv is
# not ordered by label.
r = model.fit(
  data,
  targets,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)

In [None]:
# One figure per metric: training curve against validation curve, per epoch.
curve_specs = (
    # (history key, validation history key, legend label, validation legend label)
    ('loss', 'val_loss', 'loss', 'val_loss'),
    # accuracies
    ('accuracy', 'val_accuracy', 'acc', 'val_acc'),
)
for train_key, val_key, train_label, val_label in curve_specs:
    plt.plot(r.history[train_key], label=train_label)
    plt.plot(r.history[val_key], label=val_label)
    plt.legend()
    plt.show()

In [None]:
# Per-column ROC AUC and log loss, averaged over the three label columns.
# Note: this scores the same `data` matrix that was passed to model.fit, so it
# is not a held-out estimate.
p = model.predict(data)
aucs = [roc_auc_score(targets[:, col], p[:, col]) for col in range(3)]
LogL = [log_loss(targets[:, col], p[:, col]) for col in range(3)]
print(np.mean(aucs))
print(np.mean(LogL))

## Test Prediction

In [None]:
test = pd.read_csv(r'../input/mh-sentiment-analysis/test.csv')
# fill missing reviews before cleaning (clean_text calls string methods on its input)
test['Review'] = test['Review'].fillna("DUMMY_VALUE").apply(lambda x :clean_text(x,stop_list))
test_sentence = test['Review'].values
# Reuse the tokenizer exactly as it was fitted on the training sentences.
# Fitting it again on the test reviews would rebuild word_index from the
# combined word counts, so word ids could differ from the ones the embedding
# matrix and the trained model were built on (and could exceed the number of
# embedding rows).
test_sequences = tokenizer.texts_to_sequences(test_sentence)

In [None]:
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))
# pad sequences so that we get a N x T matrix
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
# report the shape of the padded test matrix that was just built
print('Shape of data tensor:', test_data.shape)

In [None]:
# model outputs for every padded test review; the output layer has one unit per entry of possible_labels
final_prediction = model.predict(test_data)

## Submission File

In [None]:
# Fill the sample submission with one score column per model output and save it.
submission = pd.read_csv(r'../input/mh-sentiment-analysis/submission.csv')
score_columns = ('Negative_0', 'Neutral_1', 'Positive_2')
for col_idx, col_name in enumerate(score_columns):
    # column order follows the order of the model outputs
    submission[col_name] = final_prediction[:, col_idx]
submission.to_csv("LSTM_Model_V2.csv",index=False)