# Import Packages

In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import utils
import numpy as np
import pandas as pd
from keras.models import Sequential, load_model
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import LSTM
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Core Model Function

In [2]:
def LSTM_Model(data, wv_from_bin, batch_size=128, kernel_size=5, epoch=8, vec_size=200, loading=False,
               save_path="/Users/colinwan/Desktop/DataFest2020/Models/LSTM.h5"):
    """Train a 3-class LSTM sentiment classifier on top of frozen pre-trained word vectors.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'text' column (the tweets) and a 'sentiment' column
        whose values are 0, 1 or -1. Labels are one-hot encoded in the column
        order [0, 1, -1].
    wv_from_bin : object
        Pre-trained embedding model exposing ``.vocab`` (membership test by
        word) and ``.word_vec(word)``.
        NOTE(review): presumably a gensim KeyedVectors instance - confirm.
    batch_size : int
        Mini-batch size passed to ``model.fit``.
    kernel_size : int
        Currently unused; kept so existing callers keep working.
    epoch : int
        Number of training epochs.
    vec_size : int
        Width of the embedding matrix; each vector returned by
        ``wv_from_bin.word_vec`` is written into a row of this width.
    loading : bool
        Currently unused; kept so existing callers keep working.
    save_path : str
        Destination of the trained model file. Defaults to the location the
        notebook originally hard-coded.

    Returns
    -------
    The trained Keras ``Sequential`` model (also written to ``save_path``).

    Raises
    ------
    ValueError
        If ``data['text']` yields no tokens at all (empty or blank input).
    """
    # Preparing Text Data and Label
    tweet, label = data['text'], data['sentiment']
    # One-hot encode as three 0/1 columns: [is 0, is 1, is -1]
    label = pd.concat([label == 0, label == 1, label == -1], axis=1).astype(int).values

    # Tokenizing Text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tweet)
    sequences = tokenizer.texts_to_sequences(tweet)
    # Measure the tokenized sequences, not a whitespace split of the raw
    # text: the Tokenizer's filters can break one whitespace-delimited chunk
    # into several tokens, and a too-small maxlen would silently truncate.
    max_length = max((len(seq) for seq in sequences), default=0)
    if max_length == 0:
        raise ValueError("data['text'] contains no tokens to train on")

    word_index = tokenizer.word_index
    tweets_pad = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Preparing Embedding Matrix
    # Row 0 is left as zeros (word_index ids start at 1); words missing from
    # the pre-trained vocabulary also keep an all-zero row. Only the corpus
    # vocabulary is looked up instead of copying every pre-trained vector.
    embedding_matrix = np.zeros((len(word_index) + 1, vec_size))
    for word, i in word_index.items():
        if word not in wv_from_bin.vocab:
            continue
        try:
            embedding_matrix[i] = wv_from_bin.word_vec(word)
        except KeyError:
            continue

    # Preparing Embedding Layer (pre-trained weights are frozen)
    embedding_layer = Embedding(len(word_index) + 1,
                                vec_size,
                                weights=[embedding_matrix],
                                input_length=max_length,
                                trainable=False)

    # Constructing LSTM Model Structure
    model = Sequential()
    model.add(embedding_layer)
    model.add(Dropout(0.4))
    model.add(LSTM(128))
    model.add(Dense(64))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(3))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Halve the learning rate when validation loss stalls for 2 epochs
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.000001)
    # summary() prints the table itself and returns None, so it is not wrapped in print()
    model.summary()

    # Make sure the output directory exists before spending time on training,
    # so a bad path fails fast instead of after the final epoch.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Training Model
    model.fit(tweets_pad, label, batch_size=batch_size, epochs=epoch, validation_split=0.1, shuffle=True, callbacks=[reduce_lr])

    # Saving Trained Model
    model.save(save_path)

    return model

In [3]:
if __name__ == "__main__":
    # Class ids for the sentiment labels found in the CSV
    sentiment_ids = {'Neutral': 0, 'Positive': 1, 'Negative': -1}

    # Load Glove Vector
    # NOTE(review): 200 is presumably the vector dimension; it equals
    # LSTM_Model's default vec_size - confirm against utils.
    wv_from_bin = utils.load_embedding_model(200)

    # Load dataset; copy the two-column subset so later assignments act on an
    # independent frame rather than on a slice of the full CSV.
    data = pd.read_csv('/Users/colinwan/Desktop/DataFest2020/Training Folder/Sentiment.csv')
    data = data[['text', 'sentiment']].copy()

    # Replace word with class id. Restricted to the label column so a tweet
    # whose whole text happens to equal a label name is not turned into an int.
    data['sentiment'] = data['sentiment'].replace(sentiment_ids)

    data = utils.clean_text(data)

    # Filter out non-alphabet characters. regex=True is explicit because newer
    # pandas versions treat the pattern as a literal string by default, which
    # would silently leave the text unfiltered.
    data['text'] = data['text'].str.replace(r'[^A-Za-z ]+', '', regex=True)

    # Run model
    LSTM_M = LSTM_Model(data, wv_from_bin, epoch=32)

Loaded vocab size 400000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][i] = " ".join([word for word in df['text'][i].split()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 29, 200)           2492800   
_________________________________________________________________
dropout_1 (Dropout)          (None, 29, 200)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               168448    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
activation_1 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                