In [10]:
import os

import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

import utils

In [13]:
def CNN_Model(data, wv_from_bin, batch_size=128, kernel_size=5, filters=3, max_length=1000, vec_size=200,
              epochs=8, checkpoint_dir="/Users/colinwan/Desktop/DataFest2020/checkpoint/CNN"):
    """Build and train a 1-D convolutional sentiment regressor on tokenized text.

    The text column is tokenized, padded to ``max_length`` and fed through a
    frozen embedding layer initialised from ``wv_from_bin``, four stacked
    ``Conv1D`` layers and a dense head ending in a single ``tanh`` unit.
    The model is compiled with mean squared error and the Adam optimizer.

    Parameters
    ----------
    data : mapping with ``'text'`` and ``'sentiment'`` entries
        ``data['text']`` is passed to the Keras ``Tokenizer``;
        ``data['sentiment']`` is used as the regression target.
    wv_from_bin : object exposing ``.vocab`` and ``.word_vec(word)``
        Source of pre-trained word vectors (presumably a gensim
        ``KeyedVectors`` instance -- confirm against ``utils``).
    batch_size : int
        Mini-batch size passed to ``model.fit``.
    kernel_size : int
        Kernel width shared by all four convolution layers.
    filters : int
        Number of filters in the *first* convolution only; the following
        layers are fixed at 300, 150 and 75.
        NOTE(review): the default of 3 creates a very narrow first layer --
        confirm this is intentional.
    max_length : int
        Sequences are post-padded to this length (``pad_sequences`` default
        truncation applies to longer ones).
    vec_size : int
        Width of each embedding row; must match the vectors returned by
        ``wv_from_bin.word_vec``.
    epochs : int
        Number of training epochs (previously hard-coded to 8).
    checkpoint_dir : str
        Directory that receives the per-epoch ``.hdf5`` checkpoints. It is
        created if missing. Defaults to the previously hard-coded location.

    Returns
    -------
    The trained Keras ``Sequential`` model.
    """
    # Preparing Text Data and Label
    tweet, label = data['text'], data['sentiment']

    # Tokenizing Text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(tweet)
    sequences = tokenizer.texts_to_sequences(tweet)
    word_index = tokenizer.word_index
    tweets_pad = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Preparing Embedding Matrix
    # Row 0 is reserved for padding; rows of words missing from the
    # pre-trained vocabulary stay all-zero. Only the tokenizer's own words are
    # looked up, instead of first copying the entire pre-trained vocabulary
    # into an intermediate dict.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, vec_size))
    pretrained_vocab = wv_from_bin.vocab
    for word, i in word_index.items():
        if word in pretrained_vocab:
            embedding_matrix[i] = wv_from_bin.word_vec(word)

    # Preparing Embedding Layer (frozen: pre-trained weights are not updated)
    embedding_layer = Embedding(vocab_size,
                                vec_size,
                                weights=[embedding_matrix],
                                input_length=max_length,
                                trainable=False)

    # Constructing CNN Model Structure
    model = Sequential()
    model.add(embedding_layer)
    model.add(Dropout(0.4))
    model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(Conv1D(300, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(Conv1D(150, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(Conv1D(75, kernel_size, padding='valid', activation='relu', strides=1))
    model.add(Flatten())
    model.add(Dense(600))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('tanh'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

    # Checkpointing / learning-rate schedule
    # Create the target directory up front so a missing folder fails before
    # training starts rather than when the first checkpoint is written.
    os.makedirs(checkpoint_dir, exist_ok=True)
    filepath = os.path.join(
        checkpoint_dir,
        "4cnn-{epoch:02d}-{loss:0.3f}-{mean_squared_error:0.3f}-{val_loss:0.3f}-{val_mean_squared_error:0.3f}.hdf5")
    # NOTE(review): the checkpoint tracks *training* loss while the LR schedule
    # tracks validation loss -- confirm that "best" should not mean val_loss.
    checkpoint = ModelCheckpoint(filepath, monitor="loss", verbose=1, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.000001)

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None").
    model.summary()

    # Training Model
    model.fit(tweets_pad, label, batch_size=batch_size, epochs=epochs, validation_split=0.1,
              shuffle=True, callbacks=[checkpoint, reduce_lr])

    return model

In [3]:
if __name__ == "__main__":
    # Load the pre-trained word vectors through the project helper.
    # NOTE(review): 200 is presumably the embedding dimensionality and should
    # match CNN_Model's vec_size default -- confirm in utils.
    wv_from_bin = utils.load_embedding_model(200)

    data = pd.read_csv('/Users/colinwan/Desktop/DataFest2020/Training Folder/Sentiment.csv')
    # Work on an explicit copy of the two needed columns so later in-place
    # edits do not target a slice of the frame read from disk.
    data = data[['text', 'sentiment']].copy()

    # Encode the labels numerically. The replacement is restricted to the
    # 'sentiment' column: a frame-wide replace would also turn a tweet whose
    # text is exactly "Neutral"/"Positive"/"Negative" into an integer.
    data['sentiment'] = data['sentiment'].replace({'Neutral': 0, 'Positive': 1, 'Negative': -1})

    data = utils.clean_text(data)

    # Keep only ASCII letters and spaces. regex=True is passed explicitly:
    # without it, newer pandas treats the pattern as a literal string and
    # nothing would be removed.
    data['text'] = data['text'].str.replace('[^A-Za-z ]+', '', regex=True)

    # Keep a handle on the trained model instead of discarding it.
    model = CNN_Model(data, wv_from_bin)

Loaded vocab size 400000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'][i] = " ".join([word for word in df['text'][i].split()


NameError: name 'Sequential' is not defined

In [14]:
# Re-run training. This cell depends on `data` and `wv_from_bin` already being
# present in the kernel from the loading cell; the trained model is bound to a
# name so it is not lost once the (long) fit finishes.
model = CNN_Model(data, wv_from_bin)


Train on 12483 samples, validate on 1388 samples
Epoch 1/8
  256/12483 [..............................] - ETA: 13:19 - loss: 0.7043 - mean_squared_error: 0.7043

KeyboardInterrupt: 