In [2]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

## Read Data

In [5]:
data = pd.read_excel('C:/Users/Admin/OneDrive/Desktop/INS3008-Projects/Comment-shopee.xlsx', header = None)

In [6]:
data.columns = ['Text','Label']

In [7]:
data.head()

Unnamed: 0,Text,Label
0,Chất liệu: okii\nzừi zừi đẹp không có chỗ chê ...,0
1,Chất liệu: okii\nzừi zừi đẹp không có chỗ chê ...,0
2,Chất liệu: nhung tăm\nMàu sắc: be với đen\nĐún...,0
3,Chất liệu: nhung tăm\nMàu sắc: be với đen\nĐún...,0
4,Chất liệu: Ok\nMàu sắc: đen\nĐúng với mô tả: b...,0


In [8]:
len(data)

3363

In [9]:
data.Label.unique()

array([0, 1, 2], dtype=int64)

In [10]:
data.shape

(3363, 2)

In [11]:
pos = []
neg = []
neu = []
#Pos: 0, Neg:2, Neu:1
#False: 0, True:1
for l in data.Label:
    if l == 0:
        pos.append(1)
        neg.append(0)
        neu.append(0)
    elif l == 1:
        pos.append(0)
        neg.append(0)
        neu.append(1)
    else:
        pos.append(0)
        neg.append(1)
        neu.append(0)

In [12]:
data['Pos']= pos
data['Neg']= neg
data['Neu']= neu

In [13]:
data.head()

Unnamed: 0,Text,Label,Pos,Neg,Neu
0,Chất liệu: okii\nzừi zừi đẹp không có chỗ chê ...,0,1,0,0
1,Chất liệu: okii\nzừi zừi đẹp không có chỗ chê ...,0,1,0,0
2,Chất liệu: nhung tăm\nMàu sắc: be với đen\nĐún...,0,1,0,0
3,Chất liệu: nhung tăm\nMàu sắc: be với đen\nĐún...,0,1,0,0
4,Chất liệu: Ok\nMàu sắc: đen\nĐúng với mô tả: b...,0,1,0,0


## Clean data

In [14]:
def remove_punct(text):
    text_nopunct = ''
    text_nopunct = re.sub('['+string.punctuation+']', '', str(text))
    return text_nopunct

data['Text_Clean'] = data['Text'].apply(lambda x: remove_punct(x))

In [15]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
from nltk import word_tokenize, WordNetLemmatizer
tokens = [word_tokenize(sen) for sen in data.Text_Clean] 

In [17]:
def lower_token(tokens): 
    return [w.lower() for w in tokens]    
    
lower_tokens = [lower_token(token) for token in tokens] 

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

In [20]:
def remove_stop_words(tokens): 
    return [word for word in tokens if word not in stoplist]

In [21]:
filtered_words = [remove_stop_words(sen) for sen in lower_tokens] 

In [22]:
result = [' '.join(sen) for sen in filtered_words] 

In [23]:
data['Text_Final'] = result

In [24]:
data['tokens'] = filtered_words

In [25]:
data = data[['Text_Final', 'tokens', 'Label', 'Pos', 'Neg','Neu']]

In [26]:
data[:5]

Unnamed: 0,Text_Final,tokens,Label,Pos,Neg,Neu
0,chất liệu okii zừi zừi đẹp không có chỗ chê lu...,"[chất, liệu, okii, zừi, zừi, đẹp, không, có, c...",0,1,0,0
1,chất liệu okii zừi zừi đẹp không có chỗ chê lu...,"[chất, liệu, okii, zừi, zừi, đẹp, không, có, c...",0,1,0,0
2,chất liệu nhung tăm màu sắc với đen đúng với m...,"[chất, liệu, nhung, tăm, màu, sắc, với, đen, đ...",0,1,0,0
3,chất liệu nhung tăm màu sắc với đen đúng với m...,"[chất, liệu, nhung, tăm, màu, sắc, với, đen, đ...",0,1,0,0
4,chất liệu ok màu sắc đen đúng với mô tả bình t...,"[chất, liệu, ok, màu, sắc, đen, đúng, với, mô,...",0,1,0,0


## Split data

In [27]:
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [28]:
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

112326 words total, with a vocabulary size of 7385
Max sentence length is 175


In [29]:
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

12194 words total, with a vocabulary size of 1893
Max sentence length is 141


## Load Google News Word2Vec model

In [37]:
word2vec_path = 'C:/Users/Admin/OneDrive/Desktop/INS3008-Projects/GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [38]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

## Get Embeddings

In [39]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [40]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

## Tokenize and Pad sequences

In [41]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 7385 unique tokens.


In [42]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [43]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(7386, 300)


In [44]:
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Define CNN

In [45]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [46]:
label_names = ['Pos', 'Neg','Neu']

In [47]:
y_train = data_train[label_names].values

In [62]:
y_test = data_test[label_names].values

In [48]:
x_train = train_cnn_data
y_tr = y_train

In [63]:
x_test = test_cnn_data

In [49]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 50, 300)      2215800     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 49, 200)      120200      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 48, 200)      180200      embedding[0][0]                  
______________________________________________________________________________________________

# Train CNN

In [50]:
num_epochs = 3
batch_size = 34

In [51]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Test CNN

In [59]:
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [64]:
loss, acc = model.evaluate(x_test, y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 89.317507


In [61]:
predictions

array([[9.99802470e-01, 6.66129881e-06, 7.44650752e-05],
       [8.75429511e-01, 1.97670162e-02, 4.44633067e-02],
       [9.99549389e-01, 1.64196044e-05, 2.11119652e-04],
       ...,
       [9.33412790e-01, 9.58180428e-03, 1.86868310e-02],
       [9.97795105e-01, 1.37835741e-04, 5.60373068e-04],
       [6.64095402e-01, 1.02987856e-01, 1.50198162e-01]], dtype=float32)

In [53]:
labels = [1, 0, 2]

In [54]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [55]:
sum(data_test.Label==prediction_labels)/len(prediction_labels)

0.08902077151335312

In [56]:
data_test.Label.value_counts()

0    301
1     30
2      6
Name: Label, dtype: int64