# Data Analysis

## Import packages

In [1]:
import os

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.utils import plot_model
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Embedding, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint

## Define constants

In [2]:
MAX_VOCABULARY_NUM = 62000
INPUT_SEQUENCE_LENGTH = 1600
EMBEDDING_DIM = 100
LABEL_NUM = 3

## Read data

In [3]:
cleanedsrc = 'data/cleaned_data/'
src = 'data/train.csv'

In [4]:
_df = pd.read_csv(f'{cleanedsrc}train_sol2415.csv')
_df.sample(10)

Unnamed: 0,review_id,review,rating
137165,137165,very good value for money like most districts ...,5
30633,30633,good product quality product quality good pric...,3
118020,118020,packaging containers once with bubble wrap fas...,5
143317,143317,good product quality price muraahh cepeett del...,5
74709,74709,the product quality is excellent the original ...,4
33017,33017,fairly cheap though photocopying delivery is a...,3
144240,144240,fall bosko,5
19146,19146,order now 9 baht products to meet the demand o...,2
124528,124528,product quality mah ga ya need to be asked aga...,5
80780,80780,alhamdulillah already the dam until the goods ...,4


In [5]:
df = _df.copy()
df.review = df.review.astype(str)

## Transform data and labels into machine-recognizable data

In [6]:
# rating to index
df['label'] = [0]*len(df)
df.loc[df[df['rating'] == 4].index, 'label'] = 1
df.loc[df[df['rating'] == 5].index, 'label'] = 2
df

Unnamed: 0,review_id,review,rating,label
0,0,ga disappointed neat products meletot hilsnyaa...,1,0
1,1,rdtanya replace broken glass broken chargernya,1,0
2,2,nyesel bngt dsni shopping antecedent photo mes...,1,0
3,3,sent a light blue suit goods ga want a refund,1,0
4,4,pendants came with dents and scratches on its ...,1,0
...,...,...,...,...
146806,146806,excellent product quality delivery speed is ve...,5,2
146807,146807,thanks gan,5,2
146808,146808,awesome awesome quality merchandise value cp v...,5,2
146809,146809,nice packing boxes made effective price fast s...,5,2


In [7]:
texts = df['review'].tolist()
labels = df['label'].tolist()
print(texts[:5])
print(labels[:5])

['ga disappointed neat products meletot hilsnyaa speed of delivery is good', 'rdtanya replace broken glass broken chargernya', 'nyesel bngt dsni shopping antecedent photo message pictures gk according fotodi existing collagen super fit nyampe holo my house open ehh collagen contents even in the face pdahal jg description super existing collagen originalnyapas writing my check lg in photo captions already ma the change ma pictures that the face', 'sent a light blue suit goods ga want a refund', 'pendants came with dents and scratches on its surface the coating looks like it will change colour quickly']
[0, 0, 0, 0, 0]


## Random partial data selection
<span style="color:#FF0000"><i class="fa fa-exclamation-circle"></i>
 To fit the whole data, please skip this cell</span>

In [8]:
text_arr = np.array(train_texts)
label_arr = np.array(train_labels)

# 隨機挑4萬筆出來訓練
random_index = np.random.choice([i for i in range(len(df))], 40000, replace=False)
train_texts = text_arr[random_index].tolist()
train_labels = label_arr[random_index].tolist()

NameError: name 'train_texts' is not defined

## Data processing
將資料處理成電腦能識別的資料型態

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_sequences = tokenizer.texts_to_sequences(texts)

word_id_dict = tokenizer.word_index
print('Total vocabulary numbers: ', len(word_id_dict))
seq_maxlen = max([len(ws) for ws in word_sequences])
print('Max sequence length: ', seq_maxlen)

Total vocabulary numbers:  61174
Max sequence length:  1541


In [10]:
# padding sequences by post method
word_sequences = pad_sequences(word_sequences, padding='post', maxlen=INPUT_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))

## Split train-test data

In [11]:
x_train, x_test, y_train, y_test = train_test_split(word_sequences, labels, test_size=0.2)
print(len(x_train))
print(len(x_test))

117448
29363


In [23]:
y_train

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)

## Construct the embedding layer

In [13]:
embedding_dict = {}

with open('data/lib/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = coefs

f.close()

In [14]:
unknown_words = []
embedding_matrix = np.zeros((len(word_id_dict) + 1, EMBEDDING_DIM))

for word, i in word_id_dict.items():
    embedding_vec = embedding_dict.get(word)
    if embedding_vec is not None:
        embedding_matrix[i] = embedding_vec
    else:
        unknown_words.append(word)

print('There are totally %d unknown words in data.' % len(unknown_words))

There are totally 38338 unknown words in data.


## Customize our word vector

In [15]:
unknown_words

['cepet',
 'alhamdulillah',
 'shopee',
 'reallyt',
 'udh',
 'pesen',
 'sukaa',
 'nyampe',
 'recomended',
 'packingnya',
 'tebel',
 'dateng',
 'dapet',
 'krn',
 'okee',
 'nyesel',
 'mksh',
 'mantapp',
 'trimakasih',
 'baguuss',
 'smoga',
 'kirain',
 'pokonya',
 'mantul',
 'mantull',
 'bgtt',
 'puass',
 'sellernya',
 'pdhl',
 'baikk',
 'smpe',
 'makasihh',
 'cepatt',
 'bngt',
 'wrna',
 'mantab',
 'nyaa',
 'smpai',
 'realpict',
 'tetep',
 'hrga',
 'bangeett',
 'baguus',
 'mudah2an',
 'sempet',
 'dehh',
 'lamaa',
 'okp',
 'lagii',
 'jga',
 'rekomended',
 'brang',
 'sekalii',
 'bangeet',
 'doang',
 'sampenya',
 'dlu',
 'lahh',
 'facepalming',
 'syukaa',
 'packingan',
 'nerawang',
 'smua',
 'krna',
 'nyampenya',
 'hehehe',
 'sukak',
 'terimakasihh',
 'jugaa',
 'brng',
 'ceaa',
 'mksih',
 'lucuu',
 'dsni',
 'gapapa',
 'jgn',
 'jnt',
 'murahh',
 'terbaikk',
 'aamiin',
 'sampaii',
 'bener2',
 'huhu',
 'pke',
 'thankyouu',
 'syuka',
 'sist',
 'gaada',
 'youu',
 'blnja',
 'sihh',
 'packagingnya',

In [16]:
from gensim.models import Word2Vec
import multiprocessing

word2vec_model = Word2Vec(sentences=unknown_words, 
                 sg=1, 
                 size=100, 
                 min_count=1,
                 workers=multiprocessing.cpu_count())

In [17]:
len(word2vec_model.wv.vocab.keys())

358

可以看到只有1116個未知詞彙嵌入了個vector  
目前還在解決大量未知詞彙無法嵌入vector的問題，只好把這些未知詞彙跟著模型一起訓練

In [18]:
embedding_layer = Embedding(input_dim=len(word_id_dict)+1, 
                           output_dim=EMBEDDING_DIM, 
                           weights=[embedding_matrix], 
                           input_length=MAX_VOCABULARY_NUM, 
                           trainable=True)

## Construct CNN model

In [21]:
# increase the filters amount and layers amount
# use adam optimizer
# change the activation function of classify layer to sigmoid function
seq_input = Input(shape=(INPUT_SEQUENCE_LENGTH,), dtype='int32')
embedding_seq = embedding_layer(seq_input)

conv_layer1 = Conv1D(256, 5, activation='relu')(embedding_seq)
pool_layer1 = MaxPooling1D(pool_size=3, strides=2)(conv_layer1)
normal_layer = BatchNormalization(axis=1, epsilon=0.0001)(pool_layer1)

conv_layer2 = Conv1D(256, 5, activation='relu')(normal_layer)
pool_layer2 = MaxPooling1D(pool_size=3, strides=2)(conv_layer2)

conv_layer3 = Conv1D(256, 5, activation='relu')(pool_layer2)
pool_layer3 = MaxPooling1D(pool_size=3, strides=2)(conv_layer3)

conv_layer4 = Conv1D(256, 5, activation='relu')(pool_layer3)
pool_layer4 = MaxPooling1D(pool_size=3)(conv_layer4)

conv_layer5 = Conv1D(256, 5, activation='relu')(pool_layer4)
gpool_layer = GlobalMaxPooling1D()(conv_layer5)

drop_layer1 = Dropout(.1)(gpool_layer)

flatten_layer = Flatten()(drop_layer1)
dense_layer = Dense(256, activation='relu')(flatten_layer)
drop_layer2 = Dropout(.1)(dense_layer)
predict_layer = Dense(LABEL_NUM, activation='softmax')(drop_layer2)

model=Model(seq_input, predict_layer)
model.compile(optimizer=keras.optimizers.Adam(lr=0.002),
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[keras.metrics.SparseCategoricalAccuracy(name="accuracy")])
model.summary()
cp = ModelCheckpoint('model/model_cnn_test.hdf5',monitor='val_accuracy',verbose=1,save_best_only=True)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1600)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1600, 100)         6117500   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1596, 256)         128256    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 797, 256)          0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 797, 256)          3188      
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 793, 256)          327936    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 396, 256)          0   

In [30]:
y_train

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)

In [24]:
history = model.fit(x_train, y_train, epochs=15, validation_data=(x_test, y_test), batch_size=1, callbacks=[cp])

Epoch 1/15


ValueError: in user code:

    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\keras\engine\training.py:533 train_step  **
        y, y_pred, sample_weight, regularization_losses=self.losses)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:205 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\keras\losses.py:143 __call__
        losses = self.call(y_true, y_pred)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\keras\losses.py:246 call
        return self.fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\keras\losses.py:1558 sparse_categorical_crossentropy
        y_true, y_pred, from_logits=from_logits, axis=axis)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\keras\backend.py:4655 sparse_categorical_crossentropy
        labels=target, logits=output)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\ops\nn_ops.py:3591 sparse_softmax_cross_entropy_with_logits_v2
        labels=labels, logits=logits, name=name)
    C:\Users\pb580\anaconda3\envs\ML_Env\lib\site-packages\tensorflow\python\ops\nn_ops.py:3507 sparse_softmax_cross_entropy_with_logits
        logits.get_shape()))

    ValueError: Shape mismatch: The shape of labels (received (3,)) should equal the shape of logits except for the last dimension (received (1, 3)).


In [None]:
h_accuracy = history.history['accuracy']
h_val_accuracy = history.history['val_accuracy']
h_loss = history.history['loss']
h_val_loss = history.history['val_loss']

fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(y=h_accuracy, mode='lines+markers', name='accuracy', line=dict(color='skyblue')),
              row=1, col=1)
fig.add_trace(go.Scatter(y=h_val_accuracy, mode='lines+markers', name='validation accuracy', line=dict(color='dodgerblue')),
              row=1, col=1)

fig.add_trace(go.Scatter(y=h_loss, mode='lines+markers',name='loss', line=dict(color='lightsalmon')),
              row=1, col=2)
fig.add_trace(go.Scatter(y=h_val_loss, mode='lines+markers', name='validation loss', line=dict(color='tomato')),
              row=1, col=2)

fig.update_xaxes(title_text='Epochs', row=1, col=1)
fig.update_xaxes(title_text='Epochs', row=1, col=2)
fig.update_yaxes(title_text='Accuracy', row=1, col=1)
fig.update_yaxes(title_text='Loss', row=1, col=2)

fig.update_layout(title='Model Performation', height=480, width=1080)

## Data prediction
I don't know why the test data on kaggle is different with the one on google drive

In [None]:
testdf = pd.read_csv('data/test.csv')
testdf.head()

In [None]:
len(testdf)

In [None]:
test = testdf['review'].tolist()
test_seq = tokenizer.texts_to_sequences(test)
test_seq = pad_sequences(test_seq, padding='post', maxlen=WORD_SEQUENCE_LENGTH)

In [None]:
pred = model.predict(test_seq)

In [None]:
pred

In [None]:
classes = np.argmax(pred, axis=1)
classes = classes + 1
submission = testdf.drop('review', axis=1)
submission['rating']=classes
submission.head()

In [None]:
print('===========Description===========\n', submission.describe(), '\n')
print('rating 1: ', submission[submission['rating'] == 1].rating.count())
print('rating 2: ', submission[submission['rating'] == 2].rating.count())
print('rating 3: ', submission[submission['rating'] == 3].rating.count())
print('rating 4: ', submission[submission['rating'] == 4].rating.count())
print('rating 5: ', submission[submission['rating'] == 5].rating.count())

In [None]:
submission.to_csv('submission/submission_00.csv', index=False)