# Data Analysis

## Import packages

In [1]:
import os

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split

import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.utils import plot_model
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Embedding, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint

## Define constants

In [2]:
MAX_VOCABULARY_NUM = 100000
WORD_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 100
LABEL_NUM = 5

## Read data

In [3]:
cleanedsrc = 'data/cleaned_data/'
src = 'data/train.csv'

In [4]:
_df = pd.read_csv(cleanedsrc+'trainbest.csv')
_df.sample(10)

Unnamed: 0,review_id,review,rating
70718,70718,yesterday afternoon set the next day i receive...,4
58128,58128,bubble wrapped but wrapping inside the box its...,3
20951,20951,less bit precision,2
90484,90484,great product,4
54263,54263,dvm kak orderr again tomorrow good stuff cheap...,3
98079,98079,sy recommend a good shop and fast response,4
43867,43867,damaged one card holder,3
104144,104144,good product quality good value for money fast...,4
142746,142746,good product thank god it did not disappoint a...,5
139361,139361,awesome speed of the ship awesome awesome qual...,5


In [18]:
df = _df.copy()
df.review = df.review.astype(str)

## Clean data initially

In [19]:
df.drop('review_id', axis=1, inplace=True)
df['review'] = df['review'].str.strip().str.lower()
df.sample(5)

Unnamed: 0,review,rating
14545,dituker guns could be because the word cs cott...,1
127945,items all packed well with plastic courier bag...,5
106319,good product good service is very good,5
100698,easy assembly life idiot can themselves thumbs...,4
111394,nice briefcase funny purple color just a littl...,5


In [20]:
# replace the unicode space into space
df['review'] = df['review'].str.replace(u'\u200b', ' ')

In [21]:
# rating to index
df['rating'] = df['rating'] - 1
df[df['rating'] == 0].sample(10)

Unnamed: 0,review,rating
14145,i ordered 10 pcs but the product i recieved is...,0
536,but do not send send bake cheese creamy2,0
9082,speed of delivery is good,0
11901,save a little love egk accordance with its ant...,0
6185,p until later klu sdh stuff until just love ra...,0
13292,,0
6291,delivery of goods for too long,0
5078,told guaranteed ori bottle dirty segel hoaks e...,0
3653,disharmonious desires,0
6616,invalid code na yung 1 star vouchers so just w...,0


## Transform data and labels into machine-recognizable data

In [22]:
texts = df['review'].tolist()
labels = df['rating'].tolist()
print(texts[:5])
print(labels[:5])

['ga disappointed neat products meletot hilsnyaa speed of delivery is good', 'rdtanya replace broken glass broken chargernya', 'nyesel bngt dsni shopping antecedent photo message pictures gk according fotodi existing collagen super fit nyampe holo my house open ehh collagen contents even in the face pdahal jg description super existing collagen writing my check lg in photo captions already ma the change ma pictures that the face', 'sent a light blue suit goods ga want a refund', 'pendants came with dents and scratches on its surface the coating looks like it will change colour quickly']
[0, 0, 0, 0, 0]


## Random partial data selection
<span style="color:#FF0000"><i class="fa fa-exclamation-circle"></i>
 To fit the whole data, please skip this cell</span>

In [10]:
text_arr = np.array(train_texts)
label_arr = np.array(train_labels)

# 隨機挑4萬筆出來訓練
random_index = np.random.choice([i for i in range(len(df))], 40000, replace=False)
train_texts = text_arr[random_index].tolist()
train_labels = label_arr[random_index].tolist()

NameError: name 'train_texts' is not defined

## Data processing
將資料處理成電腦能識別的資料型態

In [23]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_sequences = tokenizer.texts_to_sequences(texts)

word_id_dict = tokenizer.word_index
print('Total vocabulary numbers: ', len(word_id_dict))
seq_maxlen = max([len(ws) for ws in word_sequences])
print('Max sequence length: ', seq_maxlen)

Total vocabulary numbers:  51528
Max sequence length:  470


In [25]:
# padding sequences by post method
word_sequences = pad_sequences(word_sequences, padding='post', maxlen=400)
labels = to_categorical(np.asarray(labels))

## Split train-test data

In [26]:
x_train, x_test, y_train, y_test = train_test_split(word_sequences, labels, test_size=0.2)
print(len(x_train))
print(len(x_test))

117448
29363


## Construct the embedding layer

In [27]:
embedding_dict = {}

with open('data/lib/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = coefs

f.close()

In [28]:
unknown_words = []
embedding_matrix = np.zeros((len(word_id_dict) + 1, EMBEDDING_DIM))

for word, i in word_id_dict.items():
    embedding_vec = embedding_dict.get(word)
    if embedding_vec is not None:
        embedding_matrix[i] = embedding_vec
    else:
        unknown_words.append(word)

print('There are totally %d unknown words in data.' % len(unknown_words))

There are totally 30371 unknown words in data.


## Customize our word vector

In [None]:
unknown_words

In [None]:
from gensim.models import Word2Vec
import multiprocessing

word2vec_model = Word2Vec(sentences=unknown_words, 
                 sg=1, 
                 size=100, 
                 min_count=1,
                 workers=multiprocessing.cpu_count())

In [None]:
len(word2vec_model.wv.vocab.keys())

可以看到只有1116個未知詞彙嵌入了個vector  
目前還在解決大量未知詞彙無法嵌入vector的問題，只好把這些未知詞彙跟著模型一起訓練

In [None]:
embedding_layer = Embedding(input_dim=len(word_id_dict)+1, 
                           output_dim=EMBEDDING_DIM, 
                           weights=[embedding_matrix], 
                           input_length=MAX_VOCABULARY_NUM, 
                           trainable=True)

## Construct CNN model

In [None]:
# increase the filters amount and layers amount
# use adam optimizer
# change the activation function of classify layer to sigmoid function
seq_input = Input(shape=(400,), dtype='int32')
embedding_seq = embedding_layer(seq_input)

conv_layer1 = Conv1D(256, 5, activation='relu')(embedding_seq)
pool_layer1 = MaxPooling1D(pool_size=3, strides=2)(conv_layer1)
normal_layer = BatchNormalization(axis=1, epsilon=0.0001)(pool_layer1)

conv_layer2 = Conv1D(256, 5, activation='relu')(normal_layer)
pool_layer2 = MaxPooling1D(pool_size=3, strides=2)(conv_layer2)

conv_layer3 = Conv1D(256, 5, activation='relu')(pool_layer2)
pool_layer3 = MaxPooling1D(pool_size=3, strides=2)(conv_layer3)

conv_layer4 = Conv1D(256, 5, activation='relu')(pool_layer3)
pool_layer4 = MaxPooling1D(pool_size=3, strides=1)(conv_layer4)

conv_layer5 = Conv1D(256, 5, activation='relu')(pool_layer4)
gpool_layer = GlobalMaxPooling1D()(conv_layer5)

drop_layer1 = Dropout(.1)(gpool_layer)

flatten_layer = Flatten()(drop_layer1)
dense_layer = Dense(256, activation='relu')(flatten_layer)
drop_layer2 = Dropout(.1)(dense_layer)
predict_layer = Dense(LABEL_NUM, activation='softmax')(drop_layer2)

model=Model(seq_input, predict_layer)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
cp = ModelCheckpoint('model/model_cnn_test.hdf5',monitor='val_accuracy',verbose=1,save_best_only=True)

In [None]:
history = model.fit(x_train, y_train, epochs=15, validation_data=(x_test, y_test), batch_size=1, callbacks=[cp])

In [None]:
h_accuracy = history.history['accuracy']
h_val_accuracy = history.history['val_accuracy']
h_loss = history.history['loss']
h_val_loss = history.history['val_loss']

fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(y=h_accuracy, mode='lines+markers', name='accuracy', line=dict(color='skyblue')),
              row=1, col=1)
fig.add_trace(go.Scatter(y=h_val_accuracy, mode='lines+markers', name='validation accuracy', line=dict(color='dodgerblue')),
              row=1, col=1)

fig.add_trace(go.Scatter(y=h_loss, mode='lines+markers',name='loss', line=dict(color='lightsalmon')),
              row=1, col=2)
fig.add_trace(go.Scatter(y=h_val_loss, mode='lines+markers', name='validation loss', line=dict(color='tomato')),
              row=1, col=2)

fig.update_xaxes(title_text='Epochs', row=1, col=1)
fig.update_xaxes(title_text='Epochs', row=1, col=2)
fig.update_yaxes(title_text='Accuracy', row=1, col=1)
fig.update_yaxes(title_text='Loss', row=1, col=2)

fig.update_layout(title='Model Performation', height=480, width=1080)

## Data prediction
I don't know why the test data on kaggle is different with the one on google drive

In [None]:
testdf = pd.read_csv('data/test.csv')
testdf.head()

In [None]:
len(testdf)

In [None]:
test = testdf['review'].tolist()
test_seq = tokenizer.texts_to_sequences(test)
test_seq = pad_sequences(test_seq, padding='post', maxlen=WORD_SEQUENCE_LENGTH)

In [None]:
pred = model.predict(test_seq)

In [None]:
pred

In [None]:
classes = np.argmax(pred, axis=1)
classes = classes + 1
submission = testdf.drop('review', axis=1)
submission['rating']=classes
submission.head()

In [None]:
print('===========Description===========\n', submission.describe(), '\n')
print('rating 1: ', submission[submission['rating'] == 1].rating.count())
print('rating 2: ', submission[submission['rating'] == 2].rating.count())
print('rating 3: ', submission[submission['rating'] == 3].rating.count())
print('rating 4: ', submission[submission['rating'] == 4].rating.count())
print('rating 5: ', submission[submission['rating'] == 5].rating.count())

In [None]:
submission.to_csv('submission/submission_00.csv', index=False)