# Data Analysis

## Import packages

In [90]:
import os

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split

import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.utils import plot_model
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint

## Define constants

In [3]:
MAX_VOCABULARY_NUM = 200000
WORD_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 100
LABEL_NUM = 5

## Read data

In [4]:
_df = pd.read_csv('data/train.csv')
_df.sample(10)

Unnamed: 0,review_id,review,rating
89972,89972,In production from 2016 to 2019. OLD BOX - PAP...,4
25785,25785,Mesh and sutures lepas😢,2
15755,15755,Cain lovely but small size .. for which I buy ...,2
80347,80347,Wonderful lady boss! Fast response and goods r...,4
121111,121111,Direct dipakee ni sand and can only be made us...,5
74364,74364,Good product quality Fast seller response Fast...,4
135580,135580,"Good, to order .. Always satisfying shopping i...",5
41644,41644,Although cheap .... luxury tpi,3
5108,5108,Product quality is poor,1
94175,94175,Pungent smell,4


In [5]:
df = _df.copy()

## Clean data initially

In [6]:
df.drop('review_id', axis=1, inplace=True)
df['review'] = df['review'].str.strip().str.lower()
df.sample(5)

Unnamed: 0,review,rating
6794,children shovel,1
31368,"seller friendly, hrga trjngkau..cm same thin m...",3
94018,successfully reloaded. tq !,4
9841,items same picture the fabric was loose in the...,1
133042,"the original product, sticker cutting hargq ch...",5


In [7]:
# replace the unicode space into space
df['review'] = df['review'].str.replace(u'\u200b', ' ')

In [8]:
# rating to index
df['rating'] = df['rating'] - 1
df[df['rating'] == 0].sample(10)

Unnamed: 0,review,rating
10683,"set of 7 no goods shop is right, why take writ...",0
7452,it's not a full glue 🙄,0
5410,it merits sew the zipper pull is not distorted...,0
6284,"delivery ,, old, morbidly knp through j & t aj...",0
14023,"i've often order these stores, fast delivery b...",0
6158,"not working,",0
10311,"fabric hot, thin, being ruffled.",0
4795,"fast delivery, good enough but turned out to b...",0
2436,"good materials appropriate pricing, thin pocke...",0
12504,now why some repack yes when orders that no re...,0


## Transform data and labels into machine-recognizable data

In [9]:
texts = df['review'].tolist()
labels = df['rating'].tolist()
print(texts[:5])
print(labels[:5])

['ga disappointed neat products .. meletot hilsnyaa speed   of delivery is good.', 'rdtanya replace broken glass, broken chargernya', 'nyesel bngt dsni shopping antecedent photo message pictures gk according foto.di existing collagen super fit nyampe holo my house open ehhh collagen contents even in the face pdahal jg description super existing collagen originalnya.pas writing my check lg in photo captions already ma the change ma pictures that the face.', 'sent a light blue suit goods ga want a refund', 'pendants came with dents and scratches on its surface. the coating looks like it will change colour quickly.']
[0, 0, 0, 0, 0]


In [10]:
text_arr = np.array(texts)
label_arr = np.array(labels)

In [11]:
# 隨機挑四萬筆出來訓練
random_index = np.random.choice([i for i in range(len(df))], 40000, replace=False)
train_texts = text_arr[random_index]
train_labels = label_arr[random_index]

In [12]:
train_texts = train_texts.tolist()
train_labels = train_labels.tolist()

In [13]:
tokenizer = Tokenizer(num_words=MAX_VOCABULARY_NUM)
tokenizer.fit_on_texts(train_texts)
word_sequences = tokenizer.texts_to_sequences(train_texts)

word_id_dict = tokenizer.word_index
print('Total vocabulary numbers: ', len(word_id_dict))

Total vocabulary numbers:  31378


In [14]:
train_texts[:10]

['pants belly than his rather wide, unlike the sample images. shop full delivery, packaging certainly',
 'with rubbing wear nakakhilo',
 'the quality of the product has good charts charger to be watched again that it will last for overall okay, good transportation service from ceaa.',
 'awesome speed of the ship awesome awesome quality merchandise value cp',
 'super pretty. shop also get free gifts. will support long-term shop. product packaging to make sure',
 'the product quality is not good. the product quality is not good. the product quality is not good.',
 'i have received the goods in accordance with the order, thanks',
 'delivery took longer time than we expected. delivery by ninjavan was good. both tables were good and stable. happy with the purchase. only the white table had some brown stains, as shown in the attached picture.',
 'thick material. am very satisfied. jd deserve subscriptions',
 'poor delivery speed seller unfavorable response']

In [15]:
# padding sequences by post method
word_sequences = pad_sequences(word_sequences, padding='post', maxlen=WORD_SEQUENCE_LENGTH)

train_labels = to_categorical(np.asarray(train_labels))
train_labels

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)

## Split train-test data

In [16]:
x_train, x_test, y_train, y_test = train_test_split(word_sequences, train_labels, test_size=0.2)
print(len(x_train))
print(len(x_test))

32000
8000


## Construct the embedding layer

In [17]:
embedding_dict = {}

with open('data/lib/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = coefs

f.close()

In [18]:
embedding_matrix = np.zeros((len(word_id_dict) + 1, EMBEDDING_DIM))

for word, i in word_id_dict.items():
    embedding_vec = embedding_dict.get(word)
    if embedding_vec is not None:
        embedding_matrix[i] = embedding_vec

In [19]:
embedding_layer = Embedding(input_dim=len(word_id_dict)+1, 
                           output_dim=EMBEDDING_DIM, 
                           weights=[embedding_matrix], 
                           input_length=MAX_VOCABULARY_NUM, 
                           trainable=False)

## Construct CNN model

In [23]:
# increase the filters amount and layers amount
# use adam optimizer(not yet)
# change the activation function of classify layer to sigmoid function(not yet)
seq_input = Input(shape=(WORD_SEQUENCE_LENGTH,), dtype='int32')
embedding_seq = embedding_layer(seq_input)

conv_layer1 = Conv1D(256, 5, activation='relu')(embedding_seq)
pool_layer1 = MaxPooling1D(3)(conv_layer1)

conv_layer2 = Conv1D(256, 5, activation='relu', padding='same')(pool_layer1)
pool_layer2 = MaxPooling1D(3)(conv_layer2)

conv_layer3 = Conv1D(256, 5, activation='relu', padding='same')(pool_layer2)
pool_layer3 = MaxPooling1D(3)(conv_layer3)

conv_layer4 = Conv1D(256, 5, activation='relu', padding='same')(pool_layer3)
pool_layer4 = GlobalMaxPooling1D()(conv_layer4)
drop_layer1 = Dropout(.1)(pool_layer4)

flatten_layer = Flatten()(drop_layer1)
dense_layer = Dense(256, activation='relu')(flatten_layer)
drop_layer2 = Dropout(.1)(dense_layer)

predict_layer = Dense(LABEL_NUM, activation='softmax')(drop_layer2)

model=Model(seq_input, predict_layer)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
cp = ModelCheckpoint('model/model_cnn_.hdf5',monitor='val_accuracy',verbose=1,save_best_only=True)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 1000)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1000, 100)         3137900   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 996, 256)          128256    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 332, 256)          0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 328, 256)          327936    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 109, 256)          0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 105, 256)          3279

In [24]:
history = model.fit(x_train, y_train, epochs=20, validation_data=(x_test, y_test), batch_size=1, callbacks=[cp])

Epoch 1/20
Epoch 00001: val_accuracy improved from -inf to 0.41275, saving model to model/model_cnn_.hdf5
Epoch 2/20
Epoch 00002: val_accuracy improved from 0.41275 to 0.42375, saving model to model/model_cnn_.hdf5
Epoch 3/20
Epoch 00003: val_accuracy did not improve from 0.42375
Epoch 4/20
Epoch 00004: val_accuracy did not improve from 0.42375
Epoch 5/20
Epoch 00005: val_accuracy did not improve from 0.42375
Epoch 6/20
Epoch 00006: val_accuracy did not improve from 0.42375
Epoch 7/20
Epoch 00007: val_accuracy did not improve from 0.42375
Epoch 8/20
Epoch 00008: val_accuracy did not improve from 0.42375
Epoch 9/20
Epoch 00009: val_accuracy did not improve from 0.42375
Epoch 10/20
Epoch 00010: val_accuracy improved from 0.42375 to 0.42463, saving model to model/model_cnn_.hdf5
Epoch 11/20
Epoch 00011: val_accuracy did not improve from 0.42463
Epoch 12/20
Epoch 00012: val_accuracy did not improve from 0.42463
Epoch 13/20
Epoch 00013: val_accuracy did not improve from 0.42463
Epoch 14/20


In [89]:
h_accuracy = history.history['accuracy']
h_val_accuracy = history.history['val_accuracy']
h_loss = history.history['loss']
h_val_loss = history.history['val_loss']

fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(y=h_accuracy, mode='lines+markers', name='accuracy', line=dict(color='skyblue')),
              row=1, col=1)
fig.add_trace(go.Scatter(y=h_val_accuracy, mode='lines+markers', name='validation accuracy', line=dict(color='dodgerblue')),
              row=1, col=1)

fig.add_trace(go.Scatter(y=h_loss, mode='lines+markers',name='loss', line=dict(color='lightsalmon')),
              row=1, col=2)
fig.add_trace(go.Scatter(y=h_val_loss, mode='lines+markers', name='validation loss', line=dict(color='tomato')),
              row=1, col=2)

fig.update_xaxes(title_text='Epochs', row=1, col=1)
fig.update_xaxes(title_text='Epochs', row=1, col=2)
fig.update_yaxes(title_text='Accuracy', row=1, col=1)
fig.update_yaxes(title_text='Loss', row=1, col=2)

fig.update_layout(title='Model Performation', height=480, width=1080)

## Data prediction
I don't know why the test data on kaggle is different with the one on google drive

In [114]:
testdf = pd.read_csv('data/kaggle/test.csv')
testdf.head()

Unnamed: 0,review_id,review
0,1,"Great danger, cool, motif and cantik2 jg model..."
1,2,One of the shades don't fit well
2,3,Very comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. ...
4,5,it's sooooo cute! i like playing with the glit...


In [115]:
len(testdf)

60427

In [116]:
test = testdf['review'].tolist()
test_seq = tokenizer.texts_to_sequences(test)
test_seq = pad_sequences(test_seq, padding='post', maxlen=WORD_SEQUENCE_LENGTH)

In [117]:
pred = model.predict(test_seq)

In [118]:
pred

array([[2.9733109e-03, 6.7029811e-02, 8.3005583e-01, 6.2849827e-02,
        3.7091237e-02],
       [1.8666623e-02, 6.8406999e-02, 4.5586318e-01, 2.2512704e-01,
        2.3193623e-01],
       [1.7301171e-04, 1.5483038e-03, 7.0666239e-02, 4.4260991e-01,
        4.8500249e-01],
       ...,
       [5.0930335e-06, 1.0721830e-04, 1.8230438e-02, 4.7407785e-01,
        5.0757945e-01],
       [3.1886199e-03, 1.4256435e-02, 2.1988235e-01, 3.5712633e-01,
        4.0554625e-01],
       [1.1344216e-02, 4.6329729e-02, 4.0691611e-01, 2.6126617e-01,
        2.7414382e-01]], dtype=float32)

In [119]:
classes = np.argmax(pred, axis=1)
classes = classes + 1
submission = testdf.drop('review', axis=1)
submission['rating']=classes
submission.head()

Unnamed: 0,review_id,rating
0,1,3
1,2,3
2,3,5
3,4,3
4,5,5


In [120]:
print('===========Description===========\n', submission.describe(), '\n')
print('rating 1: ', submission[submission['rating'] == 1].rating.count())
print('rating 2: ', submission[submission['rating'] == 2].rating.count())
print('rating 3: ', submission[submission['rating'] == 3].rating.count())
print('rating 4: ', submission[submission['rating'] == 4].rating.count())
print('rating 5: ', submission[submission['rating'] == 5].rating.count())

           review_id        rating
count  60427.000000  60427.000000
mean   30214.000000      3.739554
std    17443.916695      1.250976
min        1.000000      1.000000
25%    15107.500000      3.000000
50%    30214.000000      3.000000
75%    45320.500000      5.000000
max    60427.000000      5.000000 

rating 1:  4535
rating 2:  728
rating 3:  27806
rating 4:  229
rating 5:  27129


In [121]:
submission.to_csv('submission/submission_00.csv', index=False)