# Data Analysis

## Import packages

In [19]:
import os
os.environ['KERAS_BACKEND']='theano'

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
plt.switch_backend('agg')

from sklearn.model_selection import train_test_split

import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint

## Define constants

In [2]:
# Upper bound handed to the tokenizer as `num_words`.
MAX_VOCABULARY_NUM = 200_000
# Length each encoded review is padded to; also the width of the model input.
WORD_SEQUENCE_LENGTH = 1_000
# Size of one word vector (the embedding file read later is the 100d GloVe set).
EMBEDDING_DIM = 100
# Number of rating classes predicted by the softmax output layer.
LABEL_NUM = 5

## Read data

In [3]:
# Raw training reviews as read from disk; the following cells work on a copy named `df`.
_df = pd.read_csv(filepath_or_buffer='data/train.csv')
# Show ten random rows (unseeded, so the selection changes between runs).
_df.sample(n=10)

Unnamed: 0,review_id,review,rating
136916,136916,The product quality is excellent. The origina...,5
118543,118543,Product has been successfully Pack of well The...,5
23284,23284,Request a pink tissue d love blue,2
52737,52737,"It floated its great length, drifted a little....",3
69277,69277,God bless you really like it though but not lo...,4
25599,25599,"Delivery is slow, but some fabrics wear well.",2
69990,69990,The product quality is excellent. The product...,4
34991,34991,1. Good product quality 2. Cheaper than any ot...,3
97658,97658,Goods has arrived. Beautiful color. Value for ...,4
27032,27032,Product quality standards. The price of the s...,2


In [4]:
# Take a deep copy so that edits made to `df` do not touch the raw `_df` frame.
df = _df.copy(deep=True)

## Initial data cleaning

In [5]:
# Drop the identifier column, which no later cell reads. Re-binding `df`
# (rather than `inplace=True`) with `errors='ignore'` keeps the cell re-runnable:
# executing it again after the column is gone is a no-op instead of a KeyError.
df = df.drop(columns='review_id', errors='ignore')
# Trim surrounding whitespace and lower-case the text before tokenisation.
df['review'] = df['review'].str.strip().str.lower()
df.sample(5)

Unnamed: 0,review,rating
117155,thank you..super love it.. excellent quality v...,5
139167,products as needed fast delivery schedule cour...,5
54741,delivery cepat..produk ok,3
108266,awesome speed of the ship awesome speed of the...,5
121437,wrong size sacrifice. bored!,5


In [6]:
# rating to index
# Shift the ratings down by one so the lowest rating becomes class index 0.
# The value is computed from the raw `_df` column (index-aligned with `df`)
# instead of from `df['rating']` itself, so re-running this cell cannot
# subtract 1 a second time and silently corrupt the labels.
df['rating'] = _df['rating'] - 1
# Spot-check ten reviews from class 0.
df[df['rating'] == 0].sample(10)

Unnamed: 0,review,rating
13340,too little kuantitinya.tidak worth the price. ...,0
9438,bad bad bad cp value of goods does not match t...,0
12537,the price was not in favor of it. let's go buy...,0
1211,goods not received. 3pc message only sent two....,0
4412,yg wooooy bener kalo jualan fork same order sc...,0
11627,not always in accordance with the stock. admin...,0
5677,unsuitable no,0
3802,the light button does not function....,0
9647,brgnya unused fitting krn,0
7831,"its products 4. 2 for sends. buy 22, he sent a...",0


## Transform texts and labels into numeric model inputs

In [7]:
# Extract the review strings and their class indices as plain Python lists.
texts = df['review'].tolist()
labels = df['rating'].tolist()
# Print the first five entries of each list as a quick sanity check.
print(texts[:5])
print(labels[:5])

['ga disappointed neat products .. meletot hilsnyaa speed \u200b\u200bof delivery is good.', 'rdtanya replace broken glass, broken chargernya', 'nyesel bngt dsni shopping antecedent photo message pictures gk according foto.di existing collagen super fit nyampe holo my house open ehhh collagen contents even in the face pdahal jg description super existing collagen originalnya.pas writing my check lg in photo captions already ma the change ma pictures that the face.', 'sent a light blue suit goods ga want a refund', 'pendants came with dents and scratches on its surface. the coating looks like it will change colour quickly.']
[0, 0, 0, 0, 0]


In [8]:
# Fit a Keras tokenizer on every review; `num_words` is capped by MAX_VOCABULARY_NUM.
tokenizer = Tokenizer(num_words=MAX_VOCABULARY_NUM)
tokenizer.fit_on_texts(texts)

# word -> integer id mapping learned by the tokenizer.
word_id_dict = tokenizer.word_index
# Encode each review as a sequence of those ids.
word_sequences = tokenizer.texts_to_sequences(texts)

print('Total vocabulary numbers: ', len(word_id_dict))

Total vocabulary numbers:  72284


In [9]:
# Display the first ten cleaned reviews.
texts[0:10]

['ga disappointed neat products .. meletot hilsnyaa speed \u200b\u200bof delivery is good.',
 'rdtanya replace broken glass, broken chargernya',
 'nyesel bngt dsni shopping antecedent photo message pictures gk according foto.di existing collagen super fit nyampe holo my house open ehhh collagen contents even in the face pdahal jg description super existing collagen originalnya.pas writing my check lg in photo captions already ma the change ma pictures that the face.',
 'sent a light blue suit goods ga want a refund',
 'pendants came with dents and scratches on its surface. the coating looks like it will change colour quickly.',
 'dg yg depending being sent in photos',
 'hours not a hologram',
 'shop fraudulent business. we put two lamps, one shoe, one nozzle, one wallet, one of gardening mini, 2 bags of fragrant. the total amount including the 144.000d ship. after receipt of goods received only one shoe, one garden mini, 1 straw. still collect enough money but enough goods k. deceptive

In [10]:
# Bring every id sequence to exactly WORD_SEQUENCE_LENGTH entries so the
# reviews stack into one 2-D array for the model input.
word_sequences = pad_sequences(word_sequences, maxlen=WORD_SEQUENCE_LENGTH)

# One-hot encode the class indices.
# - Read them from `df['rating']` rather than from `labels`: this cell rebinds
#   `labels`, so re-running it would otherwise one-hot encode an already
#   one-hot matrix and produce a wrongly shaped array.
# - Pin `num_classes` to LABEL_NUM so the label width always matches the
#   softmax layer, even if some class were absent from the data.
labels = to_categorical(np.asarray(df['rating']), num_classes=LABEL_NUM)
labels

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

## Split train-test data

In [11]:
# Hold out 20% of the samples. `random_state` is fixed so that a fresh
# "Restart & Run All" reproduces the same split (and therefore comparable
# metrics); 0 mirrors the seed passed to tf.random.set_seed in the model cell.
x_train, x_test, y_train, y_test = train_test_split(
    word_sequences,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)
print(len(x_train))
print(len(x_test))

117448
29363


## Construct the embedding layer

In [12]:
# token -> pre-trained vector, filled from the GloVe text file.
embedding_dict = {}

# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed afterwards.
with open('data/lib/glove.6B.100d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        # Each line holds a token followed by its whitespace-separated coefficients.
        token, *components = line.split()
        embedding_dict[token] = np.asarray(components, dtype='float32')

In [13]:
# One row per tokenizer id plus one extra row, since the matrix is indexed
# directly by the ids stored in `word_id_dict`. Rows stay all-zero for words
# that have no entry in `embedding_dict`.
embedding_matrix = np.zeros((1 + len(word_id_dict), EMBEDDING_DIM))

for token, row in word_id_dict.items():
    vector = embedding_dict.get(token)
    if vector is None:
        # No pre-trained vector for this word: keep the zero row.
        continue
    embedding_matrix[row] = vector

In [14]:
# Embedding lookup initialised from the GloVe-based matrix and fine-tuned during training.
# `input_length` is the number of tokens per sample, i.e. the padded sequence
# length fed to the model (WORD_SEQUENCE_LENGTH) -- not the vocabulary cap
# MAX_VOCABULARY_NUM, which does not match the (None, 1000) model input.
embedding_layer = Embedding(input_dim=len(word_id_dict) + 1,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=WORD_SEQUENCE_LENGTH,
                            trainable=True)

## Construct CNN model

In [21]:
# Seed TensorFlow *before* any layer is created. Layers are built as soon as
# they are called on a tensor, so seeding halfway through the cell would leave
# the convolution and first dense weights unseeded.
tf.random.set_seed(0)

# Input: one padded sequence of word ids per review.
seq_input = Input(shape=(WORD_SEQUENCE_LENGTH,), dtype='int32')
embedding_seq = embedding_layer(seq_input)

# Three Conv1D (128 filters, kernel width 5) + max-pooling stages.
conv_layer1 = Conv1D(128, 5, activation='relu')(embedding_seq)
pool_layer1 = MaxPooling1D(5)(conv_layer1)
conv_layer2 = Conv1D(128, 5, activation='relu')(pool_layer1)
pool_layer2 = MaxPooling1D(5)(conv_layer2)
conv_layer3 = Conv1D(128, 5, activation='relu')(pool_layer2)
# The third conv output has 35 time steps (see model.summary()), so a pool
# size of 35 collapses the whole remaining sequence into a single step.
pool_layer3 = MaxPooling1D(35)(conv_layer3)

# Classifier head: flatten -> dense -> dropout -> softmax over the LABEL_NUM classes.
flatten_layer = Flatten()(pool_layer3)
dense_layer = Dense(128, activation='relu')(flatten_layer)
drop_layer1 = Dropout(.2)(dense_layer)
predict_layer = Dense(LABEL_NUM, activation='softmax')(drop_layer1)

model=Model(seq_input, predict_layer)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()
# Keep only the weights of the epoch with the best validation accuracy.
cp = ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 1000)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1000, 100)         7228500   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 199, 128)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 39, 128)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 35, 128)           8204

In [None]:
# Train for five epochs, checkpointing through `cp`.
# NOTE(review): batch_size=2 means 58724 steps per epoch on this split (see the
# progress log below) -- confirm this is intentional.
# NOTE(review): x_test/y_test double as the validation set that drives
# checkpoint selection, so they are not an untouched hold-out.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=2,
    epochs=5,
    callbacks=[cp],
    validation_data=(x_test, y_test),
)

Epoch 1/5
Epoch 00001: val_acc improved from -inf to 0.26101, saving model to model_cnn.hdf5
Epoch 2/5
 6275/58724 [==>...........................] - ETA: 1:00:15 - loss: 1.5583 - acc: 0.3067