# Data Analysis

## Import packages

In [74]:
import os
os.environ['KERAS_BACKEND']='theano'

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
plt.switch_backend('agg')

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint

## Define constants

In [39]:
MAX_VOCABULARY_NUM = 200000
WORD_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 100
LABEL_NUM = 5

## Read data

In [82]:
_df = pd.read_csv('data/train.csv')
_df.sample(10)

Unnamed: 0,review_id,review,rating
57140,57140,You gave me the wrong set of pens instead of P...,3
91833,91833,"Praise has come ... paid results to plant, alt...",4
133399,133399,Awesome speed of the ship awesome awesome qua...,5
105723,105723,Recomended really .... 😍 Bakal subscription jdi,5
15022,15022,The product packs bags Fast delivery Quality ...,2
23664,23664,Product quality is ok Service by seller is ok,2
135281,135281,Very good value Product quality is very good,5
105994,105994,Was hoping it would be tempered glass but it w...,5
101620,101620,Legit...thank you seller...will order again,4
139788,139788,"Product quality fabrics, thick very cheap, jus...",5


In [83]:
df = _df.copy()

## Clean data initially

In [84]:
df.drop('review_id', axis=1, inplace=True)
df['review'] = df['review'].str.strip().str.lower()
df.sample(5)

Unnamed: 0,review,rating
19433,thin nerawang,2
99574,awesome awesome speed of delivery of goods qua...,4
84861,box slightly dented pen also distorted.,4
6915,not sesuai..bisa i return?,1
104960,thank you! i received the item in good conditi...,4


In [85]:
# rating to index
df['rating'] = df['rating'] - 1
df[df['rating'] == 0].sample(10)

Unnamed: 0,review,rating
2680,speed ​​of delivery is the product quality is ...,0
4146,frustrating. no belt. i have also just improvi...,0
8400,white pinadeliver order them black. sna had be...,0
943,bad bad value cp cp value of the poor quality ...,0
4254,"nyesel buy, the price is a little expensive co...",0
13660,missing many buildings to be erected in the fo...,0
3366,delivery is very long and bertele2 9 days rece...,0
4930,i order sup game box in 5 colour which is ever...,0
1489,last for about 10 minutes.,0
1667,d kljnn jdbqshichqs kqhdjhwqf . dnjhcbh c bdsh...,0


## Transform data and labels into machine-recognizable data

In [86]:
texts = list(df['review'])
labels = list(df['rating'])
print(texts[:5])
print(labels[:5])

['ga disappointed neat products .. meletot hilsnyaa speed \u200b\u200bof delivery is good.', 'rdtanya replace broken glass, broken chargernya', 'nyesel bngt dsni shopping antecedent photo message pictures gk according foto.di existing collagen super fit nyampe holo my house open ehhh collagen contents even in the face pdahal jg description super existing collagen originalnya.pas writing my check lg in photo captions already ma the change ma pictures that the face.', 'sent a light blue suit goods ga want a refund', 'pendants came with dents and scratches on its surface. the coating looks like it will change colour quickly.']
[0, 0, 0, 0, 0]


In [89]:
tokenizer = Tokenizer(num_words=MAX_VOCABULARY_NUM)
tokenizer.fit_on_texts(texts)
word_sequences = tokenizer.texts_to_sequences(texts)

word_id_dict = tokenizer.word_index
print('Total vocabulary numbers: ', len(word_id_dict))

Total vocabulary numbers:  72284


In [90]:
word_sequences = pad_sequences(word_sequences, maxlen=WORD_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
labels

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

## Split train-test data

In [91]:
x_train, x_test, y_train, y_test = train_test_split(word_sequences, labels, test_size=0.2, shuffle=True)
print(len(x_train))
print(len(x_test))

117448
29363


## Construct the embedding layer

In [93]:
embedding_dict = {}

with open('data/lib/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = coefs

f.close

<function TextIOWrapper.close()>

In [94]:
embedding_matrix = np.zeros((len(word_id_dict) + 1, EMBEDDING_DIM))

for word, i in word_id_dict.items():
    embedding_vec = embedding_dict.get(word)
    if embedding_vec is not None:
        embedding_matrix[i] = embedding_vec

In [95]:
embedding_layer = Embedding(input_dim=len(word_id_dict)+1, 
                           output_dim=EMBEDDING_DIM, 
                           weights=[embedding_matrix], 
                           input_length=MAX_VOCABULARY_NUM, 
                           trainable=True)

## Construct CNN model

In [96]:
seq_input = Input(shape=(WORD_SEQUENCE_LENGTH,), dtype='int32')
embedding_seq = embedding_layer(seq_input)
conv_layer1 = Conv1D(128, 8, activation='relu')(embedding_seq)
pool_layer1 = MaxPooling1D(3)(conv_layer1)
conv_layer2 = Conv1D(128, 8, activation='relu')(pool_layer1)
pool_layer2 = MaxPooling1D(3)(conv_layer2)
conv_layer3 = Conv1D(128, 8, activation='relu')(pool_layer2)
pool_layer3 = MaxPooling1D(3)(conv_layer3)
conv_layer4 = Conv1D(128, 8, activation='relu')(pool_layer3)
pool_layer4 = MaxPooling1D(26)(conv_layer4)
flatten_layer = Flatten()(pool_layer3)
dense_layer = Dense(128, activation='relu')(flatten_layer)
predict_layer = Dense(LABEL_NUM, activation='softmax')(dense_layer)

model=Model(seq_input, predict_layer)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()
cp = ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        [(None, 1000)]            0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1000, 100)         7228500   
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 993, 128)          102528    
_________________________________________________________________
max_pooling1d_41 (MaxPooling (None, 331, 128)          0         
_________________________________________________________________
conv1d_43 (Conv1D)           (None, 324, 128)          131200    
_________________________________________________________________
max_pooling1d_42 (MaxPooling (None, 108, 128)          0         
_________________________________________________________________
conv1d_44 (Conv1D)           (None, 101, 128)          131

In [97]:
y_train

array([[0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [98]:
history = model.fit(x_train, y_train, epochs=15, validation_data=(x_test, y_test), batch_size=2, callbacks=[cp])

Epoch 1/15
 8383/58724 [===>..........................] - ETA: 1:28:33 - loss: 1.5174 - acc: 0.2842

KeyboardInterrupt: 