# Data Analysis

## Import packages

In [1]:
import os
import re
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras

from keras.utils.np_utils import to_categorical
from keras.utils import plot_model
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint

  return f(*args, **kwds)
  return f(*args, **kwds)


## Define constants

In [2]:
INPUT_SEQUENCE_LENGTH = 15
EMBEDING_DIM = 20
LABEL_NUM = 2

## Read data

In [3]:
src = 'data/train.csv'

In [4]:
_train = pd.read_csv(src)
_train.sample(10)

Unnamed: 0,country_code,grass_date,user_id,subject_line_length,last_open_day,last_login_day,last_checkout_day,open_count_last_10_days,open_count_last_30_days,open_count_last_60_days,login_count_last_10_days,login_count_last_30_days,login_count_last_60_days,checkout_count_last_10_days,checkout_count_last_30_days,checkout_count_last_60_days,open_flag,row_id
35471,3,2019-08-08 00:00:00+08:00,78301,52,Never open,1,1,0,0,0,11,33,72,3,5,8,1,35471
45593,1,2019-08-16 00:00:00+08:00,42464,47,115,4,6,0,0,0,68,121,220,1,3,5,0,45593
72737,2,2019-09-02 00:00:00+08:00,103134,38,12,129,129,0,1,3,0,0,0,0,0,0,0,72737
39634,1,2019-08-12 00:00:00+08:00,9922,30,48,2,10,0,0,1,8,14,20,1,2,3,0,39634
26078,2,2019-08-03 00:00:00+08:00,76124,34,1,5,38,5,14,29,3,9,27,0,0,1,1,26078
22328,1,2019-07-30 00:00:00+08:00,79392,60,66,2,3,0,0,0,36,77,94,15,24,29,0,22328
56906,5,2019-08-23 00:00:00+08:00,40116,20,2,1,4,3,11,17,10,10,13,1,1,1,1,56906
3278,3,2019-07-17 00:00:00+08:00,70926,30,Never open,7,15,0,0,0,9,28,34,0,6,7,0,3278
70205,5,2019-09-02 00:00:00+08:00,13393,23,8,14,20,2,4,13,0,10,18,0,1,2,1,70205
7842,3,2019-07-21 00:00:00+08:00,102038,50,23,17,23,0,1,2,0,7,16,0,1,5,0,7842


In [5]:
train = _train.copy()

## Transform data and labels into machine-recognizable data

In [6]:
columns = list(train.drop(['user_id', 'open_flag', 'row_id'], axis=1, inplace=False).columns)

In [7]:
train = train.replace('Never open', -1).replace('Never login', -1).replace('Never checkout', -1)

In [8]:
def to_timestamp(datetime):
    return datetime.timestamp()

In [9]:
# transform datetime to unit(s)
train['grass_date'] = pd.to_datetime(train['grass_date'])
train['grass_date'] = train['grass_date'].apply(to_timestamp)
train['grass_date']

0        1.563206e+09
1        1.563206e+09
2        1.563206e+09
3        1.563206e+09
4        1.563206e+09
             ...     
73534    1.567354e+09
73535    1.567354e+09
73536    1.567354e+09
73537    1.567354e+09
73538    1.567354e+09
Name: grass_date, Length: 73539, dtype: float64

In [12]:
_inputs = list(list(train.loc[i, columns]) for i in tqdm(range(len(train))))
std_scaler = StandardScaler()
std_scaler.fit(_inputs)
_inputs = std_scaler.transform(_inputs)
_labels = train['open_flag'].tolist()
print(_inputs[:5])
print(_labels[:5])

100%|██████████| 73539/73539 [00:47<00:00, 1548.24it/s]


[[ 0.87560055 -1.68865218  0.03062836 -0.19343485 -0.13677297 -0.32661951
  -0.51984502 -0.17077048 -0.12899651  0.26948899  0.46802047  0.69605094
  -0.31512753  0.28674948  0.32754016]
 [ 0.87560055 -1.68865218  0.03062836 -0.3811885  -0.1425771  -0.41148591
   0.62037966  1.36088847  1.51610264  0.71643819  0.60011063  0.57073563
   0.03120522 -0.23367649 -0.10164174]
 [ 2.06945961 -1.68865218  0.47629107 -0.28731167 -0.13967503 -0.43694583
  -0.51984502  0.26684636  0.8833722   1.16338738  1.15488933  0.9745294
   1.41653622  2.10824036  1.54355553]
 [-0.91518805 -1.68865218  0.47629107  0.36982611 -0.12806678 -0.02958709
  -0.51984502 -0.60838733 -0.50863478  0.04601439 -0.0603402   0.27833324
   0.03120522  0.02653649  0.04141889]
 [ 2.06945961 -1.68865218  0.47629107  3.7118411  -0.13677297  1.39616852
  -0.51984502 -0.60838733 -0.63518087 -0.47542634 -0.5358648  -0.61279785
  -0.31512753 -0.36378298 -0.387763  ]]
[0, 1, 0, 0, 0]


In [13]:
inputs = np.asarray(_inputs, dtype='int32')
_labels = np.asarray(_labels, dtype='int32')
labels = to_categorical(_labels)

## Random partial data selection
<span style="color:#FF0000"><i class="fa fa-exclamation-circle"></i>
 To fit the whole data, please skip this cell</span>

## Split train-test data

In [14]:
x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2)
print(len(x_train))
print(len(x_test))

58831
14708


## Construct DNN model

In [15]:
model = keras.models.Sequential()
model.add(Dense(units=16, input_dim=15, activation='relu'))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[keras.metrics.BinaryAccuracy(name='accuracy')])
model.summary()
cp = ModelCheckpoint('model/model_dnn_test.hdf5',monitor='val_accuracy',verbose=1,save_best_only=True)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                256       
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 18        
Total params: 410
Trainable params: 410
Non-trainable params: 0
_________________________________________________________________


In [16]:
history = model.fit(x_train, y_train, epochs=100, validation_data=(x_test, y_test), callbacks=[cp])

Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.87565, saving model to model/model_dnn_test.hdf5
Epoch 2/100
Epoch 00002: val_accuracy did not improve from 0.87565
Epoch 3/100
Epoch 00003: val_accuracy did not improve from 0.87565
Epoch 4/100
Epoch 00004: val_accuracy improved from 0.87565 to 0.87571, saving model to model/model_dnn_test.hdf5
Epoch 5/100
Epoch 00005: val_accuracy improved from 0.87571 to 0.87639, saving model to model/model_dnn_test.hdf5
Epoch 6/100
Epoch 00006: val_accuracy did not improve from 0.87639
Epoch 7/100
Epoch 00007: val_accuracy did not improve from 0.87639
Epoch 8/100
Epoch 00008: val_accuracy improved from 0.87639 to 0.87680, saving model to model/model_dnn_test.hdf5
Epoch 9/100
Epoch 00009: val_accuracy did not improve from 0.87680
Epoch 10/100
Epoch 00010: val_accuracy improved from 0.87680 to 0.87721, saving model to model/model_dnn_test.hdf5
Epoch 11/100
Epoch 00011: val_accuracy did not improve from 0.87721
Epoch 12/100
Epoch 00012: val

Epoch 00028: val_accuracy did not improve from 0.87762
Epoch 29/100
Epoch 00029: val_accuracy did not improve from 0.87762
Epoch 30/100
Epoch 00030: val_accuracy did not improve from 0.87762
Epoch 31/100
Epoch 00031: val_accuracy improved from 0.87762 to 0.87803, saving model to model/model_dnn_test.hdf5
Epoch 32/100
Epoch 00032: val_accuracy did not improve from 0.87803
Epoch 33/100
Epoch 00033: val_accuracy did not improve from 0.87803
Epoch 34/100
Epoch 00034: val_accuracy did not improve from 0.87803
Epoch 35/100
Epoch 00035: val_accuracy did not improve from 0.87803
Epoch 36/100
Epoch 00036: val_accuracy did not improve from 0.87803
Epoch 37/100
Epoch 00037: val_accuracy did not improve from 0.87803
Epoch 38/100
Epoch 00038: val_accuracy did not improve from 0.87803
Epoch 39/100
Epoch 00039: val_accuracy did not improve from 0.87803
Epoch 40/100
Epoch 00040: val_accuracy did not improve from 0.87803
Epoch 41/100
Epoch 00041: val_accuracy did not improve from 0.87803
Epoch 42/100
E

KeyboardInterrupt: 

In [None]:
h_accuracy = history.history['accuracy']
h_val_accuracy = history.history['val_accuracy']
h_loss = history.history['loss']
h_val_loss = history.history['val_loss']

fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(y=h_accuracy, mode='lines+markers', name='accuracy', line=dict(color='skyblue')),
              row=1, col=1)
fig.add_trace(go.Scatter(y=h_val_accuracy, mode='lines+markers', name='validation accuracy', line=dict(color='dodgerblue')),
              row=1, col=1)

fig.add_trace(go.Scatter(y=h_loss, mode='lines+markers',name='loss', line=dict(color='lightsalmon')),
              row=1, col=2)
fig.add_trace(go.Scatter(y=h_val_loss, mode='lines+markers', name='validation loss', line=dict(color='tomato')),
              row=1, col=2)

fig.update_xaxes(title_text='Epochs', row=1, col=1)
fig.update_xaxes(title_text='Epochs', row=1, col=2)
fig.update_yaxes(title_text='Accuracy', row=1, col=1)
fig.update_yaxes(title_text='Loss', row=1, col=2)

fig.update_layout(title='Model Performation', height=480, width=1080)

## Data prediction
I don't know why the test data on kaggle is different with the one on google drive

In [None]:
testdf = pd.read_csv('data/test.csv')
testdf.head()

In [None]:
len(testdf)

In [None]:
test = testdf['review'].tolist()
test_seq = tokenizer.texts_to_sequences(test)
test_seq = pad_sequences(test_seq, padding='post', maxlen=WORD_SEQUENCE_LENGTH)

In [None]:
pred = model.predict(test_seq)

In [None]:
pred

In [None]:
classes = np.argmax(pred, axis=1)
classes = classes + 1
submission = testdf.drop('review', axis=1)
submission['rating']=classes
submission.head()

In [None]:
print('===========Description===========\n', submission.describe(), '\n')
print('rating 1: ', submission[submission['rating'] == 1].rating.count())
print('rating 2: ', submission[submission['rating'] == 2].rating.count())
print('rating 3: ', submission[submission['rating'] == 3].rating.count())
print('rating 4: ', submission[submission['rating'] == 4].rating.count())
print('rating 5: ', submission[submission['rating'] == 5].rating.count())

In [None]:
submission.to_csv('submission/submission_00.csv', index=False)