In [1]:
# Check if GPU is detected

import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# navigate to the zipped training file

In [5]:
cd /content/gdrive/My Drive/Lesson-9/

/content/gdrive/My Drive/Lesson-9


In [6]:
pwd

'/content/gdrive/My Drive/Lesson-9'

In [10]:
!unzip data.csv.zip

Archive:  data.csv.zip
  inflating: data.csv                
   creating: __MACOSX/
  inflating: __MACOSX/._data.csv     


Modelling

In [3]:
import os
import re
import pickle
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

Using TensorFlow backend.


In [0]:
# read and process data

In [0]:
def preprocess_data(data_file_path):
    data = pd.read_csv(data_file_path, header=None) # read the csv
    data.columns = ['rating', 'title', 'review'] # add column names
    data['review'] = data['review'].apply(lambda x: x.lower()) # change all text to lower
    data['review'] = data['review'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x))) # remove all numbers
    return data

In [0]:
df = preprocess_data('data.csv')

In [9]:
df.head()

Unnamed: 0,rating,title,review
0,3,more like funchuck,gave this to my dad for a gag gift after direc...
1,5,Inspiring,i hope a lot of people hear this cd we need mo...
2,5,The best soundtrack ever to anything.,im reading a lot of reviews saying that this i...
3,4,Chrono Cross OST,the music of yasunori misuda is without questi...
4,5,Too good to be true,probably the greatest soundtrack in history us...


In [0]:
# initialize tokenization

max_features = 2000
maxlength = 250

tokenizer = Tokenizer(num_words=max_features, split=' ')

In [0]:
# fit tokenizer

tokenizer.fit_on_texts(df['review'].values)
X = tokenizer.texts_to_sequences(df['review'].values)

# pad sequences
X = pad_sequences(X, maxlen=maxlength)

In [23]:
X.shape

(3000000, 250)

In [0]:
# get target variable

y_train = pd.get_dummies(df.rating).values

In [24]:
y_train.shape

(3000000, 5)

In [0]:
# model

In [15]:
embed_dim = 128
hidden_units = 100
n_classes = 5

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
model.add(LSTM(hidden_units))
model.add(Dense(n_classes, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 128)          256000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               91600     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 348,105
Trainable params: 348,105
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
# fit the model

model.fit(X[:100000, :], y_train[:100000, :], batch_size = 128, epochs=15, validation_split=0.2)

Train on 80000 samples, validate on 20000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fcba53a00f0>

In [0]:
# save model and tokenizer

model.save('trained_model.h5')  # creates a HDF5 file 'trained_model.h5'

with open('trained_tokenizer.pkl', 'wb') as f: # creates a pickle file 'trained_tokenizer.pkl'
    pickle.dump(tokenizer, f)

In [0]:
from google.colab import files
files.download('trained_model.h5')
files.download('trained_tokenizer.pkl')