# TDT4173 Machine learning model training 

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
import tensorflow.keras.utils as ku 
import tensorflow as tf
import numpy as np 
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

First we check the number of lines in the file

## Loading the dataframe and balancing the data

We want an equal number of reviews for every rating, so we set `cap_number` to the smallest per-rating review count (in this dataset, that is the number of reviews rated 2 stars).


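The balancing is wrapped in a function to save memory: the full, unbalanced dataframe only exists inside the function and can be freed as soon as it returns.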

In [2]:
def create_balanced_dataset(filename, random_state=None, num_labels=5):
    """Load a labelled review CSV and return a class-balanced, shuffled dataframe.

    The CSV must contain a ``label`` column. Every label in
    ``range(num_labels)`` is down-sampled to the size of the rarest label
    present in the file, and the resulting rows are returned in random order.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read. Malformed rows are skipped (with a
        warning) instead of aborting the load.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for the sampling and the final shuffle. ``None``
        (the default) keeps the original unseeded behaviour.
    num_labels : int, optional
        Number of consecutive integer labels, starting at 0, to balance.

    Returns
    -------
    pandas.DataFrame
        The balanced rows, keeping their original index values.

    Raises
    ------
    ValueError
        If the file contains no labelled rows.
    """
    try:
        # `on_bad_lines` replaces `error_bad_lines`, which was removed in pandas 2.0.
        df = pd.read_csv(filename, on_bad_lines="warn", engine="python")
    except TypeError:
        # Older pandas releases only know the legacy keyword.
        df = pd.read_csv(filename, error_bad_lines=False, engine="python")

    _, counts = np.unique(df["label"], return_counts=True)
    if len(counts) == 0:
        raise ValueError("No labelled rows found in {!r}".format(filename))
    cap_number = int(counts.min())

    # Share one generator across all draws so a seed makes the whole result repeatable.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Sample at most `cap_number` random rows for each rating.
    samples = []
    for label in range(num_labels):
        label_rows = df[df["label"] == label]  # filter once per label
        n = min(cap_number, len(label_rows))
        samples.append(label_rows.sample(n=n, random_state=rng))

    # Return the concatenated dataframes in randomised order
    return pd.concat(samples).sample(frac=1.0, random_state=rng)

In [3]:
# Build the balanced dataframe once; the training cells below reuse it as `df`.
# The loader skips rows it cannot parse rather than raising.
balanced_dataframe = create_balanced_dataset("../Data/output_final2.csv")

Skipping line 268030: unexpected end of data


In [4]:
# Total number of rows kept after balancing.
print(balanced_dataframe.shape[0])

103710


## Training the LSTM

In [5]:
# --- File locations ---
# Name of the raw review CSV and of the file the trained model is saved to.
data_file_path = "output_final2.csv"
model_file_path = "balanced_model_5epochs.h5"

# --- Text preprocessing limits ---
# Upper bound on the tokenizer vocabulary (only the most frequent words are kept).
vocabulary_size = 50000
# Fixed sequence length, in words, that every review is padded or cut to.
max_review_size = 100

In [6]:
# Work on the balanced frame built earlier instead of re-reading the raw CSV.
df = balanced_dataframe

# Preview the first rows and report how many reviews the corpus holds.
preview = df.head()
print(preview)
print("Length of corpus:", len(df))

                                                 training  label
250931  ther pros living security gate hour security p...      1
141366  ordered food online came minutes later right t...      4
67132   cleveland clinic doctors great terrible commer...      0
95021   second time bring home goodies familia count h...      3
193355  margarias brisket tacos amazing place hasn't o...      4
Length of corpus: 103710


In [7]:
# Collect the review texts as a plain Python list
corpus = df['training'].tolist()

# Fit a tokenizer whose sequences are limited to the most frequent words
# (bounded by `vocabulary_size`); `word_index` still lists every token seen.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(corpus)
unique_token_count = len(tokenizer.word_index)
total_words = unique_token_count + 1
print('Total words:', total_words)
print('#unique tokens:', unique_token_count)

# Turn each review into a sequence of word ids of length `max_review_size`.
# Keras pads (and truncates) at the START of a sequence by default; the
# defaults are spelled out here so the behaviour is explicit.
sequences = tokenizer.texts_to_sequences(corpus)
X = pad_sequences(sequences, maxlen=max_review_size, padding='pre', truncating='pre')
print('Shape of training data:', X.shape)

# Integer rating labels, aligned row-for-row with X
Y = df['label'].to_numpy()
print('Shape of label tensor:', Y.shape[0])

Total words: 64821
#unique tokens: 64820
Shape of training data: (103710, 100)
Shape of label tensor: 103710


In [8]:
# Hold out 10% of the samples as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42
)

# Report the dimensions of every split.
for description, split in (
    ("Shape of training features: ", X_train),
    ("Shape of training labels: ", Y_train),
    ("Shape of test features: ", X_test),
    ("Shape of test labels: ", Y_test),
):
    print(description, split.shape)

Shape of training features:  (93339, 100)
Shape of training labels:  (93339,)
Shape of test features:  (10371, 100)
Shape of test labels:  (10371,)


In [9]:
# Build the network as a single layer stack:
#   word ids -> 64-dim embeddings -> spatial dropout (0.2) -> LSTM (32 units)
#   -> softmax over the 5 rating classes
model = Sequential([
    Embedding(vocabulary_size, 64, input_length=X.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(32),
    Dense(5, activation='softmax'),
])

# The labels are plain integers, hence the sparse variant of cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [10]:
# Train a new model, save it, plot the learning curves and score the test set.
# NOTE: this cell always retrains; it never loads a previously saved model.

# Define constants for training
epochs = 5
batch_size = 128

# Keras holds out the last 10% of the training data as a validation split.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1)
model.save(model_file_path)

# Older TensorFlow/Keras releases record accuracy under 'acc', newer ones
# under 'accuracy'; use whichever key this run actually produced.
accuracy_key = 'accuracy' if 'accuracy' in history.history else 'acc'

# One figure per metric. The second curve comes from the validation split,
# not from the held-out test set, so it is labelled accordingly.
for title, key in (('Loss', 'loss'), ('Accuracy', accuracy_key)):
    plt.title(title)
    plt.plot(history.history[key], label='train')
    plt.plot(history.history['val_' + key], label='validation')
    plt.xlabel('Epoch')
    plt.legend()
    plt.show()

# Final score on the test data that was never seen during training.
accr = model.evaluate(X_test, Y_test, verbose=0)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))


Train on 84005 samples, validate on 9334 samples
Epoch 1/5


UnknownError: Fail to find the dnn implementation.
	 [[{{node cu_dnnlstm/CudnnRNN}}]]
	 [[{{node loss/dense_loss/broadcast_weights/assert_broadcastable/is_valid_shape/has_valid_nonscalar_shape/has_invalid_dims/concat}}]]

In [None]:
# NOTE(review): looks like a leftover scratch cell (it was never executed in
# this saved run) — consider removing it.
print("hello")