<a href="https://colab.research.google.com/github/RihaChri/NLPReviews/blob/main/NLP_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#Imports
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
import pandas as pd     

#Keras Import
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Report the library versions this notebook is running against
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version:      {tf.keras.__version__}")

# Index of the training review used as the running example in the printouts
ShowCaseExample = 2

#Import of Dataset
# Parse the CSV a single time (the python engine is slow) and slice the two
# columns out of the result instead of reading the same file twice.
DATASET_PATH = '/content/drive/MyDrive/Colab Notebooks/NLP-MovieReviews/IMDB Dataset.csv'
dataset = pd.read_csv(DATASET_PATH, engine='python', usecols=[0, 1])
# Keep both parts as single-column DataFrames (list indexer), exactly as
# usecols=[0] / usecols=[1] produced them.
dataset_X = dataset.iloc[:, [0]]  # first column: review text
dataset_Y = dataset.iloc[:, [1]]  # second column: 'positive' / 'negative' label

#Split into train and test set
# Draw one uniform number per row; rows with a draw <= 0.7 go to the training
# set (about 70%), the rest to the test set. A fixed seed makes the split, and
# therefore every reported number, reproducible across runs.
SPLIT_SEED = 42
splitVector = np.random.default_rng(SPLIT_SEED).random(len(dataset_Y)) <= 0.7
train_X = dataset_X[splitVector]
test_X = dataset_X[~splitVector]
train_Y = dataset_Y[splitVector]
test_Y = dataset_Y[~splitVector]

#convert to numpy array
# Each result keeps the single-column 2-D layout of the source DataFrame.
train_X = np.array(train_X)
test_X = np.array(test_X)
train_Y = np.array(train_Y)
test_Y = np.array(test_Y)
dataset_X = np.array(dataset_X)
dataset_Y = np.array(dataset_Y)

#convert the Y strings 'positive' & 'negative' to 1 and 0
def _encode_sentiment(labels):
    """Map 'positive'/'negative' label strings to 1/0.

    Args:
        labels: array-like of label strings; its shape is preserved.

    Returns:
        np.ndarray of dtype int64 with 1 for 'positive' and 0 for 'negative'.

    Raises:
        ValueError: if any entry is neither 'positive' nor 'negative'.
    """
    labels = np.asarray(labels).astype(str)
    is_positive = labels == 'positive'
    is_negative = labels == 'negative'
    # Fail loudly here instead of letting an unexpected label reach the
    # tensor conversion, where the error would be much harder to read.
    if not np.all(is_positive | is_negative):
        unexpected = np.unique(labels[~(is_positive | is_negative)])
        raise ValueError("Unexpected sentiment labels: "+str(unexpected[:5]))
    return is_positive.astype(np.int64)


train_Y = tf.convert_to_tensor(_encode_sentiment(train_Y), dtype=tf.int64)
test_Y = tf.convert_to_tensor(_encode_sentiment(test_Y), dtype=tf.int64)

print("Dimension of training set: X"+str(train_X.shape)+", Y"+str(train_Y.shape))
print("Dimension of test set: X"+str(test_X.shape)+", Y"+str(test_Y.shape))

#Tokenizer
# num_words caps the vocabulary used when texts are converted to sequences
# (Keras keeps the most frequent words); tokenizer.word_index itself still
# lists every word seen during fitting.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
# NOTE(review): the vocabulary is fitted on the full dataset (train + test).
# Fitting on train_X only would avoid leaking test-set vocabulary - confirm intent.
tokenizer.fit_on_texts(dataset_X[:,0])

# Replace every review by its list of integer word indices
x_train_tokens = tokenizer.texts_to_sequences(train_X[:,0])
x_test_tokens = tokenizer.texts_to_sequences(test_X[:,0])

#checkout an example
print("Example review: "+str(train_X[ShowCaseExample,0]))
print("This example is: "+str(train_Y[ShowCaseExample,0]))
# Printing the complete word index floods the cell output, so show only its
# size and the first few entries.
vocab_preview = dict(list(tokenizer.word_index.items())[:20])
print("word index (first 20 of "+str(len(tokenizer.word_index))+" entries): "+str(vocab_preview))
print("The indexed version of this is:\n "+str(x_train_tokens[ShowCaseExample]))

#number of tokens
# One length per review, training reviews first and test reviews after them,
# so ShowCaseExample indexes the same training review as in x_train_tokens.
num_tokens = np.array([len(tokens) for tokens in x_train_tokens + x_test_tokens])
print("number of tokens (words) in this example: "+str(num_tokens[ShowCaseExample]))
print("\n\nAverage number of tokens (words):"+str(np.mean(num_tokens)))
print("Max number of tokens (words):"+str(np.max(num_tokens)))

#Max allowed number of tokens
# Mean + 2 standard deviations: long enough for most reviews without padding
# every sequence up to the length of the longest one.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
print("Allowed number of tokens (words): "+str(max_tokens))
# A review with exactly max_tokens tokens fits without truncation, so it
# counts as covered (<=, not <).
coverage = np.mean(num_tokens <= max_tokens)
print("This covers about "+str(coverage)+" of the dataset")


# Bring every review to exactly max_tokens indices.
# 'pre' means zeros are added, or surplus indices dropped, at the start of a
# sequence; 'post' would do the same at the end.
pad = 'pre'
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_tokens, padding=pad, truncating=pad)
    for sequences in (x_train_tokens, x_test_tokens)
)
print(f"Train pad shape: {x_train_pad.shape!s}")
print(f"Test pad shape: {x_test_pad.shape!s}")
print(f"The padded version of this example is: {x_train_pad[ShowCaseExample,:]!s}")

# Reverse lookup table: integer token -> word
idx = tokenizer.word_index
inverse_map = {token: word for word, token in idx.items()}

def tokens_to_string(tokens):
    """Turn a sequence of integer tokens back into a space-separated string.

    Token 0 is skipped; every other token is looked up in the module-level
    `inverse_map` (a missing token raises KeyError).
    """
    return " ".join([inverse_map[tok] for tok in tokens if tok != 0])


# Round-trip check: map the example's token ids back to words
example_text = tokens_to_string(x_train_tokens[ShowCaseExample])
print(f"inversing indices of the example yields: {example_text!s}")

#Create the Recurrent Neural Network
# Architecture: embedding -> three stacked GRU layers -> one sigmoid unit
model = Sequential()
embedding_size = 8
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))
# The first two GRUs return the whole sequence so the following GRU has a
# sequence to consume; the last one returns only its final state.
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
# Single sigmoid output, matching the 0/1 labels and binary cross-entropy loss
model.add(Dense(1, activation='sigmoid'))
# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported argument name.
optimizer = Adam(learning_rate=1e-3)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()

#---------------------------
# Fit the network; validation_split sets aside 5% of the training data
fit_options = dict(validation_split=0.05, epochs=3, batch_size=64)
model.fit(x_train_pad, train_Y[:,0], **fit_options)
# Score the trained model on the test set
result = model.evaluate(x_test_pad, test_Y[:,0])


Tensorflow version: 2.7.0
Keras version:      2.7.0


FileNotFoundError: ignored

In [None]:

#Example of Mis-Classified Text
y_pred = model.predict(x=x_test_pad[0:1000])  # predictions for the first 1000 test reviews
y_pred = y_pred.T[0]                          # first row of the transpose -> 1-D vector of predictions
cls_pred = (y_pred > 0.5).astype(float)       # threshold: above 0.5 -> 1.0, otherwise 0.0
cls_true = np.array(test_Y[0:1000,0])         # true labels of the same reviews
# np.where returns a tuple with one index array per dimension; [0] unpacks
# the indices of all misclassified reviews.
incorrect = np.where(cls_pred != cls_true)[0]
# This prints the NUMBER of misclassified reviews (message text kept as is).
print("Länge des falschen Beispiels: "+ str(len(incorrect)))
if len(incorrect) > 0:
    # Show the first misclassified review in detail
    idx = incorrect[0]
    print("Index des falschen Beispiels: "+str(idx))
    text = test_X[idx]
    print("Text des falschen Beispiels: "+str(text))
    print("Vorhersage des falschen Beispiels:"+str(y_pred[idx]))
    print("Tatsächlicher Wert des falschen Beispiels: "+str(cls_true[idx]))
else:
    # Without this guard incorrect[0] would raise IndexError
    print("No misclassified example among the first 1000 test reviews.")

#New Data
# Hand-written reviews used to probe the trained model
texts = [
    "This movie is fantastic! I really like it because it is so good!",
    "Good movie!",
    "Maybe I like this movie.",
    "Meh ...",
    "If I were a drunk teenager then this movie might be good.",
    "Bad movie!",
    "Not a good movie!",
    "This movie really sucks! Can I get my money back please?",
    "This movie really moved me to tears.",
    "What a waste of time",
]
# Keep the individual names available as well
(text1, text2, text3, text4, text5,
 text6, text7, text8, text9, text10) = texts

# Same preprocessing as for the training data: tokenize, then pad/truncate
tokens = tokenizer.texts_to_sequences(texts)
tokens_pad = pad_sequences(tokens, maxlen=max_tokens, padding=pad, truncating=pad)
print(tokens_pad.shape)

# One prediction per review
new_predictions = model.predict(tokens_pad)
print(new_predictions)

#Embedding
# Pull the learned embedding matrix out of the model; rows are indexed by the
# integer word token.
layer_embedding = model.get_layer('layer_embedding')
weights_embedding = layer_embedding.get_weights()[0]
# A bare expression in the middle of a cell is evaluated and thrown away, so
# everything that should be shown is printed explicitly.
print(weights_embedding.shape)
token_good = tokenizer.word_index['good']
print(token_good)
token_great = tokenizer.word_index['great']
print(token_great)
print(weights_embedding[token_good])
print(weights_embedding[token_great])
token_bad = tokenizer.word_index['bad']
token_horrible = tokenizer.word_index['horrible']
print(weights_embedding[token_bad])
print(weights_embedding[token_horrible])


#Sorted Words
def print_sorted_words(word, metric='cosine', k=10):
    """
    Print the words in the vocabulary sorted according to their
    embedding-distance to the given word.

    Reads the module-level `tokenizer`, `weights_embedding` and `inverse_map`.

    Args:
        word: Vocabulary word to measure distances from.
        metric: Distance metric passed on to scipy's `cdist`,
            e.g. 'cosine' or 'euclidean'.
        k: Number of words to print from the top and from the bottom
            of the sorted list.

    Raises:
        KeyError: if `word` is not in `tokenizer.word_index`.
        IndexError: if the word's token has no row in `weights_embedding`.
    """

    # Get the token (i.e. integer ID) for the given word.
    token = tokenizer.word_index[word]

    # The word index can hold tokens that have no row in the embedding
    # matrix; report that clearly instead of a bare numpy IndexError.
    if token >= len(weights_embedding):
        raise IndexError(
            "Word '{0}' has token {1}, which is outside the embedding "
            "matrix with {2} rows.".format(word, token, len(weights_embedding)))

    # The embedding-weight-matrix is indexed by the integer word tokens.
    embedding = weights_embedding[token]

    # Distance between this word's embedding and every row of the matrix.
    distances = cdist(weights_embedding, [embedding],
                      metric=metric).T[0]

    # Row indices (= tokens) ordered from nearest to farthest.
    sorted_index = np.argsort(distances)

    # Build (word, distance) pairs in one pass so both stay aligned.
    # Token 0 (padding) and tokens without a vocabulary word are skipped
    # together with their distance; filtering only the word list would shift
    # every later word onto the wrong distance.
    ranked = [(inverse_map[tok], distances[tok])
              for tok in sorted_index
              if tok != 0 and tok in inverse_map]

    # Helper-function for printing words and embedding-distances.
    def _print_words(pairs):
        for other_word, distance in pairs:
            print("{0:.3f} - {1}".format(distance, other_word))

    print("Distance from '{0}':".format(word))

    # Print the words with smallest embedding-distance.
    _print_words(ranked[:k])

    print("...")

    # Print the words with highest embedding-distance.
    # (ranked[-0:] would be the whole list, hence the guard for k <= 0.)
    _print_words(ranked[-k:] if k > 0 else [])

# Nearest and farthest words for one positive and one negative query word
for query_word in ('great', 'worst'):
    print_sorted_words(query_word, metric='cosine')

Länge des falschen Beispiels: 121
Index des falschen Beispiels: 14
Text des falschen Beispiels: ["Bela Lugosi appeared in several of these low budget chillers for Monogram Studios in the 1940's and The Corpse Vanishes is one of the better ones.<br /><br />Bela plays a mad scientist who kidnaps young brides and kills them and then extracts fluid from their bodies so he can keep his ageing wife looking young. After a reporter and a doctor stay the night at his home and discover he is responsible for the brides' deaths, the following morning they report these murders to the police and the mad scientist is shot and drops dead shortly afterwards.<br /><br />You have got almost everything in this movie: the scientist's assistants consist of an old hag, a hunchback and dwarf (her sons), a thunderstorm and spooky passages in Bela's house. Bela and his wife find they sleep better in coffins rather than beds in the movie.<br /><br />The Corpse Vanishes is worth a look, especially for Bela Lugosi