<a href="https://colab.research.google.com/github/Razzf/MachineLearningTests/blob/master/TxtClsfication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#importing modules
import tensorflow as tf
from tensorflow import keras
import numpy
from random import randint
import string

#loading the data
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

In [0]:
# A dictionary mapping words to an integer index
word_index = imdb.get_word_index()

word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(coded_text):
	return " ".join([reverse_word_index.get(i, "?") for i in coded_text])
 
def encode_review(text):
	encoded = [1]
	for word in text:
		if word.lower() in word_index:
			encoded.append(word_index[word.lower()])
		else:
			encoded.append(2)

	return encoded

# this function will return the decoded (human readable) reviews  

In [0]:
#funcs for debugging
def get_key(val): 
	for key, value in word_index.items(): 
		if val == value: 
			return key

def get_value(key):
	for keyy, value in word_index.items():
		if key == keyy:
			return value


In [124]:
print(train_data[0], "\n")
print(decode_review(train_data[0]))

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32] 

<START> this film was just brilliant casting location scen

In [0]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding="post", maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding="post", maxlen=250)

In [0]:
#model architecture

model = keras.Sequential()
model.add(keras.layers.Embedding(80000, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.summary()

model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])


In [0]:

x_val = train_data[:10000]
x_train = train_data[10000:]

y_val = train_labels[:10000]
y_train = train_labels[10000:]


In [0]:
fitModel = model.fit(x_train, y_train, epochs=40, batch_size=512, validation_data=(x_val, y_val), verbose=1)

In [129]:
results = model.evaluate(test_data, test_labels)
print(results)
model.save("model.h5")


[0.3255917429924011, 0.8720800280570984]


In [152]:

#model = keras.models.load_model("model.h5")

with open("text.txt", encoding="utf-8") as f:
 for line in f.readlines():
  nline = line.translate(str.maketrans('', '', string.punctuation))
  nline = nline.split(" ")
  encode = encode_review(nline)
  encode = keras.preprocessing.sequence.pad_sequences([encode], value=word_index["<PAD>"], padding="post", maxlen=250) # make the data 250 words long
  predict = model.predict(encode)
  print(line)
  print(encode)
  print(predict[0])
  


The images were brilliant the acting was top notch and everything was blended together by Hans Zimmer and his Music. 169 minutes flew by me with my eyes fixed on the screen and my heart racing. And there it was.. The ending. I couldn't believe it . I was reliving, rethinking the movie while the credit scenes rolled enjoying the moment, the smell of popcorn, my comfortable seat and what do i see next to me? Ninety percent of the people in the cinema rushing outside after the first second of the credit scenes.
[[    1     4  1218    71   530     4   116    16   350  2501     5   285
     16 10792   295    34  8360 20952     5    27   228 51306   234  8754
     34    72    19    61   523  8533    23     4   268     5    61   483
   5969     5    50    12    16     4   277    13 24882   264    12     2
     13    16 18352 51743     4    20   137     4  1109   139  4985  2961
      4   561     4  7625     7  3912    61  3969  2224     5    51    81
     13    67   375     8    72  7774  892

In [131]:

rand = randint(0, 10000)
coded_test_review = test_data[rand]
print(coded_test_review)
prediction = model.predict([coded_test_review])

print("Rewiev: ", '\n', decode_review(coded_test_review))

print("Prediction: " + str(prediction[0]))
print("Actual: " + str(test_labels[rand]))
print(results)

[   1   13  188 1070    7    6    2 1039    7   14   13   69  110   12
  450  153  596   61   64 1136  585   69   77  736 1067 1716  121   12
   16    2    2   31  115   66  214    6  580    8   79   83  134  102
   88    7   32    4    2   13  119    4    2    7 5689  189  102   36
  202  178    6 8291 9421   23    4 4824 8362  751    4    2  134   71
   50    8 2910  178   39    4  873 3737 4710   21   36  144  216   11
 8747  348    4 3842    7   55 1058 9415   10   10   14  843   31 4137
   18   49  282  466   94    2   93    7    4  538    2   26   93    7
   36   70  131    2  125  945    8  945    2    5    2 4025 7061   10
   10  553   45   53  253   86    4 5662 5211  420   37  219    4 1549
  155    2   27 6055   11    6 1550   12   43 5291   31   67   35    2
   17  194   17    6  313  401  129 1642 2925   13  594   48    4  338
  232  188    8  140  344  103   36  258    4 5698   10   10  900   14
    9    6  184 1981  781   12 1160    4  644 2048    8  216   56   19
    6 