In [1]:
import pandas as pd 
import numpy as np 
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.treebank import TreebankWordDetokenizer
# Fetch the NLTK resources this notebook relies on: 'punkt' backs word_tokenize and
# 'stopwords' backs stopwords.words(); without the latter a fresh environment raises
# LookupError on the next line. nltk.download is a no-op when the package is up to date.
nltk.download('punkt')
nltk.download('stopwords')
# Set of English stop words used for membership tests during preprocessing.
stop_words = set(stopwords.words('english'))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Flatten, Dropout
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam

import matplotlib.pyplot as plt

from gensim.models import KeyedVectors

import yaml
from keras.models import model_from_yaml
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(666) 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/koalachelsea/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using TensorFlow backend.


# Test with an Example

In [2]:
# Sample review used as the single input for the cells below; it still contains raw
# `<br />` markup. (The "unsup" prefix presumably means an unlabeled review - confirm.)
unsup_text = "Like TWO ARABIAN KNIGHTS, a couple of American soldiers attempt the daring rescue of a Middle Eastern princess from a loveless betrothal.<br /><br />Producer Howard Hughes became a Hollywood power with the very successful release of this, his third motion picture. Thought lost for decades, this wonderful silent comedy has recently been rediscovered & restored and given a splendid orchestral score by Robert Israel. Directed with verve by Lewis Milestone and greatly benefiting from William Cameron Menzies' art direction, the high jinks & high adventure of this antique buddy film are once again ready to delight the viewing audience.<br /><br />Clean-cut private William Boyd and plug-ugly sergeant Louis Wolheim battle Germans, Arabs and each other across Europe, the Mediterranean and into Palestine. They make a terrific comedy duo, constantly involved in one-upmanship and dangerous exploits whether in a POW camp, on a prisoner train, aboard a tramp steamer, or in a Moslem souk and emir's palace. Wolheim, with his hilariously expressive face, has a slight advantage in the scene stealing category, while Boyd has the upper hand in the romantics department.<br /><br />Mary Astor, as the endangered princess, is the willing recipient of Boyd's attentions. Her role doesn't give her a great deal to do except look lovely & alarmed, but these she carries off admirably.<br /><br />In the supporting cast, Michael Visaroff is the black hearted ship's captain who comes into conflict with Boyd & Wolheim; look fast for Boris Karloff as his purser. Dashing Ian Keith nicely plays the young Arab chieftain who will stop at nothing to make Astor his bride.<br /><br />At various points throughout the movie the viewer will notice the deterioration of the film stock, showing that TWO ARABIAN KNIGHTS was indeed rescued, like the princess, just in time."

In [3]:
def clean_text(text):
    """Strip markup and noise from a raw review string.

    The following substitutions are applied in order:

    1. every literal ``<br />`` tag is replaced by a single space;
    2. every URL (``www.…`` or ``http(s)://…`` up to the next whitespace)
       is replaced by a single space;
    3. every ``#`` character is deleted, keeping the hashtag word itself;
    4. every run of non-ASCII characters is replaced by a single space.

    Args:
        text: The raw text. Any object is accepted; it is coerced with ``str()``.

    Returns:
        The cleaned text as a ``str``.
    """
    text = str(text)

    # Strip the line-break markup first. The URL pattern below consumes every
    # non-whitespace character, so "http://x.com<br />more" would otherwise lose
    # "<br" to the URL match and leave a stray "/>" behind.
    text = re.sub(r'<br />', ' ', text)

    # Remove url
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)

    # Remove only the hash sign; the tagged word may carry useful information.
    text = re.sub(r'#', '', text)

    # Replace consecutive non-ASCII characters with a space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text

In [4]:
# Clean the unsup text
# NOTE(review): this assignment rebinds the name `clean_text` from the function defined
# above to its string result. Running this cell a second time without re-running the
# function definition raises TypeError ('str' object is not callable). The tokenizing
# cell that follows reads the cleaned string through this same name, so any rename must
# be made in both places together.
clean_text = clean_text(unsup_text)
# Bare expression: displays the cleaned string as the cell output.
clean_text

"Like TWO ARABIAN KNIGHTS, a couple of American soldiers attempt the daring rescue of a Middle Eastern princess from a loveless betrothal.  Producer Howard Hughes became a Hollywood power with the very successful release of this, his third motion picture. Thought lost for decades, this wonderful silent comedy has recently been rediscovered & restored and given a splendid orchestral score by Robert Israel. Directed with verve by Lewis Milestone and greatly benefiting from William Cameron Menzies' art direction, the high jinks & high adventure of this antique buddy film are once again ready to delight the viewing audience.  Clean-cut private William Boyd and plug-ugly sergeant Louis Wolheim battle Germans, Arabs and each other across Europe, the Mediterranean and into Palestine. They make a terrific comedy duo, constantly involved in one-upmanship and dangerous exploits whether in a POW camp, on a prisoner train, aboard a tramp steamer, or in a Moslem souk and emir's palace. Wolheim, wit

In [5]:
# Tokenize the cleaned review, drop non-alphabetic tokens and stop words, then stem.
# NOTE(review): the stop-word membership test uses each token's original casing -
# confirm this matches the preprocessing applied when the model was trained.
stemmer = SnowballStemmer("english", ignore_stopwords=True)
result = []
for word in word_tokenize(clean_text):
    # Skip anything containing digits/punctuation, and anything listed as a stop word.
    if not word.isalpha() or word in stop_words:
        continue
    result.append(stemmer.stem(word))
# Re-assemble the surviving stems into one space-separated document.
string = ' '.join(result)

In [6]:
# Transform the preprocessed review into a fixed-length integer sequence.
max_features = 5000  # vocabulary cap handed to the tokenizer
maxlen = 100         # length every sequence is padded/truncated to

# initialize the tokenizer with a 5000 word limit
# NOTE(review): the tokenizer is fitted on the review being classified, so its word
# indices are presumably unrelated to the ones the saved model was trained with -
# confirm, and load the tokenizer persisted at training time if one exists.
tokenizer = Tokenizer(num_words=max_features)

# Keras expects a list of documents. A bare str is iterated character by character,
# which would yield one sequence per character instead of one per review, so the
# single document is wrapped in a list.
tokenizer.fit_on_texts([string])
vocab_size = len(tokenizer.word_index) + 1

# One document in, one integer sequence out. The result gets its own name so `string`
# keeps holding the text and this cell can be re-run safely.
str_sequences = tokenizer.texts_to_sequences([string])

# Pad (or truncate) to `maxlen`, padding at the end.
str_padded_seq = pad_sequences(str_sequences, padding='post', maxlen=maxlen)

In [7]:
# Rebuild the network from its saved YAML architecture, then attach the trained weights.
print('loading model......')
with open('model/lstm.yml', 'r') as f:
    # yaml.load() without an explicit Loader is deprecated, can construct arbitrary
    # Python objects from the file, and raises TypeError on PyYAML >= 6. safe_load
    # only builds plain data types.
    # NOTE(review): this assumes the file holds the architecture as a YAML-encoded
    # string, which is what model_from_yaml is handed below - confirm against the
    # code that wrote the file.
    yaml_string = yaml.safe_load(f)
model = model_from_yaml(yaml_string)

print('loading weights......')
model.load_weights('model/lstm.h5')
# Compile so the restored model carries a loss, optimizer and accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

loading model......


  This is separate from the ipykernel package so we can avoid doing imports until


loading weights......


In [8]:
# Classify the padded sequence and report the label next to the original review text.
result = model.predict_classes(str_padded_seq)
# Only the first prediction is inspected; class 1 is checked before class 0, and
# nothing is printed when it matches neither.
for class_id, label in ((1, '\n positive'), (0, '\n negative')):
    if result[0] == class_id:
        print(unsup_text, label)
        break

Like TWO ARABIAN KNIGHTS, a couple of American soldiers attempt the daring rescue of a Middle Eastern princess from a loveless betrothal.<br /><br />Producer Howard Hughes became a Hollywood power with the very successful release of this, his third motion picture. Thought lost for decades, this wonderful silent comedy has recently been rediscovered & restored and given a splendid orchestral score by Robert Israel. Directed with verve by Lewis Milestone and greatly benefiting from William Cameron Menzies' art direction, the high jinks & high adventure of this antique buddy film are once again ready to delight the viewing audience.<br /><br />Clean-cut private William Boyd and plug-ugly sergeant Louis Wolheim battle Germans, Arabs and each other across Europe, the Mediterranean and into Palestine. They make a terrific comedy duo, constantly involved in one-upmanship and dangerous exploits whether in a POW camp, on a prisoner train, aboard a tramp steamer, or in a Moslem souk and emir's p