## Config

In [11]:
import pandas as pd
import numpy as np

import os
import re
import string
from nltk.corpus import stopwords

import tensorflow as tf
from sklearn.model_selection import train_test_split

import json
from keras_preprocessing.text import tokenizer_from_json


# WORD-level
MAX_SEQ_LENGTH    = 500
MAX_NUM_WORDS     = 15000

## Load Data

In [57]:
## Input data here. For now hard coded, but want a list of sentences to check
X_test = [
        "Some 79 percent fretted about heightened likelihood of military conflict and 73 percent saw rising risks of an erosion of world trading rules.",
         "The European Commission, the European Union's executive arm, found that Qualcomm's practices had a significant, detrimental impact on competition in the region.",
         "One study found that for cancer patients considering experimental chemotherapy, trust in their physician was one of the most important reasons they enrolled in a clinical trial -- on par with the belief that the treatment would be effective.",
         "Three weeks after the intrusion came to light, American officials are still trying to understand whether what the Russians pulled off was simply an espionage operation inside the systems of the American bureaucracy or something more sinister, inserting “backdoor” access into government agencies, major corporations, the electric grid and laboratories developing and transporting new generations of nuclear weapons.",
         "But with a new administration taking office in three weeks, some analysts say the Russians may be trying to shake Washington’s confidence in the security of its communications and demonstrate their cyberarsenal to gain leverage against President-elect Joseph R. Biden Jr. before nuclear arms talks.",
         "Eight weeks later, General Nakasone and other American officials responsible for cybersecurity are now consumed by what they missed for at least nine months: a hacking, now believed to have affected upward of 250 federal agencies and businesses, that Russia aimed not at the election system but at the rest of the United States government and many large American corporations.",
        "In 1966, more than three-fourths of Americans had great confidence in medical leaders; today, only 34 percent do."
         "That level of self-awareness is quite elusive: Although some 95 percent of people think they're self-aware, only about 10 percent to 15 percent truly are, according to one study.",
        "Additionally, using data on latitude and longitude reported in the tickets, we found that increases in this extreme speeding were concentrated in areas close to movie theaters (often within two miles), consistent with speeding behavior induced by moviegoing.",
"It's not just that 71 percent of Americans oppose federal government efforts to stop marijuana sales, but an equally large majority thinks overall drug abuse should be treated as an addiction and mental health problem, rather than a criminal offense.",
"A big reason: 87 percent of respondents, including 79 percent of Republicans, said in one poll that insurers should be required to cover people with pre-existing conditions.",
"According to the most recent statistics, more than a million people a year are arrested for simple drug possession in the United States -- and more than half a million of those arrests are for marijuana possession."]




       

## Preprocess Data

In [58]:
def clean_doc(doc):
    """
    Cleaning a document by several methods:
        - Lowercase
        - Removing whitespaces
        - Removing numbers
        - Removing stopwords
        - Removing punctuations
        - Removing short words
    """
    stop_words = set(stopwords.words('english'))
    
    # Lowercase
    doc = doc.lower()
    # Remove numbers
    #doc = re.sub(r"[0-9]+", "", doc)
    # Split in tokens
    tokens = doc.split()
    # Remove Stopwords
    tokens = [w for w in tokens if not w in stop_words]
    # Remove punctuation
    tokens = [w.translate(str.maketrans('', '', string.punctuation)) for w in tokens]
    # Tokens with less then two characters will be ignored
    tokens = [word for word in tokens if len(word) > 1]
    return ' '.join(tokens)

In [59]:
X_test = [clean_doc(x) for x in X_test]

# load tokenizer
with open('models/combined-tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)

sequences_test = tokenizer.texts_to_sequences(X_test)
X_test_word    = tf.keras.preprocessing.sequence.pad_sequences(sequences_test, maxlen=MAX_SEQ_LENGTH, padding='post')

## Load Model

In [60]:
cnn_ = tf.keras.models.load_model("models/model-combined-1.h5")

## Run Prediction

In [61]:
predictions = cnn_.predict(X_test_word, batch_size=10, verbose=0)
print(predictions)

# classify (is insight?)
rounded_predictions = [round(rp[1]) for rp in predictions]
print(rounded_predictions)

[[0.88195443 0.11804556]
 [0.89204884 0.10795109]
 [0.37104332 0.6289567 ]
 [0.94112545 0.05887448]
 [0.98701906 0.01298088]
 [0.9243415  0.07565851]
 [0.359622   0.640378  ]
 [0.46854678 0.53145325]
 [0.60797894 0.3920211 ]
 [0.6159451  0.3840549 ]
 [0.28326824 0.7167318 ]]
[0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1]
