**Load Trained Model**

In [1]:
from keras.models import model_from_json

Using TensorFlow backend.


In [2]:
# Mount the user's Google Drive inside the Colab VM; the call prompts for an
# authorization code and exposes the drive under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
# Folder on the mounted drive that holds the saved model and the test files.
drive_url = "/content/gdrive/My Drive/Hackathon/"
model_json = drive_url+"cnn_model.json"
weights = drive_url+'cnn_weights.h5'
# The architecture is stored as JSON and the weights separately as HDF5:
# rebuild the network from the JSON first, then load the weights into it.
with open(model_json, "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights(weights)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


**Load Test Data**

In [0]:
import pandas as pd
import numpy as np

In [0]:
test_data_url = drive_url+"mixed_test_reviews.txt"
# The file holds one review per line. Read the lines directly rather than via
# pd.read_csv(sep='\n'): current pandas releases reject '\n' as a separator
# (ValueError), and the CSV parser would also interpret quote characters that
# appear inside the free-text reviews.
with open(test_data_url, "r", encoding="utf-8") as review_file:
    review_lines = [line.rstrip("\r\n") for line in review_file]
# Empty lines are dropped, matching read_csv's default skip_blank_lines=True,
# so rows stay aligned with the targets file. The text is stored in column 0,
# the same label read_csv(header=None) produced, which later cells rely on.
test_data = pd.DataFrame({0: [line for line in review_lines if line]})

**Vectorize Test Data**

In [4]:
# Fetch the pre-trained GoogleNews word2vec archive (about 1.5 GB) into
# /root/input/ (-P sets the target directory, -c resumes a partial download).
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2019-04-17 21:24:41--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.98.93
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.98.93|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘/root/input/GoogleNews-vectors-negative300.bin.gz’


2019-04-17 21:25:17 (44.1 MB/s) - ‘/root/input/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [10]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
# Path of the archive downloaded by the wget cell above.
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz'
# binary=True because the archive is in the binary word2vec format; the
# resulting lookup is used as a global by tokenize_and_vectorize.
word_vectors = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [0]:
def tokenize_and_vectorize(dataset):
  """
  Turn every text sample into the list of its word2vec embeddings.

  Each sample is split with the Treebank tokenizer and every token is looked
  up in the notebook-level ``word_vectors``; tokens that are not in that
  vocabulary are silently dropped, so a sample can yield an empty list.
  The number of processed samples is printed once at the end.

  Args:
    dataset: iterable of strings, one per sample.

  Returns:
    A list with one entry per sample, each entry being the list of word
    vectors found for that sample (in token order).
  """
  treebank = TreebankWordTokenizer()

  def embed(text):
    # Collect the embedding of every known token of a single sample.
    found = []
    for word in treebank.tokenize(text):
      try:
        vector = word_vectors[word]
      except KeyError:
        continue  # No matching token in the Google w2v vocab
      found.append(vector)
    return found

  vectorized_data = [embed(sample) for sample in dataset]
  print(len(vectorized_data))
  return vectorized_data

In [16]:
# The reviews were loaded without a header row, so the text sits in column 0.
vectorized_data = tokenize_and_vectorize(test_data[0])

4000


In [0]:
# Number of word vectors per sample after padding/truncation (see pad_trunc).
maxlen = 300
# NOTE(review): batch_size, filters, kernel_size, hidden_dims and epochs are
# not referenced by any cell visible in this notebook; presumably they were
# carried over from the training notebook - TODO confirm before removing.
batch_size = 32
# Length of one word vector; used when reshaping X_test. The embedding file
# name (GoogleNews-vectors-negative300) suggests 300-dimensional vectors.
embedding_dims = 300
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

In [0]:
def pad_trunc(data, maxlen, vector_dim=None):
  """
  For a given dataset pad with zero vectors or truncate to maxlen.

  Unlike the previous version this never modifies the caller's samples
  (padding used to be appended to the input lists in place) and it no longer
  fails when the first sample is empty or when ``data`` itself is empty.

  Args:
    data: sequence of samples; each sample is a (possibly empty) sequence of
      word vectors that all share the same length.
    maxlen: number of vectors every returned sample must contain.
    vector_dim: length of a single word vector. When None (the default) it is
      inferred from the first vector of the first non-empty sample.

  Returns:
    A new list holding one new list per sample, each exactly ``maxlen``
    entries long: longer samples are cut, shorter ones are filled up with
    zero vectors at the end.

  Raises:
    ValueError: if a sample needs padding but the vector length can neither
      be inferred (every sample is empty) nor was passed as ``vector_dim``.
  """
  if vector_dim is None:
    # Look for the first sample that actually contains a vector; data[0] may
    # be empty when none of its tokens were in the embedding vocabulary.
    for sample in data:
      if len(sample) > 0:
        vector_dim = len(sample[0])
        break

  # One zero vector is shared by every padding slot (as before); allocating a
  # fresh one per slot would cost a lot of memory on large datasets.
  zero_vector = None if vector_dim is None else [0.0] * vector_dim

  new_data = []
  for sample in data:
    # Slicing + list() copies, so the input sample is left untouched.
    fitted = list(sample[:maxlen])
    missing = maxlen - len(fitted)
    if missing > 0:
      if zero_vector is None:
        raise ValueError(
            "cannot pad: every sample is empty and vector_dim was not given")
      fitted.extend([zero_vector] * missing)
    new_data.append(fitted)
  return new_data

In [0]:
# Bring every review to exactly maxlen word vectors (zero-padded or cut).
X_test = pad_trunc(vectorized_data, maxlen)

In [0]:
# Convert the nested lists into one array laid out as
# (number of samples, maxlen, embedding_dims) before handing it to the model.
X_test = np.asarray(X_test).reshape((len(X_test), maxlen, embedding_dims))

In [0]:
# Sequential.predict_classes() has been removed from current TensorFlow/Keras
# releases. For a single-unit output (the (N, 1) int32 result shown below
# indicates one) it simply thresholded the predicted probability at 0.5, so
# do the same explicitly; this also works on the older Keras used originally.
predications = (model.predict(X_test) > 0.5).astype("int32")

In [27]:
# Show the predicted class labels (one row per review).
predications

array([[1],
       [1],
       [1],
       ...,
       [0],
       [1],
       [1]], dtype=int32)

**Compare with actual data to find accuracy**

In [0]:
target_data_url = drive_url+"mixed_test_targets.txt"
# One integer label per line. sep='\n' is rejected by current pandas releases
# (ValueError); the default comma separator parses this single-column file
# into the same integer column 0 (assumes the lines contain no commas).
target_data = pd.read_csv(target_data_url, header=None)

In [34]:
# Show the targets. NOTE(review): the captured output already contains the
# 'predications' column, which is only added by a later cell, so this cell
# was last run out of order; on a fresh run only column 0 is shown here.
target_data

Unnamed: 0,0,predications
0,0,1
1,0,1
2,0,1
3,1,0
4,0,1
5,1,0
6,1,0
7,0,1
8,0,1
9,0,1


In [0]:
# Store the predictions next to the targets so both can be compared row-wise.
target_data['predications'] = predications

In [0]:
# The model was trained with the opposite of the stored targets (real = 0,
# fake = 1), so a prediction that *differs* from the target in column 0 is a
# correct one. Count those rows with one vectorised comparison instead of a
# row-by-row loop.
count = int((target_data['predications'] != target_data[0]).sum())
# Divide by the real number of rows rather than a hard-coded 4000 so the
# result stays correct when the test set changes size.
accuracy = count/len(target_data)

In [36]:
# Fraction of test reviews classified correctly.
accuracy

0.92425