In [None]:
%tensorflow_version 1.x
import os
import tarfile
import shutil
import re

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
!pip install bert-tensorflow
!pip install -q gpt-2-simple
import gpt_2_simple as gpt2
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization




In [None]:
tf.logging.set_verbosity(tf.logging.ERROR)

# Directory where the Estimator keeps its checkpoints and summaries.
OUTPUT_DIR = 'tmp'
# Fixed seed so the train/test split is identical on every run.
RANDOM_SEED = 42

INPUT_COLUMN = 'tag'
DATA_COLUMN = 'synopsis'
LABEL_COLUMN = 'real'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

gpt2.mount_gdrive()

# Real samples; the CSV is expected to already carry the label column
# (it is selected below without being assigned here).
gpt2.copy_file_from_gdrive("bert_gan_real.csv")
real = pd.read_csv('bert_gan_real.csv')

# Fake samples are spread over several files and are all labelled 0.
fakes = []
for i in ["400", "800", "1200", "1600"]:
  gpt2.copy_file_from_gdrive(f"bert_gan_fake{i}.csv")
  fakes.append(pd.read_csv(f"bert_gan_fake{i}.csv"))
fake = pd.concat(fakes)
fake[LABEL_COLUMN] = 0

# Drop incomplete rows *before* casting to str: casting first would turn a
# missing synopsis into the literal text 'nan', which dropna() can no longer
# detect and which would then be trained on as a fake sample.
df = (
    pd.concat([real, fake])[[INPUT_COLUMN, DATA_COLUMN, LABEL_COLUMN]]
    .dropna()
    .astype({DATA_COLUMN: str, LABEL_COLUMN: int})
)

# Seeded and stratified so both splits keep the same real/fake ratio.
train, test = train_test_split(
    df, test_size=0.1, random_state=RANDOM_SEED, stratify=df[LABEL_COLUMN])


def _to_input_example(row):
  """Wraps one dataframe row as a BERT sentence-pair example (tag, synopsis)."""
  return bert.run_classifier.InputExample(
      guid=None,  # Globally unique ID for bookkeeping, unused in this example
      text_a=row[INPUT_COLUMN],
      text_b=row[DATA_COLUMN],
      label=row[LABEL_COLUMN])


# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(_to_input_example, axis=1)
test_InputExamples = test.apply(_to_input_example, axis=1)

# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Builds a FullTokenizer from the vocab file and casing flag of the Hub module."""
  # A throwaway graph keeps the lookup ops out of the default graph.
  scratch_graph = tf.Graph()
  with scratch_graph.as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lower_case = session.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)

tokenizer = create_tokenizer_from_hub_module()

# Sequences are capped at 64 tokens.
MAX_SEQ_LENGTH = 64
# Turn the train and test examples into the InputFeatures that BERT consumes
# (train first, then test).
train_features, test_features = (
    run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    for examples in (train_InputExamples, test_InputExamples))

def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels, is_training=None):
  """Creates a BERT classification model with a single linear output layer.

  Args:
    is_predicting: when true, only predictions are returned (no loss).
    input_ids, input_mask, segment_ids: tensors passed unchanged to the Hub
      module's "tokens" signature.
    labels: integer class ids; one-hot encoded with depth `num_labels`.
      Unused when `is_predicting` is true.
    num_labels: number of output classes.
    is_training: whether to apply dropout. Defaults to `not is_predicting`,
      so existing callers keep dropout for every non-predict call; pass
      False explicitly to get a deterministic forward pass during evaluation.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is true, otherwise
    `(loss, predicted_labels, log_probs)`. `log_probs` is the log-softmax of
    the logits; `predicted_labels` holds one int32 class id per example.
  """
  if is_training is None:
    is_training = not is_predicting

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Task-specific classification layer that is fine-tuned on top of BERT.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting, but it must stay out of inference:
    # applied unconditionally it makes predictions random from call to call.
    if is_training:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the class axis already yields one id per example. The
    # result is deliberately not squeezed: squeezing collapses a batch of one
    # into a scalar, which no longer lines up with the per-example outputs.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    # If we're predicting, we want predicted labels and the log-probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # If we're train/eval, compute the cross-entropy between the predicted
    # distribution and the actual label.
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)


# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    num_labels: number of output classes, forwarded to `create_model`.
    learning_rate: peak learning rate handed to BERT's optimizer.
    num_train_steps: total number of training steps.
    num_warmup_steps: number of learning-rate warmup steps.

  Returns:
    A function with the `(features, labels, mode, params)` signature that
    produces the `EstimatorSpec` for TRAIN, EVAL and PREDICT.
  """

  def metric_fn(label_ids, predicted_labels):
    """Builds the evaluation metric ops from true and predicted class ids."""
    # NOTE: every metric, including "auc", is computed from the hard
    # predicted labels rather than from class scores.
    metric_builders = {
        "eval_accuracy": tf.metrics.accuracy,
        "f1_score": tf.contrib.metrics.f1_score,
        "auc": tf.metrics.auc,
        "precision": tf.metrics.precision,
        "recall": tf.metrics.recall,
        "true_positives": tf.metrics.true_positives,
        "true_negatives": tf.metrics.true_negatives,
        "false_positives": tf.metrics.false_positives,
        "false_negatives": tf.metrics.false_negatives,
    }
    return {name: build(label_ids, predicted_labels)
            for name, build in metric_builders.items()}

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for the Estimator; labels are read from `features`."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

    # PREDICT
    if is_predicting:
      (predicted_labels, log_probs) = create_model(
          is_predicting, input_ids, input_mask, segment_ids, label_ids,
          num_labels)

      predictions = {
          # Log-probabilities, despite the key name; exponentiate to get
          # probabilities.
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # TRAIN and EVAL
    (loss, predicted_labels, _) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids,
        num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # The optimizer (and its gradient graph) is only needed for training,
      # so it is not built when evaluating.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps,
          use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op)

    # Metric ops are only consumed in EVAL, so they are only built here.
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        eval_metric_ops=metric_fn(label_ids, predicted_labels))

  # Return the actual model function in the closure
  return model_fn

# Training hyperparameters.
# These values are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Checkpoint / summary cadence, in steps.
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

# Derive the number of train and warmup steps from the batch size.
steps_per_epoch = len(train_features) / BATCH_SIZE
num_train_steps = int(steps_per_epoch * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Specify the output directory and how often to save checkpoints/summaries.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

model_fn = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={"batch_size": BATCH_SIZE})


def _build_input_fn(features, is_training):
  """Wraps `features` in a BERT input_fn with this notebook's fixed settings."""
  # drop_remainder would have to be True on TPUs; it is left False here.
  return run_classifier.input_fn_builder(
      features=features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=is_training,
      drop_remainder=False)


# Input functions for training and for held-out evaluation.
train_input_fn = _build_input_fn(train_features, is_training=True)
test_input_fn = _build_input_fn(test_features, is_training=False)

def getPrediction(in_sentence_pairs):
  """Classifies (tag, synopsis) pairs as real or fake with the estimator.

  Args:
    in_sentence_pairs: iterable of indexable pairs; item 0 is the tag
      (BERT `text_a`) and item 1 the synopsis (BERT `text_b`).

  Returns:
    A DataFrame with the columns `tag`, `synopsis`, `prob_real` and `label`,
    one row per input pair in input order. `prob_real` is the probability of
    class 1 and `label` is "Fake" or "Real".
  """
  columns = ['tag', 'synopsis', 'prob_real', 'label']
  labels = ["Fake", "Real"]

  # The pairs are walked twice (once to build the examples, once to line them
  # up with the predictions). Materialising them first keeps a generator
  # argument from being exhausted after the first pass.
  pairs = list(in_sentence_pairs)
  if not pairs:
    # Nothing to classify: skip building the graph and loading a checkpoint.
    return pd.DataFrame(columns=columns)

  # guid is unused and label=0 is only a placeholder required by InputExample.
  input_examples = [
      run_classifier.InputExample(guid="", text_a=pair[0], text_b=pair[1], label=0)
      for pair in pairs]
  input_features = run_classifier.convert_examples_to_features(
      input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(
      features=input_features,
      seq_length=MAX_SEQ_LENGTH,
      is_training=False,
      drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)

  rows = [
      (pair[0],
       pair[1],
       # The model emits log-probabilities; index 1 is the "Real" class.
       np.exp(prediction['probabilities'][1]),
       labels[int(prediction['labels'])])
      for pair, prediction in zip(pairs, predictions)]
  return pd.DataFrame(rows, columns=columns)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE: the Estimator resumes from the newest checkpoint in OUTPUT_DIR and
# max_steps is a total step budget, so re-running this cell against an already
# trained directory returns almost immediately (see the recorded timing).
# Remove OUTPUT_DIR first to retrain from scratch.
print('Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!
Training took time  0:00:00.006528


In [None]:
# Score the held-out split; the returned metrics dict is the cell's output.
estimator.evaluate(input_fn=test_input_fn, steps=None)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


{'auc': 0.99701625,
 'eval_accuracy': 0.9988889,
 'f1_score': 0.99465233,
 'false_negatives': 1.0,
 'false_positives': 1.0,
 'global_step': 1518,
 'loss': 0.0053575495,
 'precision': 0.9946524,
 'recall': 0.9946524,
 'true_negatives': 1612.0,
 'true_positives': 186.0}

In [None]:
# Run the classifier on the held-out *real* samples only and list them from
# least to most confidently real.
is_real = test['real'] == 1
real_sent = test.loc[is_real, ['tag', 'synopsis']].values.tolist()
predictions_real = getPrediction(real_sent)
predictions_real.sort_values(by='prob_real')

Unnamed: 0,tag,synopsis,prob_real,label
180,rpg adventure indie,"They came up with the name, story and gameplay...",0.040982,Fake
14,adventure,Using even more combinations of items througho...,0.979608,Real
77,indie,"In this game you are in a Chemistry lab, your ...",0.989638,Real
35,adventure indie,The player will do whatever it takes to avoid...,0.991251,Real
169,indie,How will you fare against the waves of failed ...,0.994333,Real
...,...,...,...,...
52,adventure indie,government hire him -- a dealer of high-end an...,0.999906,Real
95,adventure,Touted as the most romantic event to grace the...,0.999913,Real
184,adventure,It's spring fair time and The Great Cow Race i...,0.999917,Real
15,strategy,Cinemaware's Anthology of classic games gives ...,0.999926,Real


In [None]:
x=["sci-fi[SEP] In the game you're playing as an agent of the company that manufactures the device called the T1. The task is to find out what is going on with the device",
 "sci-fi[SEP] In addition to the game's classic gameplay, the game also offers the ability to create custom levels for any size from a few hundred to a few thousand players. (We will be adding as we add additional content).Included in the game is the ability to create custom levels for any size from a few hundred to a few thousand players.\n\nThe game has many levels, including many levels inspired by classic games such as Age of Mythology, Zeos, etc.\n\nThe game has multiple difficulty settings, from simple to hardcore, which are sometimes customized for each user based upon their feedback.\n\nWe will be adding as we add additional content.\n\nThe game has many levels, including many levels inspired by classic games such as",
 'sci-fi[SEP]                                                                                                                                                      ',
 'sci-fi[SEP]                                                                                                                                                      ',
 'sci-fi[SEP] The game is a mix of classic games, with "tons of new features" which adds an extra challenge.The game has three difficulty settings: easy, normal, hardcore.                                                                                                                   ',
 "sci-fi[SEP] \n\nA very different game set in a sci-fi setting, 'The Wing Commander' is a political simulator set in near future, where you can choose one or more of the 7 alien races and have to manage your military, crafting, logistics, and research capabilities. \n\nThe most powerful alien civilization is the 'Celestial Alliance', which is attacking Earth in order to conquer the solar system and have total control over all human technology.  \n\nExplore and manage your empire.\n\nIn THE WING COMMANDER you play as the commander of a human expedition in an alien spaceship, who is able to monitor and control every aspect of your world",
 "sci-fi[SEP]\n\nInto the heart of the city. You have been chosen by a secret society to hide out in a cave for the next 20 days. You must complete a series of challenges set by the Guardian to gain access to your cave and escape. The Guardian has to solve the puzzles, learn how to pick your way through the city and gather all the Pieces of the Guardian's Key that will allow you to pass the 20 days.\n\nThe game has no save feature",
 'sci-fi[SEP]                                                                                                                                                      ',
 'sci-fi[SEP]                                                                                                                                                      ',
 'sci-fi[SEP] Many hours of gameplay.Frequently Asked Questions:Q: "What\'s the difference between a shooting game and a VR game?"A: "Sniper VR" is a shooter game.You do not need to buy a headset.']

In [None]:
# Each raw sample looks like "<tag>[SEP]<text>". Samples whose text is blank
# are dropped and newlines are removed from the rest.
X = [re.sub('\n', '', i) for i in x if i.partition('[SEP]')[2].strip()]

# Split on the separator instead of fixed offsets: "sci-fi[SEP]" is 11
# characters long, so slicing at 12 cut off the first letter of any text that
# did not start with a space, and it only worked for a 6-character tag.
synopsis = []
for s in X:
  tag, _, text = s.partition('[SEP]')
  synopsis.append([tag, text.strip()])

In [None]:
# Classify the [tag, text] pairs held in `synopsis`.
results = getPrediction(synopsis)

In [None]:
# Show the samples from most to least likely to be judged real.
results.sort_values(by='prob_real', ascending=False)

Unnamed: 0,tag,synopsis,prob_real,label
3,sci-fi,"A very different game set in a sci-fi setting,...",0.998131,Real
4,sci-fi,nto the heart of the city. You have been chose...,0.045445,Fake
5,sci-fi,Many hours of gameplay.Frequently Asked Questi...,0.016247,Fake
1,sci-fi,"In addition to the game's classic gameplay, th...",0.000429,Fake
0,sci-fi,In the game you're playing as an agent of the ...,0.000213,Fake
2,sci-fi,"The game is a mix of classic games, with ""tons...",0.000174,Fake
