# Load and Predict

In [None]:
import tensorflow as tf

# Load the previously saved model from the local 'hate_detection_model' path.
model = tf.keras.models.load_model('hate_detection_model')

# View the model architecture to confirm it was saved and loaded correctly.
model.summary()

In [None]:
from transformers import BertTokenizer

# Load the pretrained 'bert-base-cased' tokenizer.
# NOTE(review): presumably the same tokenizer the model was trained with - confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prep_data(text):
    """Tokenize `text` and return the input tensors handed to the model.

    The text is encoded with the notebook-level `tokenizer`, truncated or
    padded to exactly 512 tokens (special tokens included), and returned as
    TensorFlow tensors.

    Returns:
        dict with keys 'input_ids' and 'attention_mask', each cast to
        tf.float64.
    """
    encoded = tokenizer.encode_plus(
        text,
        max_length=512,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )
    # The tokenizer returns int32 tensors; both are cast to float64 before
    # being returned.
    return {
        name: tf.cast(encoded[name], tf.float64)
        for name in ('input_ids', 'attention_mask')
    }

In [None]:
# Smoke test: run the model on one hand-written phrase and keep the first
# element of the prediction output.
probs = model.predict(prep_data("stinken folks"))[0]

probs

Read Test Data

In [None]:
import pandas as pd

# Disable column-width truncation so each tweet is displayed in full.
pd.set_option('display.max_colwidth', None)

# Load the tab-separated test tweets and preview the first rows.
dfa = pd.read_table('Data/Tweet_test - Sheet1.tsv')
dfa.head()

In [None]:
import re

Clean Tweets

In [None]:
def clean(text):
  """Normalize a raw tweet down to ASCII letters and spaces.

  Steps, in order:
    1. drop URLs and @mentions while their punctuation is still intact,
    2. delete digits and hyphens (no replacement character),
    3. delete non-ASCII characters,
    4. delete every occurrence of the substring 'user',
    5. replace each remaining non-letter character with a single space.

  URLs and mentions must be handled first: once step 2 or step 5 has run,
  the ':', '/', '?', '@' and digit characters that the URL/mention patterns
  rely on are gone, and the patterns can no longer match the whole token.

  Args:
    text: the tweet text. Non-string values (e.g. the float NaN pandas uses
      for an empty cell) are treated as an empty tweet.

  Returns:
    The cleaned string; '' for non-string input.
  """
  if not isinstance(text, str):
    return ''
  # Remove urls (everything up to the next whitespace, so query strings go too)
  text = re.sub(r'https?://\S+', '', text)
  # Remove mentions
  text = re.sub(r'@[A-Za-z0-9_]+', '', text)
  # Delete digits and hyphens
  text = re.sub(r"[\d-]", '', text)
  # Remove non-ASCII characters
  text = re.sub(r'[^\x00-\x7F]+', '', text)
  # Remove the 'user' token; note this is a plain substring match, so it also
  # strips 'user' from inside longer words.
  text = re.sub('user', '', text)
  # Replace anything that is still not a letter with a space
  text = re.sub("[^a-zA-Z]", ' ', text)
  return text

# Overwrite the Tweets column in place with the cleaned text.
dfa.Tweets = dfa.Tweets.apply(clean)

In [None]:
# Display the frame to inspect the tweets after cleaning.
dfa

Predict Tweets

In [None]:
import numpy as np

In [None]:
# Tokenize every tweet up front. Each prep_data result is assumed to carry a
# leading batch dimension of 1 (one text, return_tensors='tf'), so the per-tweet
# tensors can be concatenated along axis 0.
encoded = [prep_data(tweet) for tweet in dfa['Tweets']]

if encoded:
    # Build one batch and run a single batched predict instead of calling
    # model.predict once per row, which is slow and emits per-call progress
    # output for every tweet.
    batch = {key: tf.concat([item[key] for item in encoded], axis=0)
             for key in ('input_ids', 'attention_mask')}
    # Small batch size keeps memory use modest at sequence length 512.
    probs = model.predict(batch, batch_size=16)
    # Winning class per tweet: flatten each row of the output and take its
    # argmax, which matches np.argmax(probs) on a single-row prediction.
    dfa['Label'] = np.argmax(np.reshape(probs, (len(encoded), -1)), axis=1)
else:
    # No tweets to score: still create the column so later cells find it.
    dfa['Label'] = None

dfa.head()

Save Predictions

In [None]:
# Write the tweets and their predicted labels to disk. The DataFrame index is
# written as the first (unnamed) column because index=False is not passed.
dfa.to_csv('Data/predictions.csv')

In [None]:
# Load the gold (reference) data and preview the first rows.
dft = pd.read_csv('Data/Test_label.csv') # Gold data
dft.head()

In [None]:
# First 15 rows of the predictions frame.
dfa.head(15)

In [None]:
# Last 15 rows of the gold data.
dft.tail(15)

In [None]:
# Last 15 rows of the predictions frame.
dfa.tail(15)