## Install and Import needed packages

In [None]:
!pip install anvil-uplink
!pip install tensorflow-text

In [None]:
import numpy as np
import pandas as pd

import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_text as tf_text

import matplotlib.pyplot as plt

## Functions

In [None]:
def plot_graphs(history, metric):
  """Plot one training metric and its validation counterpart per epoch.

  Args:
    history: Object whose `.history` mapping holds one list of values per
      metric name (e.g. the return value of `model.fit`).
    metric: Name of the training metric; the validation series is looked up
      under the same name prefixed with 'val_'.
  """
  val_metric = 'val_' + metric
  curves = history.history
  plt.plot(curves[metric])
  plt.plot(curves[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])

In [None]:
 def get_data(data_path="./labeled_data.csv.zip", random_state=None):
     """Load the labelled tweets CSV and return its rows in shuffled order.

     Args:
         data_path: Path of the CSV file to read. Defaults to the archive
             this notebook has always used.
         random_state: Optional seed forwarded to `DataFrame.sample`. Pass an
             int to get the same shuffle on every run; the default None
             keeps the previous behaviour (a different order each call).

     Returns:
         A DataFrame whose first CSV column was consumed as the index on
         read, with rows shuffled and the index reset to 0..n-1.
     """
     df = pd.read_csv(data_path, index_col=0)
     # frac=1 samples every row exactly once, i.e. a pure shuffle.
     return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
def get_summary(df):
    """Compute simple word statistics for the `tweet` column of `df`.

    Args:
        df: DataFrame with a "tweet" column of strings.

    Returns:
        A tuple `(fact, description)` where `fact` is a dict with the keys
        "TotalCount" (number of tweets), "TotalWords" (number of lower-cased
        tokens), "TotalUniqueWords" (number of distinct tokens) and
        "MeanWordsPerTweet", and `description` is `df.describe()`.
    """
    content = df["tweet"].values
    word_tok = [word.lower() for item in content for word in nltk.word_tokenize(item)]
    tweet_count = len(content)

    fact = {
        "TotalCount": tweet_count,
        "TotalWords": len(word_tok),
        "TotalUniqueWords": len(set(word_tok)),
        # An empty frame has no tweets to average over; report 0.0 instead of
        # raising ZeroDivisionError.
        "MeanWordsPerTweet": len(word_tok) / tweet_count if tweet_count else 0.0,
    }

    return fact, df.describe()

## Load the Dataset

In [None]:
# Read the labelled tweets (get_data returns them shuffled) and preview them.
raw_tweets = get_data()
raw_tweets.head(n=5)

## Explore the Dataset

In [None]:
# f: word-level statistics dict, s: the describe() table of the frame.
f, s = get_summary(raw_tweets)
print(f, s, sep="\n")

In [None]:
# Row counts per class. `plot` returns the Axes, so keep that in `ax` and set
# the tick labels in a separate call; chaining `.set_xticklabels` would bind
# `ax` to a list of Text objects instead.
ax = raw_tweets.groupby('class').count().plot(
    kind='bar',
    title='Distribution of data',
    legend=True
)
# Trailing semicolon suppresses the list-of-Text repr in the cell output.
ax.set_xticklabels([
    'Hate Speech',
    'Offensive Language',
    'Neither'
], rotation=0);

In [None]:
# Re-derive `class` from the three vote-count columns. The assignments run
# from lowest to highest priority and later ones overwrite earlier ones, so a
# tweet with any hate_speech vote ends up 0, otherwise any offensive_language
# vote makes it 1, otherwise any neither vote makes it 2. Rows where all three
# columns are <= 0 keep their existing `class`. Do not reorder these lines.
raw_tweets.loc[raw_tweets['neither'] > 0, 'class'] = 2
raw_tweets.loc[raw_tweets['offensive_language'] > 0, 'class'] = 1
raw_tweets.loc[raw_tweets['hate_speech'] > 0, 'class'] = 0

In [None]:
# The vote-count columns are no longer needed once `class` has been derived.
# NOTE: this rebinds `raw_tweets`, so re-running the cell raises KeyError.
raw_tweets = raw_tweets.drop(
    columns=['count', 'hate_speech', 'offensive_language', 'neither']
)
raw_tweets.head(n=5)

In [None]:
# Row counts per class. `plot` returns the Axes, so keep that in `ax` and set
# the tick labels in a separate call; chaining `.set_xticklabels` would bind
# `ax` to a list of Text objects instead.
ax = raw_tweets.groupby('class').count().plot(
    kind='bar',
    title='Distribution of data',
    legend=True
)
# Trailing semicolon suppresses the list-of-Text repr in the cell output.
ax.set_xticklabels([
    'Hate Speech',
    'Offensive Language',
    'Neither'
], rotation=0);

## Prepare the DataSet for training

In [None]:
# Keep every row whose class is not 1 (the "Offensive Language" bucket).
not_offensive = raw_tweets['class'] != 1
usable_tweets = raw_tweets.loc[not_offensive]
usable_tweets.head(n=5)

In [None]:
# Split the usable tweets into the "Neither" rows (class 2) and the rest.
mask = usable_tweets['class'] == 2
hate = usable_tweets[~mask]
neither = usable_tweets[mask]

# Balance the two groups by shrinking whichever is larger to the size of the
# smaller one. `DataFrame.sample(n=...)` draws without replacement and raises
# ValueError when n exceeds the number of rows, so n must never be larger than
# the frame being sampled.
balanced_size = min(hate.shape[0], neither.shape[0])
hate = hate.sample(n=balanced_size)
if neither.shape[0] > balanced_size:
    neither = neither.sample(n=balanced_size)

print(neither.shape[0])
print(hate.shape[0])

In [None]:
# Stack both class subsets into one frame with a fresh 0..n-1 index.
frames = [hate, neither]
eqaulized = pd.concat(frames, axis=0, ignore_index=True)
eqaulized

In [None]:
# Row counts per class after balancing. `plot` returns the Axes, so keep that
# in `ax` and set the tick labels in a separate call; chaining
# `.set_xticklabels` would bind `ax` to a list of Text objects instead.
ax = eqaulized.groupby('class').count().plot(
    kind='bar',
    title='Distribution of data',
    legend=True
)
# Trailing semicolon suppresses the list-of-Text repr in the cell output.
ax.set_xticklabels([
    'Hate Speech',
    'Neither'
], rotation=0);

In [None]:
# Convert the labels to a binary target: 0 (hate speech) -> 1, 2 (neither) -> 0.
# The order matters: 0 must be rewritten to 1 before 2 is rewritten to 0,
# otherwise the freshly written 0s would be turned into 1s as well.
# NOTE(review): this mutates `eqaulized` in place and is not safe to re-run —
# a second execution would turn the new 0s into 1s.
eqaulized.loc[eqaulized['class'] == 0, 'class'] = 1
eqaulized.loc[eqaulized['class'] == 2, 'class'] = 0

In [None]:
# `raw_data` is the unbalanced frame and still carries the original labels.
raw_data = usable_tweets

# Random 80% of the balanced rows for training; every remaining row is test.
train_data = eqaulized.sample(frac=0.8)
test_data = eqaulized.drop(index=train_data.index)
train_data

In [None]:
# Plain Python lists of texts and labels for each split.
# NOTE: `raw_tweets` is rebound here from a DataFrame to a list of strings, so
# the earlier cells that treat it as a DataFrame cannot be re-run afterwards.
train_tweets, train_sentiment = [*train_data['tweet']], [*train_data['class']]
test_tweets, test_sentiment = [*test_data['tweet']], [*test_data['class']]
raw_tweets, raw_sentiment = [*raw_data['tweet']], [*raw_data['class']]

In [None]:
# Tensor versions of every text/label list, one pair per split.
train_tweets_ds, train_sentiment_ds = map(
    tf.convert_to_tensor, (train_tweets, train_sentiment)
)
test_tweets_ds, test_sentiment_ds = map(
    tf.convert_to_tensor, (test_tweets, test_sentiment)
)
raw_tweets_ds, raw_sentiment_ds = map(
    tf.convert_to_tensor, (raw_tweets, raw_sentiment)
)

In [None]:
# Dataset.from_tensors wraps its argument as ONE dataset element, so each of
# these datasets yields a single (tweets, labels) pair covering a whole split.
train_ds = tf.data.Dataset.from_tensors((train_tweets, train_sentiment))
test_ds = tf.data.Dataset.from_tensors((test_tweets, test_sentiment))
raw_ds = tf.data.Dataset.from_tensors((raw_tweets, raw_sentiment))

# Pull that single element out and peek at its first tweet and label.
example, label = next(iter(train_ds.take(1)))
print('text: ', example[0].numpy())
print('label: ', label[0].numpy())
train_ds.take(1)

In [None]:
# Input-pipeline settings: shuffle buffer length and batch size.
BUFFER_SIZE = 10_000
BATCH_SIZE = 64

In [None]:
# `train_ds` / `test_ds` are built with `Dataset.from_tensors`, so each holds a
# single element containing the whole split. Calling `.repeat(BATCH_SIZE)` on
# that only replays the full split 64 times per epoch, and `shuffle` on one
# element does nothing. `unbatch()` turns the element back into one
# (tweet, label) example per row so the data can really be shuffled and grouped
# into mini-batches of BATCH_SIZE.
train_dataset = (
    train_ds
    .unbatch()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# The validation stream repeats indefinitely so that a fixed `validation_steps`
# passed to `model.fit` can never run past its end. Always consume it with an
# explicit step count.
test_dataset = (
    test_ds
    .unbatch()
    .batch(BATCH_SIZE)
    .repeat()
    .prefetch(tf.data.AUTOTUNE)
)
train_dataset

## Create Encoder

In [None]:
# Upper bound on the vocabulary kept by the text encoder.
# NOTE(review): 38016 is presumably sized to this corpus — confirm.
VOCAB_SIZE = 38016
encoder = TextVectorization(max_tokens=VOCAB_SIZE)

# Learn the vocabulary from the tweet texts only; the labels are dropped.
encoder.adapt(train_dataset.map(lambda text, _label: text))

In [None]:
# Vocabulary learned by the encoder as a NumPy array, so that arrays of token
# ids can be used to index it and recover the tokens.
vocab = np.array(encoder.get_vocabulary())

In [None]:
# Encode the first three tweets of `example` into arrays of token ids.
encoded_example = encoder(example[:3]).numpy()
encoded_example

In [None]:
# Show each of the three tweets next to the text rebuilt from its token ids.
for n in (0, 1, 2):
  print("Original: ", example[n].numpy())
  print("Round-trip: ", " ".join(vocab[encoded_example[n]]), end="\n\n")

## Building the model

In [None]:
# Text classifier: raw string -> token ids -> embeddings -> two stacked
# bidirectional LSTMs -> dense head with a single sigmoid output.
model = tf.keras.Sequential()
model.add(encoder)
model.add(tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    mask_zero=True,
))
# The first recurrent layer returns the full sequence so that the second one
# has a sequence to consume.
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64, return_sequences=True)
))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# One boolean per layer: does the layer declare support for masking?
print([lyr.supports_masking for lyr in model.layers])

In [None]:
# Run one sample tweet through the model before any training has happened.
sample_text = 'RT @SkylarLogsdon: @viva_based bruh you fucked the shit out of my taste buds with that bitch'
sample_batch = np.array([sample_text])
predictions = model.predict(sample_batch)
print(predictions)

In [None]:
# Predict the same sample again, this time batched with a much longer text.
padding = "the " * 2000
padded_batch = np.array([sample_text, padding])
predictions = model.predict(padded_batch)
print(predictions)

In [None]:
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
model.summary()

## Training the model

In [None]:
# Number of passes over the training dataset, passed to model.fit below.
# The trailing #@param annotation declares a form slider with range 1-10.
epochs = 2  #@param {type: "slider", min: 1, max: 10}

In [None]:
# Train the model; validation draws 30 batches from `test_dataset` each epoch.
history = model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=test_dataset,
    validation_steps=30,
)

In [None]:
# Score the trained model on the held-out split.
test_loss, test_acc = model.evaluate(test_ds)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

In [None]:
def get_string_label(score):
    """Translate a model prediction into its class name.

    Args:
        score: Nested sequence or array of predictions; only the value at
            `score[0][0]` is inspected.

    Returns:
        "Neither" when that value is below 0.5, otherwise "Hate Speech".
    """
    probability = score[0][0]
    return "Neither" if probability < .5 else "Hate Speech"

In [None]:
# Classify the sample tweet with the trained model and print its label.
sample_text = 'RT @SkylarLogsdon: @viva_based bruh you fucked the shit out of my taste buds with that bitch'
sample_batch = np.array([sample_text])
predictions = model.predict(sample_batch)
print(get_string_label(predictions))

In [None]:
# Side-by-side learning curves: accuracy on the left, loss on the right.
plt.figure(figsize=(16, 8))

plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(top=1)  # only the upper limit is fixed

plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(bottom=0)  # only the lower limit is fixed

# User Interaction

In [None]:
# Free-text input (the #@param annotation declares a string form field),
# classified by the trained model and printed as a class name.
tweet = "\"I hate people\""  #@param {type: "string"}
print(get_string_label(model.predict(np.array([tweet]))))