<a href="https://colab.research.google.com/github/XinyueZ/flutter-web-profanity-check/blob/master/machine_learning/profanity_check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [0]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder

assert tf.__version__.startswith('2')

import itertools

import re
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
layers = keras.layers

In [34]:
tf.__version__

'2.2.0'

In [0]:
#Download dataset
URL = "https://dl.dropbox.com/s/ewpit86gekpiwk5/hate_dirty_peech_labeled_data.tsv"
path = tf.keras.utils.get_file(URL.split('/')[-1], URL)

In [0]:
# Convert the data to a Pandas data frame
data = pd.read_csv(path, sep="\t")

In [37]:
# Shuffle the data
data = data.sample(frac=1)

# Print the first first five rows as default
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
3762,3877,3,0,1,2,2,@KaylahPrettyMom lmao girlllaaaa I don't like ...
2103,2153,3,0,3,0,1,This be me &#1041204;&#1041204;&#1041204; Cu...
21729,22200,4,0,4,0,1,These bitches really be out here on backpage h...
21195,21662,3,1,2,0,1,Stupid bitches.
22637,23122,3,0,3,0,1,What's this bitch look like y'all &#8220;@pear...


In [38]:
# Clean data
data = data[pd.notnull(data['class'])]
data = data[pd.notnull(data['tweet'])]
data = data.drop(data.columns[0], axis=1) 
data = data.drop(columns=['count', 'hate_speech', 'offensive_language', 'neither']) 
 
data["tweet"] = data["tweet"].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem)) 

data["label"] = data["class"]

# class = class label for majority of CF users. 0 - hate speech 1 - offensive language 2 - neither
data['class'].astype(str)
data = data.replace({'class': {0: "hate speech", 1: "offensive language", 2: "neither"}})

# Print the first first five rows as default
data.head()

Unnamed: 0,class,tweet,label
3762,neither,lmao girlllaaaa i dont like em chunky anymore...,2
2103,offensive language,this be me 104120410412041041204 cuh went fr...,1
21729,offensive language,these bitches really be out here on backpage h...,1
21195,offensive language,stupid bitches,1
22637,offensive language,whats this bitch look like yall 8220 add me on...,1


In [39]:
# Split data into train and test
train_size = int(len(data) * .95)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 23537
Test size: 1239


In [0]:
# Features to train
tweet_train = data['tweet'][:train_size]
class_train = data['class'][:train_size]

# Labels (class types)
labels_train = data['label'][:train_size]

In [0]:
# Features for test
tweet_test = data['tweet'][train_size:]
class_test = data['class'][train_size:]

# Labels for test
labels_test = data['label'][train_size:]

In [0]:
vocab_size = 15000

In [0]:
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(tweet_train)

In [0]:
bow_train = tokenize.texts_to_matrix(tweet_train)
bow_test = tokenize.texts_to_matrix(tweet_test)

In [0]:
encoder = LabelEncoder()
encoder.fit(class_train)
class_train = encoder.transform(class_train)
class_test = encoder.transform(class_test)
num_classes = np.max(class_train) + 1

In [0]:
class_train = keras.utils.to_categorical(class_train, num_classes)
class_test = keras.utils.to_categorical(class_test, num_classes)

In [0]:
bow_inputs = layers.Input(shape=(vocab_size,))
class_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, class_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, class_inputs], outputs=predictions)

In [0]:
loss = "mse"
optimizer = tf.keras.optimizers.Adam()

In [49]:
wide_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
wide_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 15000)]      0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 15003)        0           input_4[0][0]                    
                                                                 input_5[0][0]                    
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 256)          3841024     concatenate_2[0][0]        

In [0]:
train_embed = tokenize.texts_to_sequences(tweet_train)
test_embed = tokenize.texts_to_sequences(tweet_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(train_embed, maxlen=max_seq_length, padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(test_embed, maxlen=max_seq_length, padding="post")

In [51]:
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
deep_model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 170)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 170, 8)            120000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 1360)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 1361      
Total params: 121,361
Trainable params: 121,361
Non-trainable params: 0
_________________________________________________________________


In [0]:
deep_model.compile(loss=loss, optimizer=optimizer,  metrics=['accuracy'])

In [53]:
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
combined_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
combined_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 15000)]      0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 170)]        0                                            
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 15003)        0           input_4[0][0]                    
                                                                 input_5[0][0]              

### Train the model

In [0]:
epochs = 10
batch_size = 128

In [55]:
history = combined_model.fit([bow_train, class_train] + [train_embed], labels_train, epochs=epochs, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [56]:
combined_model.evaluate([bow_test, class_test] + [test_embed], labels_test, batch_size=batch_size)



[0.0019928913097828627, 0.8167877197265625]

In [0]:
predictions = combined_model.predict([bow_test, class_test] + [test_embed])

In [58]:
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(tweet_test.iloc[i])
    print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
    diff += abs(val[0] - labels_test.iloc[i])

8220  coolin my nig   voice8221 uhhee   laugh
Predicted:  1.0074544 Actual:  1 

lookin at u hoes federalboobieinspector 
Predicted:  0.9991706 Actual:  1 

 im a lil nigga wait till you see nigga you aint ready im from chicago boy i aint no pussy from the burbs just wait
Predicted:  0.9867027 Actual:  1 

this does create a slippery slope tho  players gonna have to watch they mouth too  no tolerance for ignorance anymore
Predicted:  1.9856583 Actual:  2 

 bitch i juss askd yu uh question yo ole choppa bullet head ahh
Predicted:  0.97320986 Actual:  1 

8220 8220 lmao nvm8221 thats right bitch8221 go drink some warm milk and go to bed
Predicted:  1.0005717 Actual:  1 

 straight pussy
Predicted:  0.9939398 Actual:  1 

aint shit without my scale bitch i live that bag life
Predicted:  0.97669524 Actual:  1 

    lmao nigga yo avi so trash my phone wont even load that shit
Predicted:  0.98152834 Actual:  1 

  main bitches saying i hate a send me a pic ass nigga be answering every singl

In [59]:
print('Average prediction difference: ', diff / num_predictions)

Average prediction difference:  0.014657113933935761


### Generate TF model

In [60]:
saved_model_dir = '/content/profanity_check_tuning'
tf.saved_model.save(combined_model, saved_model_dir)

INFO:tensorflow:Assets written to: /content/profanity_check_tuning/assets
