<a href="https://colab.research.google.com/github/XinyueZ/flutter-web-profanity-check/blob/master/machine_learning/profanity_check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [0]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder

assert tf.__version__.startswith('2')

import itertools

import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
layers = keras.layers

In [411]:
tf.__version__

'2.2.0'

In [0]:
#Download dataset
URL = "https://dl.dropbox.com/s/ewpit86gekpiwk5/hate_dirty_peech_labeled_data.tsv"
path = tf.keras.utils.get_file(URL.split('/')[-1], URL)

In [0]:
# Convert the data to a Pandas data frame
data = pd.read_csv(path, sep="\t")

In [414]:
# Shuffle the data
data = data.sample(frac=1)

# Print the first first five rows as default
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
22565,23048,3,0,3,0,1,Welp imma hoe again :/ RT @Gladvillian: She a ...
9916,10194,3,0,0,3,2,How is that not a yellow on Ronaldo for full o...
992,1021,3,1,2,0,1,&#128514;&#128514;&#128514;&#128514;&#128514;&...
22716,23201,3,0,3,0,1,When niggas wanna cheat they start an argument...
12865,13191,3,0,2,1,1,Molly's that bitch you love when she comes but...


In [415]:
# Clean data
data = data[pd.notnull(data['class'])]
data = data[pd.notnull(data['tweet'])]
data = data.drop(data.columns[0], axis=1) 
data = data.drop(columns=['count', 'hate_speech', 'offensive_language', 'neither']) 
 
# Print the first first five rows as default
data.head()

Unnamed: 0,class,tweet
22565,1,Welp imma hoe again :/ RT @Gladvillian: She a ...
9916,2,How is that not a yellow on Ronaldo for full o...
992,1,&#128514;&#128514;&#128514;&#128514;&#128514;&...
22716,1,When niggas wanna cheat they start an argument...
12865,1,Molly's that bitch you love when she comes but...


In [416]:
# Split data into train and test
train_size = int(len(data) * .95)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 23537
Test size: 1239


In [0]:
# Features to train
tweet_train = data['tweet'][:train_size]
class_train = data['class'][:train_size]

# Labels (class types)
labels_train = data['class'][:train_size]

In [0]:
# Features for test
tweet_test = data['tweet'][train_size:]
class_test = data['class'][train_size:]

# Labels for test
labels_test = data['class'][train_size:]

In [0]:
vocab_size = 15000

In [0]:
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(tweet_train)

In [0]:
bow_train = tokenize.texts_to_matrix(tweet_train)
bow_test = tokenize.texts_to_matrix(tweet_test)

In [0]:
encoder = LabelEncoder()
encoder.fit(class_train)
class_train = encoder.transform(class_train)
class_test = encoder.transform(class_test)
num_classes = np.max(class_train) + 1

In [0]:
class_train = keras.utils.to_categorical(class_train, num_classes)
class_test = keras.utils.to_categorical(class_test, num_classes)

In [0]:
bow_inputs = layers.Input(shape=(vocab_size,))
class_inputs = layers.Input(shape=(num_classes,))
merged_layer = layers.concatenate([bow_inputs, class_inputs])
merged_layer = layers.Dense(256, activation='relu')(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, class_inputs], outputs=predictions)

In [0]:
loss = "mse"
optimizer = tf.keras.optimizers.Adam()

In [426]:
wide_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
wide_model.summary()

Model: "model_33"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_34 (InputLayer)           [(None, 15000)]      0                                            
__________________________________________________________________________________________________
input_35 (InputLayer)           [(None, 3)]          0                                            
__________________________________________________________________________________________________
concatenate_22 (Concatenate)    (None, 15003)        0           input_34[0][0]                   
                                                                 input_35[0][0]                   
__________________________________________________________________________________________________
dense_44 (Dense)                (None, 256)          3841024     concatenate_22[0][0]      

In [0]:
train_embed = tokenize.texts_to_sequences(tweet_train)
test_embed = tokenize.texts_to_sequences(tweet_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(train_embed, maxlen=max_seq_length, padding="post")
test_embed = keras.preprocessing.sequence.pad_sequences(test_embed, maxlen=max_seq_length, padding="post")

In [428]:
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
deep_model.summary()

Model: "model_34"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_36 (InputLayer)        [(None, 170)]             0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 170, 8)            120000    
_________________________________________________________________
flatten_11 (Flatten)         (None, 1360)              0         
_________________________________________________________________
dense_46 (Dense)             (None, 1)                 1361      
Total params: 121,361
Trainable params: 121,361
Non-trainable params: 0
_________________________________________________________________


In [0]:
deep_model.compile(loss=loss, optimizer=optimizer,  metrics=['accuracy'])

In [430]:
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
combined_model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
combined_model.summary()

Model: "model_35"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_34 (InputLayer)           [(None, 15000)]      0                                            
__________________________________________________________________________________________________
input_35 (InputLayer)           [(None, 3)]          0                                            
__________________________________________________________________________________________________
input_36 (InputLayer)           [(None, 170)]        0                                            
__________________________________________________________________________________________________
concatenate_22 (Concatenate)    (None, 15003)        0           input_34[0][0]                   
                                                                 input_35[0][0]            

### Train the model

In [0]:
epochs = 10
batch_size = 128

In [432]:
history = combined_model.fit([bow_train, class_train] + [train_embed], labels_train, epochs=epochs, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [433]:
combined_model.evaluate([bow_test, class_test] + [test_embed], labels_test, batch_size=batch_size)



[0.002103975275531411, 0.8345440030097961]

In [0]:
predictions = combined_model.predict([bow_test, class_test] + [test_embed])

In [435]:
num_predictions = 40
diff = 0

for i in range(num_predictions):
    val = predictions[i]
    print(tweet_test.iloc[i])
    print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
    diff += abs(val[0] - labels_test.iloc[i])

&#8220;@Dedicated_03: Wassup twitter &#128518; http://t.co/rNeTkFXm7f&#8221; wassup bitch !!
Predicted:  0.9822972 Actual:  1 

@IIXIXIII bruhhh I'm joking... Don't pop trunk on me I'm just a bitch ass niiiieeegggaaa
Predicted:  0.9659867 Actual:  1 

@tyg235 @italian_montana have fun losing tonight bitch
Predicted:  0.9666593 Actual:  1 

RT @misstannaebaby: Niggas all about sex now days. Unlike you bitches, i jus want somebody who gone better me as a young women&#58400;
Predicted:  1.0066032 Actual:  1 

American culture is stuck to the side of the toilet. Graffiti writers in Europe can create better art without any lowlife trash sex jokes.
Predicted:  1.9066012 Actual:  2 

Lmao!! RT @CarmelIoAnthony: NOBODY cleans a house FASTER than a nigga expecting some pussy.
Predicted:  1.0007035 Actual:  1 

Don't bring dat police ass nicca round me
Predicted:  1.008824 Actual:  1 

&#8220;@DevJayAintShit: these hoes ain't loyal &#128557;  https://t.co/hiMKk7fYxB&#8221;&#128557;
Predicted:  0

In [436]:
print('Average prediction difference: ', diff / num_predictions)

Average prediction difference:  0.022747845947742464


### Generate TF model

In [437]:
saved_model_dir = '/content/profanity_check_tuning'
tf.saved_model.save(combined_model, saved_model_dir)

INFO:tensorflow:Assets written to: /content/profanity_check_tuning/assets


INFO:tensorflow:Assets written to: /content/profanity_check_tuning/assets
