In [None]:
!pip install tensorflow-text

In [None]:
!pip install nltk

In [None]:
import nltk

In [None]:
# Fetch the NLTK stop-word corpus; the preprocessing cell below reads the
# English list from it via nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
!pip install contractions

In [None]:
# General definition of preprocessing for test data
import unicodedata
import string
import textblob
import nltk
import contractions
import re

# Start from NLTK's English stop-word list, but take the negation/contrast
# words out of it so they stay in the text and n-grams can capture negation.
stop_words = nltk.corpus.stopwords.words('english')
for kept_word in ('no', 'not', 'but'):
    stop_words.remove(kept_word)

# A plain Porter stemmer is enough for this pipeline.
ps = nltk.porter.PorterStemmer()

class Preprocess():
    """Namespace of text-cleaning helpers applied to tweets before modelling.

    All helpers are static and hold no state; call them on the class, e.g.
    ``Preprocess.simple_text_preprocessor(text)``.
    """

    @staticmethod
    def remove_accented_chars(text):
        """Return ``text`` with accented characters reduced to plain ASCII.

        The text is NFKD-normalised so accents become separate combining
        marks, then every non-ASCII code point is dropped.
        """
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    @staticmethod
    def _strip_noise(document):
        """Replace unwanted characters with spaces and collapse space runs.

        Only ASCII letters, whitespace and the punctuation ``. , ? / ; " '``
        are kept. The letter range is ``a-zA-Z``; the previous ``A-z`` range
        also matched ``[ \\ ] ^ _`` and the backtick, letting them through.
        ``!`` and ``:`` are blanked as before, now by the same pattern.
        """
        document = re.sub(r'[^a-zA-Z.,?/;\"\'\s]', ' ', document)
        # leftovers of HTML "&nbsp;" entities once "&" has been blanked
        document = re.sub(r'nbsp', r'', document)
        return re.sub(' +', ' ', document)

    @staticmethod
    def simple_text_preprocessor(document):
        """Normalise one document for modelling and return it as a string.

        Steps: lower-case, expand contractions, strip unwanted characters,
        drop stop words, Porter-stem the remaining tokens.

        Stop words are removed *before* stemming: ``stop_words`` holds
        unstemmed forms, so stemming first lets stemmed stop words (e.g.
        "was" -> "wa") slip past the filter.
        """
        # lower case
        document = str(document).lower()

        # expand contractions
        document = contractions.fix(document)

        # remove unnecessary characters
        document = Preprocess._strip_noise(document)

        # stopwords removal (on the unstemmed tokens, see docstring)
        kept_words = [word for word in document.split() if word not in stop_words]

        # simple porter stemming
        return ' '.join(ps.stem(word) for word in kept_words)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
import pandas as pd

# Location of the labelled tweet CSV.
DATA_PATH = "/content/data.csv"

df = pd.read_csv(DATA_PATH)
df.head(5)

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [None]:
import numpy as np

In [None]:
def _clean_tweet(raw_tweet):
    """Strip accents from one tweet, then run the full text preprocessor."""
    ascii_tweet = Preprocess.remove_accented_chars(raw_tweet)
    return Preprocess.simple_text_preprocessor(ascii_tweet)


# One vectorised pass over the column replaces the two manual accumulate loops.
# NOTE: this overwrites the raw text in place, so re-run the CSV-loading cell
# before running this cell a second time.
df['tweet'] = df['tweet'].map(_clean_tweet)
df.head(5)

Unnamed: 0,class,tweet
0,2,rt mayasolov woman not complain clean house. a...
1,1,rt mleew boy dat cold...tyga dwn bad cuffin da...
2,1,rt urkindofbrand dawg rt sbabi life ever fuck ...
3,1,rt c g anderson viva base look like tranni
4,1,rt shenikarobert shit hear might true might fa...


In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the label so train and test keep the same class proportions.
# A fixed random_state makes the split (and everything downstream) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'], df['class'], stratify=df['class'], random_state=42
)

In [None]:
# Peek at a few preprocessed training tweets.
X_train.head(4)

17612       rt thatirvn satisfi x fuck bitch, said was, wa
4214              ng thirsti giant alway like "ho, ho, ho"
4730     sonniejohnson wedlock birth lie? latino nd pla...
2503     autyaut miss eat speak miss red bone cook ; ; ...
Name: tweet, dtype: object

In [None]:
# TF-Hub handles for the uncased English BERT preprocessing model and encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)



In [None]:
def get_sentence_embeding(sentences):
    """Return the BERT pooled-output embedding for each input sentence.

    ``sentences`` is passed to ``bert_preprocess``; its result is fed to
    ``bert_encoder`` and the ``'pooled_output'`` entry is returned.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

# Smoke test: embed two sample sentences (the output below has one row per
# sentence).
get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351724, -0.51327276, -0.88845724, ..., -0.74748844,
        -0.7531474 ,  0.91964495],
       [-0.87208337, -0.5054394 , -0.94446665, ..., -0.85847497,
        -0.71745324,  0.8808295 ]], dtype=float32)>

In [None]:
# Three fruits followed by three people, to compare embeddings within and
# across the two groups.
sample_phrases = [
    "banana",
    "grapes",
    "mango",
    "jeff bezos",
    "elon musk",
    "bill gates",
]
e = get_sentence_embeding(sample_phrases)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Similarity between two fruit embeddings: "banana" (e[0]) vs "grapes" (e[1]).
cosine_similarity([e[0]],[e[1]])

array([[0.9911088]], dtype=float32)

In [None]:
# Similarity across groups: "banana" (e[0]) vs "jeff bezos" (e[3]).
cosine_similarity([e[0]],[e[3]])

array([[0.84703803]], dtype=float32)

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [None]:
len(X_train)

18585

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=METRICS)

In [None]:
# Train on the preprocessed tweet strings; tokenisation happens inside the
# model through its BERT preprocessing layer.
model.fit(X_train, y_train, epochs=10, batch_size=50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3a6dbb9dc0>

In [None]:
# Score the held-out split; the result lists the loss followed by the metrics
# given to model.compile.
model.evaluate(X_test, y_test)



[0.0, 0.7703356742858887, 0.9422827363014221, 0.99537593126297]

In [None]:
# Model outputs for the held-out tweets, flattened into a 1-D array.
y_predicted = model.predict(X_test).flatten()



In [None]:
reviews = [
    'what an idiot',
    'Jackies a retard At least I can make a grilled cheese!d',
    'it is shit you are Fuuck',
]

# The model was trained on tweets cleaned with Preprocess, so new text has to
# go through the same cleaning before prediction; feeding raw strings gives
# the model inputs unlike anything it saw during training.
reviews_clean = [
    Preprocess.simple_text_preprocessor(Preprocess.remove_accented_chars(review))
    for review in reviews
]
model.predict(reviews_clean)