In [1]:
import numpy as np
import tensorflow as tf

In [2]:
import tensorflow_hub as hub

In [3]:
import tensorflow_text as text

In [4]:
import pandas as pd

df = pd.read_csv('spam.csv')
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [7]:
747/4825

0.15481865284974095

In [36]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
df.sample(5)

Unnamed: 0,Category,Message,spam
3523,ham,Im sorry bout last nite it wasnt ur fault it ...,0
122,ham,here is my new address -apples&pairs&all that ...,0
877,ham,Are you in castor? You need to see something,0
1301,ham,Those cocksuckers. If it makes you feel better...,0
4199,spam,Want to funk up ur fone with a weekly new tone...,1


In [37]:
df_spam = df[df['Category']=='spam']
df_spam.shape

(747, 3)

In [38]:
df_ham = df[df['Category']=='ham']
df_ham.shape

(4825, 3)

In [41]:
from sklearn.utils import resample

In [42]:
df_ham_downsampled = df_ham.sample(df_spam.shape[0])
df_ham_downsampled.shape

(747, 3)

In [46]:
df_spam.shape[0]

747

In [51]:
spam_upsample = resample(df_spam,
             replace=True,
             n_samples= 4825,
             random_state=42)

print(spam_upsample.shape)

(4825, 3)


In [52]:
spam_upsample

Unnamed: 0,Category,Message,spam
713,spam,08714712388 between 10am-7pm Cost 10p,1
3230,spam,Ur cash-balance is currently 500 pounds - to m...,1
1929,spam,Call from 08702490080 - tells u 2 call 0906635...,1
738,spam,Hi. Customer Loyalty Offer:The NEW Nokia6650 M...,1
505,spam,#ERROR!,1
...,...,...,...
4248,spam,Text PASS to 69669 to collect your polyphonic ...,1
3675,spam,You have won a Nokia 7250i. This is what you g...,1
3620,spam,8007 25p 4 Alfie Moon's Children in Need song ...,1
3501,spam,Dorothy@kiefer.com (Bank of Granite issues Str...,1


In [53]:
df_balanced = pd.concat([df_ham, spam_upsample])
df_balanced.shape

(9650, 3)

In [54]:
df_balanced['Category'].value_counts()

spam    4825
ham     4825
Name: Category, dtype: int64

In [55]:
df_balanced['spam']=df_balanced['Category'].apply(lambda x: 1 if x=='spam' else 0)
df_balanced.sample(5)

Unnamed: 0,Category,Message,spam
1299,ham,Your daily text from me – a favour this time,0
517,spam,Boltblue tones for 150p Reply POLY# or MONO# e...,1
4169,spam,Congrats! Nokia 3650 video camera phone is you...,1
2823,spam,ROMCAPspam Everyone around should be respondin...,1
3787,ham,Wat r u doing?,0


In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_balanced['Message'],df_balanced['spam'], stratify=df_balanced['spam'])
X_train.head()

2583    3 FREE TAROT TEXTS! Find out about your love l...
5323                           Aah bless! How's your arm?
2632    URGENT! Your mobile No 077xxx WON a £2,000 Bon...
5001    Well its not like you actually called someone ...
3750    Dear Voucher Holder 2 claim your 1st class air...
Name: Message, dtype: object

# Import BERT model and get embeding vectors for few sample statements

In [57]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [58]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

In [59]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [60]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [61]:
model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2b68a3c0580>

In [62]:
model.evaluate(X_test, y_test)



[0.17401160299777985,
 0.9519270658493042,
 0.9438567757606506,
 0.9610604643821716]

In [63]:
y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()



In [64]:
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

array([1, 0, 0, ..., 1, 1, 1])

In [65]:
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, y_predicted)
cm

array([[1137,   69],
       [  47, 1160]], dtype=int64)

In [66]:
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted

array([1, 0, 0, ..., 1, 1, 1])