In [16]:
#Spam Classification using BERT

In [17]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [18]:
import pandas as pd

# Read the labelled SMS messages from disk and preview the first rows.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# Per-class summary of the messages (count, unique, top, freq).
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [20]:
# Use downsampling to handle the imbalanced dataset

In [21]:
# Class balance of the raw data: number of rows per label.
df.loc[:, 'Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [22]:
# Keep only the rows whose label is 'spam' and report how many there are.
df_spam = df.loc[df['Category'].eq('spam')]
df_spam.shape

(747, 2)

In [23]:
# Keep only the rows whose label is 'ham' and report how many there are.
df_ham = df.loc[df['Category'].eq('ham')]
df_ham.shape

(4825, 2)

In [24]:
# Randomly keep as many ham rows as there are spam rows so both classes end
# up the same size. The fixed seed makes the sample -- and therefore every
# downstream split and metric -- reproducible after a kernel restart.
df_ham_downsampled = df_ham.sample(n=len(df_spam), random_state=42)
df_ham_downsampled.shape

(747, 2)

In [25]:
# Stack the spam rows on top of the downsampled ham rows; the original row
# indices are kept as-is.
df_balanced = pd.concat(
    (df_spam, df_ham_downsampled),
    axis=0,
)

In [26]:
# Sanity check: the row count should equal the spam rows plus the sampled ham rows.
df_balanced.shape

(1494, 2)

In [27]:
# Confirm that both classes now have the same number of rows.
df_balanced.loc[:, 'Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
spam,747
ham,747


In [28]:
# Eyeball five random rows of the balanced frame (unseeded, so the rows shown
# differ from run to run).
df_balanced.sample(5)

Unnamed: 0,Category,Message
1105,spam,Message Important information for O2 user. Tod...
5549,ham,"You know, wot people wear. T shirts, jumpers, ..."
3092,spam,LORD OF THE RINGS:RETURN OF THE KING in store ...
4646,ham,Are you planning to come chennai?
924,ham,She went to attend another two rounds today..b...


In [29]:
# Encode the label as an integer target: 1 for 'spam', 0 for anything else.
df_balanced['spam'] = (df_balanced['Category'] == 'spam').astype('int64')

In [30]:

# Keep just the text and the numeric target; the string label is no longer needed.
df_balanced = df_balanced.loc[:, ['Message', 'spam']]
df_balanced.head()

Unnamed: 0,Message,spam
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
5,FreeMsg Hey there darling it's been 3 week's n...,1
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1
11,"SIX chances to win CASH! From 100 to 20,000 po...",1


In [31]:
from sklearn.model_selection import train_test_split

# Hold out a test set, stratified on the target so the train and test sets
# keep the same spam/ham proportions. random_state pins the shuffle so the
# split is reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    df_balanced['Message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [32]:
# Peek at the first few training messages.
x_train.head()

Unnamed: 0,Message
4903,* FREE* POLYPHONIC RINGTONE Text SUPER to 8713...
4367,Mm yes dear look how i am hugging you both. :-P
2742,I don't know u and u don't know me. Send CHAT ...
3316,FREE MESSAGE Activate your 500 FREE Text Messa...
4258,important information 4 orange user . today is...


In [33]:
# Number of messages in the training split.
x_train.shape

(1120,)

In [34]:
# Number of messages in the held-out test split.
x_test.shape

(374,)

In [39]:
# Text -> token-id preprocessing layer for uncased English BERT, from TF Hub.
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
bert_preprocess_model = hub.KerasLayer(preprocess_url)

# Matching BERT encoder layer, from TF Hub.
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
bert_model = hub.KerasLayer(encoder_url)

In [40]:

# Run two sample sentences through the preprocessing layer and list the
# tensors it produces for the encoder.
text_test = [
    'nice movie indeed',
    'I love python programming',
]
text_preprocessed = bert_preprocess_model(text_test)
text_preprocessed.keys()

dict_keys(['input_type_ids', 'input_mask', 'input_word_ids'])

In [41]:

def get_sentence_embeding(sentences):
    """Return the BERT pooled embedding for each input sentence.

    Args:
        sentences: A list (batch) of raw text strings.

    Returns:
        The encoder's ``pooled_output`` tensor with one row per sentence;
        for the encoder loaded in this notebook the shape is
        ``(len(sentences), 768)``.
    """
    # Use the hub layers this notebook actually defines
    # (bert_preprocess_model / bert_model) so the function works on a
    # fresh kernel instead of relying on names from deleted cells.
    encoder_inputs = bert_preprocess_model(sentences)
    return bert_model(encoder_inputs)['pooled_output']

# Smoke test: embed two short strings and display the resulting tensor.
get_sentence_embeding(["500$ discount. hurry up", "Bhavin"])


<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.8435169 , -0.5132726 , -0.8884571 , ..., -0.74748844,
        -0.7531473 ,  0.91964495],
       [-0.73148924, -0.24430393,  0.39650956, ...,  0.37817872,
        -0.491372  ,  0.677023  ]], dtype=float32)>

In [42]:
# Embed three fruit names followed by three people's names; rows of `e`
# follow the order of this list.
e = get_sentence_embeding(
    ["banana", "grapes", "mango", "jeff bezos", "elon musk", "bill gates"]
)

In [None]:
def build_model(x_train=None):
  """Build a binary spam classifier on top of the BERT encoder.

  The graph is: string input -> BERT preprocessing -> BERT encoder ->
  dropout(0.1) on the pooled output -> single sigmoid unit.

  Args:
    x_train: Accepted only so existing ``build_model(x_train)`` calls keep
      working. It is not wired into the graph: a Keras functional model
      needs a symbolic ``Input`` tensor, not the training data itself.
      The data is supplied later when fitting the model.

  Returns:
    An uncompiled ``tf.keras.Model`` mapping a batch of raw strings to a
    spam probability per string.
  """
  # Symbolic placeholder for a batch of raw text strings (scalar per example).
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
  # Use the hub layers this notebook defines (bert_preprocess_model /
  # bert_model) so the function works on a fresh kernel.
  preprocessed_text = bert_preprocess_model(text_input)
  outputs = bert_model(preprocessed_text)
  l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
  l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
  model = tf.keras.Model(inputs=[text_input], outputs=[l])
  return model

In [None]:
# Instantiate the classifier; the result is kept in `model` for later cells.
model = build_model(x_train)