In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow_text as text
import tensorflow as tf
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.optimizers import Adam

In [2]:
# Read the labelled SMS messages from the CSV file and preview the first rows.
data = pd.read_csv(filepath_or_buffer="../data/spam_data.csv")
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Per-category summary statistics of the remaining columns.
data.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [4]:
# Number of rows per category label.
data.loc[:, 'Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
# Spam-to-ham ratio, derived from the data rather than hard-coded counts so the
# figure stays correct if the CSV changes.
class_counts = data['Category'].value_counts()
float(class_counts['spam'] / class_counts['ham'])

0.15481865284974095

In [6]:
# Keep only the rows whose Category is 'spam', then show the subset's shape.
data_spam = data.loc[data['Category'].eq('spam')]
data_spam.shape

(747, 2)

In [7]:
# Keep only the rows whose Category is 'ham', then show the subset's shape.
data_ham = data.loc[data['Category'].eq('ham')]
data_ham.shape

(4825, 2)

In [8]:
# Downsample ham to the same number of rows as spam so the classes are balanced.
# A fixed random_state makes the drawn subset (and everything computed from it)
# identical on every Restart & Run All.
data_ham_downsampled = data_ham.sample(n=data_spam.shape[0], random_state=42)
data_ham_downsampled.shape

(747, 2)

In [9]:
# Stack the downsampled ham rows on top of the spam rows (original index kept).
data_balanced = pd.concat(objs=[data_ham_downsampled, data_spam], axis=0)
data_balanced.shape

(1494, 2)

In [10]:
# Confirm the class counts after balancing.
data_balanced.loc[:, 'Category'].value_counts()

ham     747
spam    747
Name: Category, dtype: int64

In [11]:
# Binary target column: 1 where Category is 'spam', 0 otherwise.
# Vectorised comparison replaces the row-by-row lambda; the result is the same
# int64 column.
data_balanced['spam'] = data_balanced['Category'].eq('spam').astype('int64')

# Seeded preview so the displayed rows are the same on every run.
data_balanced.sample(5, random_state=42)

Unnamed: 0,Category,Message,spam
876,spam,"Shop till u Drop, IS IT YOU, either 10K, 5K, £...",1
1333,ham,It's ok lar. U sleep early too... Nite...,0
311,ham,Today is ACCEPT DAY..U Accept me as? Brother S...,0
5053,ham,"Tick, tick, tick .... Where are you ? I could ...",0
2987,spam,Reply to win £100 weekly! What professional sp...,1


In [17]:
# Features: the message text (kept as a one-column DataFrame); target: the 0/1 spam flag.
X = data_balanced[['Message']]
y = data_balanced['spam']

# 80/20 split, stratified on the label so both splits keep the class balance.
# random_state pins the shuffle so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [18]:
# Report feature/label shapes of both splits, one split per line.
print(
    f"Train shape : {X_train.shape} {y_train.shape}",
    f"Test shape : {X_test.shape} {y_test.shape}",
    sep="\n",
)


Train shape : (1195, 1) (1195,)
Test shape : (299, 1) (299,)


In [47]:
# Load the BERT preprocessing layer and the BERT encoder layer from TF Hub.
bert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
bert_encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

In [49]:
# Build the classifier: string input -> BERT preprocess -> BERT encoder
# -> dropout -> single sigmoid unit.
text_input = Input(shape=(), dtype=tf.string)
preprocessor = bert_preprocess(text_input)
encoder = bert_encoder(preprocessor)

# The classification head is fed from the encoder's 'pooled_output' entry.
dropout = Dropout(rate=0.1)(encoder['pooled_output'])
output = Dense(units=1, activation="sigmoid")(dropout)

model = Model(inputs=text_input, outputs=output)

model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_19 (InputLayer)          [(None,)]            0           []                               
                                                                                                  
 keras_layer_5 (KerasLayer)     {'input_type_ids':   0           ['input_19[0][0]']               
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [54]:
# Configure training: Adam with its default settings, binary cross-entropy
# loss for the single sigmoid output, and accuracy as the reported metric.
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [55]:
# Fit on the training split for a single epoch; keep the returned History object.
history = model.fit(x=X_train, y=y_train, epochs=1)



KeyboardInterrupt: 