In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [3]:
names = ["class", "message"]

In [4]:
train_file = pd.read_csv(train_file_path, sep='\t', names=names) #name use when there is no column name or no column define
train_file

Unnamed: 0,class,message
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...
...,...,...
4174,ham,just woke up. yeesh its late. but i didn't fal...
4175,ham,what do u reckon as need 2 arrange transport i...
4176,spam,free entry into our £250 weekly competition ju...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...


In [5]:
test_file = pd.read_csv(test_file_path, sep='\t', names=names)
test_file

Unnamed: 0,class,message
0,ham,i am in hospital da. . i will return home in e...
1,ham,"not much, just some textin'. how bout you?"
2,ham,i probably won't eat at all today. i think i'm...
3,ham,don‘t give a flying monkeys wot they think and...
4,ham,who are you seeing?
...,...,...
1387,ham,true dear..i sat to pray evening and felt so.s...
1388,ham,"what will we do in the shower, baby?"
1389,ham,where are you ? what are you doing ? are yuou ...
1390,spam,ur cash-balance is currently 500 pounds - to m...


In [6]:
# 0 represent not spam(ham) and 1 represent spam
train_message = train_file["message"].values.tolist()
train_label = np.array([0 if x=="ham" else 1 for x in train_file['class'].values.tolist()])
test_message = test_file["message"].values.tolist()
test_label = np.array([0 if x=="ham" else 1 for x in test_file['class'].values.tolist()])

In [7]:
train_message[0:5]

['ahhhh...just woken up!had a bad dream about u tho,so i dont like u right now :) i didnt know anything about comedy night but i guess im up for it.',
 'you can never do nothing',
 'now u sound like manky scouse boy steve,like! i is travelling on da bus home.wot has u inmind 4 recreation dis eve?',
 'mum say we wan to go then go... then she can shun bian watch da glass exhibition...',
 'never y lei... i v lazy... got wat? dat day ü send me da url cant work one...']

In [8]:
train_label

array([0, 0, 0, ..., 1, 1, 0])

In [9]:
#making vocabulary dictionary
vocabulary_dict = {}
for messgae in train_message:
  for vocabulary in messgae.split():
    if vocabulary not in vocabulary_dict:
      vocabulary_dict[vocabulary] = 1
    else:
      vocabulary_dict[vocabulary] += 1

In [10]:
vocabulary_dict

{'ahhhh...just': 1,
 'woken': 2,
 'up!had': 1,
 'a': 1052,
 'bad': 18,
 'dream': 9,
 'about': 122,
 'u': 729,
 'tho,so': 1,
 'i': 1633,
 'dont': 108,
 'like': 182,
 'right': 47,
 'now': 181,
 ':)': 43,
 'didnt': 20,
 'know': 176,
 'anything': 47,
 'comedy': 3,
 'night': 64,
 'but': 325,
 'guess': 26,
 'im': 63,
 'up': 199,
 'for': 536,
 'it.': 42,
 'you': 1451,
 'can': 295,
 'never': 35,
 'do': 290,
 'nothing': 20,
 'sound': 6,
 'manky': 1,
 'scouse': 1,
 'boy': 17,
 'steve,like!': 1,
 'is': 643,
 'travelling': 1,
 'on': 400,
 'da': 59,
 'bus': 18,
 'home.wot': 1,
 'has': 95,
 'inmind': 1,
 '4': 194,
 'recreation': 1,
 'dis': 21,
 'eve?': 3,
 'mum': 9,
 'say': 65,
 'we': 258,
 'wan': 35,
 'to': 1670,
 'go': 213,
 'then': 159,
 'go...': 1,
 'she': 108,
 'shun': 1,
 'bian': 1,
 'watch': 25,
 'glass': 1,
 'exhibition...': 1,
 'y': 27,
 'lei...': 9,
 'v': 27,
 'lazy...': 1,
 'got': 181,
 'wat?': 4,
 'dat': 26,
 'day': 93,
 'ü': 121,
 'send': 139,
 'me': 455,
 'url': 1,
 'cant': 48,
 'work'

In [11]:
VOCAB_SIZE = len(vocabulary_dict)
MAX_LENGTH = len(max(train_message, key=lambda p: len(p.split())).split()) #find lengths of maximum length message

In [12]:
VOCAB_SIZE

11330

In [13]:
MAX_LENGTH

171

In [14]:
#one hot encodding to messages and apply post padding
encoded_train_message = [one_hot(d, VOCAB_SIZE) for d in train_message]
padded_train_message = pad_sequences(encoded_train_message, maxlen=MAX_LENGTH, padding='post')
encoded_test_message = [one_hot(d, VOCAB_SIZE) for d in test_message]
padded_test_message = pad_sequences(encoded_test_message, maxlen=MAX_LENGTH, padding='post')

In [15]:
encoded_train_message[1]

[8516, 4413, 3382, 5056, 1212]

In [16]:
# Create a Sequential model
model = Sequential()

# Create an embedding layer that converts integer-encoded word indices into dense vectors
embedding_layer = Embedding(VOCAB_SIZE, 100, input_length=MAX_LENGTH)
model.add(embedding_layer)  # Add the embedding layer to the model

# Flatten the 2D embedded representation into a 1D array
model.add(Flatten())

# Add a fully connected Dense layer with ReLU activation
model.add(Dense(256, activation='relu'))  # New dense layer

# Add a fully connected Dense layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Compile the model with appropriate optimizer, loss function, and metrics
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Define an EarlyStopping callback to prevent overfitting and monitor validation accuracy
monitor = EarlyStopping(monitor='val_acc', patience=7, verbose=1, mode='max', restore_best_weights=True)

# Train the model on the training data, validating on the test data
model.fit(padded_train_message, train_label, validation_data=(padded_test_message, test_label),
          callbacks=[monitor], epochs=50, verbose=2)

Metal device set to: Apple M1
Epoch 1/50


2023-08-22 10:21:21.797193: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


131/131 - 7s - loss: 0.1470 - acc: 0.9495 - val_loss: 0.0557 - val_acc: 0.9806 - 7s/epoch - 53ms/step
Epoch 2/50
131/131 - 5s - loss: 0.0241 - acc: 0.9928 - val_loss: 0.0421 - val_acc: 0.9871 - 5s/epoch - 35ms/step
Epoch 3/50
131/131 - 4s - loss: 0.0055 - acc: 0.9990 - val_loss: 0.0417 - val_acc: 0.9864 - 4s/epoch - 32ms/step
Epoch 4/50
131/131 - 4s - loss: 0.0028 - acc: 0.9998 - val_loss: 0.0464 - val_acc: 0.9856 - 4s/epoch - 31ms/step
Epoch 5/50
131/131 - 4s - loss: 0.0015 - acc: 0.9998 - val_loss: 0.0426 - val_acc: 0.9871 - 4s/epoch - 31ms/step
Epoch 6/50
131/131 - 4s - loss: 7.6547e-04 - acc: 0.9998 - val_loss: 0.0479 - val_acc: 0.9856 - 4s/epoch - 31ms/step
Epoch 7/50
131/131 - 4s - loss: 3.1548e-04 - acc: 1.0000 - val_loss: 0.0418 - val_acc: 0.9864 - 4s/epoch - 34ms/step
Epoch 8/50
131/131 - 4s - loss: 1.7416e-04 - acc: 1.0000 - val_loss: 0.0436 - val_acc: 0.9864 - 4s/epoch - 31ms/step
Epoch 9/50
Restoring model weights from the end of the best epoch: 2.
131/131 - 4s - loss: 1.28

<keras.callbacks.History at 0x15b548550>

In [17]:
def predict_message(pred_text):
  class_dict = {
      0 : "ham",
      1 : "spam",
      }
  encoded_message = [one_hot(pred_text, VOCAB_SIZE)]
  padded_message = pad_sequences(encoded_message, maxlen=MAX_LENGTH, padding='post')
  prediction = [model.predict(padded_message)[0][0], class_dict[np.round(model.predict(padded_message)[0][0])]]
  return prediction

pred_text = "Thank you, for looking my github code"

prediction = predict_message(pred_text)
print(prediction)

[0.004031819, 'ham']


In [20]:
test_messages = ["how are you doing today",
                   "Congratulations! You've won a $1000 gift card. Claim now by clicking the link below:http://example.com/claim-gift-card",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]
test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]

In [21]:
for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    print("Message: ",msg,"\n Actual Answer :",ans,"\n Prediction: ",prediction[1])
    

Message:  how are you doing today 
 Actual Answer : ham 
 Prediction:  ham
Message:  Congratulations! You've won a $1000 gift card. Claim now by clicking the link below:http://example.com/claim-gift-card 
 Actual Answer : spam 
 Prediction:  spam
Message:  i dont want to go. can we try it a different day? available sat 
 Actual Answer : ham 
 Prediction:  ham
Message:  our new mobile video service is live. just install on your phone to start watching. 
 Actual Answer : spam 
 Prediction:  spam
Message:  you have won £1000 cash! call to claim your prize. 
 Actual Answer : spam 
 Prediction:  spam
Message:  i'll bring it tomorrow. don't forget the milk. 
 Actual Answer : ham 
 Prediction:  ham
Message:  wow, is your arm alright. that happened to me one time too 
 Actual Answer : ham 
 Prediction:  ham
