# Connect to google drive

We do this step for read the data we have

In [1]:
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


## Process the data

In [2]:
import pandas as pd
df = pd.read_csv("/content/gdrive/MyDrive/SMS Classifier/spam.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df = df[['v1', 'v2']]

df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['v1'].value_counts()

Unnamed: 0_level_0,count
v1,Unnamed: 1_level_1
ham,4825
spam,747


In [5]:
# check null data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### Change the label from text to numbers

we need to do it because neural networks can't understand text

In [6]:
#make label looks like label encoding

mapping_category = {
    'ham' : 0,
    'spam' : 1
}

df['v1'] = df['v1'].apply(lambda x: mapping_category.get(x, -1))

df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Remove punctuation from text

In [7]:
## clean data

import string
import nltk

# make sure to download  nltk punkt
nltk.download('punkt')

# remove duplicate data
df = df.drop_duplicates(keep='first')

# Function to remove punuacttion
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Apply the function to the 'text' column
#to remove punctuation (!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`.)
df['v2'] = df['v2'].apply(remove_punctuation)

# lowercase
df['v2'] = df['v2'].str.lower()

# Display the DataFrame
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['v2'] = df['v2'].apply(remove_punctuation)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['v2'] = df['v2'].str.lower()


Unnamed: 0,v1,v2
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


### Just for fun, I do some "analyze" the data

#### Get how many unique words on our data

In [8]:
## to find unique word in dataset

import pandas as pd
from nltk.tokenize import word_tokenize
from collections import Counter

# Combine all sentences into one long text
all_text = ' '.join(df['v2'])

#Tokenize long text into words
words = word_tokenize(all_text.lower())

# Count the number of unique words
unique_words = set(words)
total_unique_words = len(unique_words)

print(f"Total unique words: {total_unique_words}")

Total unique words: 9558


#### Get the longest word in our data

In [9]:
# Find the longest word
longest_word = max(words, key=len)

print(longest_word)

88039skilgmetscs087147403231winawkage16å£150perwksub


#### Get the longest SMS in the Data

In [10]:
# Function to count the number of words in a sentence
def count_words(sentence):
    return len(word_tokenize(sentence))

#Initialize variable for longest sentence
longest_sentence = ""
max_word_count = 0

#Loop through each row in a DataFrame
for sentence in df['v2']:
#Tokenize sentences into words and count the number of words
    word_count = count_words(sentence)
    # If the number of words in this sentence is greater than the longest sentence
    if word_count > max_word_count:
        longest_sentence = sentence
        max_word_count = word_count

print(f"The longest sentence is: \"{longest_sentence}\"")
print(f"Number of words in the longest sentence: {max_word_count}")

The longest sentence is: "for me the love should start with attractioni should feel that i need her every time around meshe should be the first thing which comes in my thoughtsi would start the day and end it with hershe should be there every time i dreamlove will be then when my every breath has her namemy life should happen around hermy life will be named to heri would cry for herwill give all my happiness and take all her sorrowsi will be ready to fight with anyone for heri will be in love when i will be doing the craziest things for herlove will be when i dont have to proove anyone that my girl is the most beautiful lady on the whole planeti will always be singing praises for herlove will be when i start up making chicken curry and end up makiing sambarlife will be the most beautiful thenwill get every morning and thank god for the day because she is with mei would like to say a lotwill tell later"
Number of words in the longest sentence: 171


## Prepare the model

### Tokenize our data

I use my own tokenizer, you can use any pre-trained tokenizer like Glove for doing this, skip this session if you want to see how to use GLOVE for this task

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Tokenize text data
max_words_name = 10000  # Maximum number of words to tokenize for name column
max_len_name = 200  # Maximum length of sequences for name column

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['v2'], df['v1'], test_size=0.2, random_state=42)

# Tokenize name for training
name_tokenizer = Tokenizer(num_words=max_words_name, oov_token='<OOV>')
name_tokenizer.fit_on_texts(X_train) ## this one change into the name of train data
name_sequences_train = name_tokenizer.texts_to_sequences(X_train) # this one too
name_padded_train = pad_sequences(name_sequences_train, maxlen=max_len_name, padding='post') # ini data buat training nanti

# Tokenize data for testing
name_sequences_test = name_tokenizer.texts_to_sequences(X_test)
name_padded_test = pad_sequences(name_sequences_test, maxlen=max_len_name, padding='post')

# need it if you want to retrain the model with glove (read documentation)
word_index = name_tokenizer.word_index
VOCAB_SIZE = len(word_index)

# Display tokenized data
# print("Tokenized Name Sequences:")
# print(name_padded)

In [12]:
## See the tokenize data
print(name_padded_train[0])

[ 426  151   12 1427 3323    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

#### With training own text embedding.

##### Create the model and train it

because our data is good, I only use 1 layer bidirectional LSTM, if you want to use 2 layer of it, you can change it into :

lstm_output_1 = Bidirectional(LSTM(64, return_sequences=True))(desc_embedding)

lstm_output_2 = Bidirectional(LSTM(32))(lstm_output_2)

and in this case we only have 2 label so in the compile we define the loss as 'binary_crossentropy', if you data have more than 2 label, change it into 'categorical_crossentropy'

In [26]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Concatenate

# Define input layer for description
desc_input = Input(shape=(max_len_name,), name='desc_input')

# Define embedding layer for description
desc_embedding = Embedding(input_dim=max_words_name, output_dim=64, input_length=max_len_name)(desc_input)

# Bidirectional LSTM layer
lstm_output = Bidirectional(LSTM(64))(desc_embedding)

# Output layer
output = Dense(1, activation='sigmoid')(lstm_output)

# Create the model
model = Model(inputs=desc_input, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(name_padded_train, y_train, epochs=10, batch_size=32, validation_data=(name_padded_test, y_test))

Epoch 1/10




[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.9018 - loss: 0.3109 - val_accuracy: 0.9778 - val_loss: 0.0726
Epoch 2/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9859 - loss: 0.0554 - val_accuracy: 0.9807 - val_loss: 0.0573
Epoch 3/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9947 - loss: 0.0237 - val_accuracy: 0.9816 - val_loss: 0.0534
Epoch 4/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9976 - loss: 0.0075 - val_accuracy: 0.9729 - val_loss: 0.0847
Epoch 5/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.9988 - loss: 0.0051 - val_accuracy: 0.9845 - val_loss: 0.0512
Epoch 6/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9996 - loss: 0.0020 - val_accuracy: 0.9874 - val_loss: 0.0629
Epoch 7/10
[1m130/130[0m [32m━

<keras.src.callbacks.history.History at 0x7cedec193220>

##### Save our tokenizer, model architecture and weights

In [None]:
import pickle

# save the tokenizer
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(name_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

model_json = model.to_json()
with open('model_architecture.json', 'w') as json_file:
    json_file.write(model_json)

# Save the model weights
model.save_weights('model.weights.h5')

##### Test the model

In [28]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

max_len_desc = 200

# Load the model architecture from JSON file
with open('/content/gdrive/MyDrive/SMS Classifier/sms_classifier/model_architecture.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# Load the model
loaded_model = model_from_json(loaded_model_json)

# Load the model weights
loaded_model.load_weights('/content/gdrive/MyDrive/SMS Classifier/sms_classifier/model.weights.h5')

new_text = ["Your number won 5000 dollar, go to our website now", "Keep your clothes clean from stains when you're cooking"]

# Load the saved tokenizer
with open('/content/gdrive/MyDrive/SMS Classifier/sms_classifier/tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

for text in new_text:
  # Tokenize the new text using the loaded tokenizer
  new_text_sequence = loaded_tokenizer.texts_to_sequences([text])
  new_text_padded = pad_sequences(new_text_sequence, maxlen=max_len_desc, padding='post')

  # mapping the predicted
  class_category = {
      0 : 'Ham',
      1 : 'Spam'
  }

  # Make predictions on the new text
  predictions = loaded_model.predict(new_text_padded)
  predicted_class = np.round(predictions).astype(int)[0][0]
  predicted_class = class_category[predicted_class]
  print(f'for input : {text} is have class : {predicted_class}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step
for input : Your number won 5000 dollar, go to our website now is have class : Spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
for input : Keep your clothes clean from stains when you're cooking is have class : Ham


#### With using others (like GLOVE) text embedding.

on this case we will use GLOVE embedding, you can download it on https://nlp.stanford.edu/projects/glove/

In [15]:
import numpy as np

# define the path you save the GLOVE text embedding
GLOVE_FILE = '/content/gdrive/MyDrive/SMS Classifier/glove.6B.100d.txt'

# Initialize an empty embeddings index dictionary

GLOVE_EMBEDDINGS = {}

# Read the GloVe word embeddings from the text file and store them in the dictionary
with open(GLOVE_FILE) as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    GLOVE_EMBEDDINGS[word] = coefs

In [16]:
## test the embedding

test_word = 'human'

test_vector = GLOVE_EMBEDDINGS[test_word]

print(f"Vector representation of word {test_word} looks like this:\n\n{test_vector}")

Vector representation of word human looks like this:

[ 3.3864e-01  5.9663e-01  5.3322e-01  3.1404e-01  1.5321e-01  3.1749e-01
 -4.2940e-01 -2.9150e-01 -2.1047e-03 -3.9309e-01 -8.5441e-01 -8.0708e-02
  1.2118e+00  6.9316e-02  8.0613e-03  8.7888e-01  3.1908e-02  5.8655e-01
 -5.4892e-01 -7.8468e-03  1.7327e-01 -2.6693e-01  4.2802e-01  6.6123e-02
  5.1847e-01  7.7226e-01  2.0608e-01 -4.5836e-01  3.5485e-01  7.1547e-01
  6.0855e-01  2.0254e-01 -4.8756e-01  5.7974e-01  8.6728e-02 -5.1852e-01
 -3.7274e-01  1.0014e+00 -2.9259e-01  3.2290e-01 -9.7563e-01 -2.2288e-01
 -2.3335e-01 -2.6891e-01  1.4612e-01  1.2004e-01 -2.0402e-01 -9.4647e-02
 -1.5402e+00 -5.9510e-02  1.0887e+00 -2.4998e-01 -2.5808e-01  1.2798e+00
 -1.2849e-01 -1.4511e+00 -2.4686e-01 -9.5046e-02  1.7425e+00  1.1977e-01
 -1.9206e-01  4.4368e-01 -1.6453e-01 -7.6663e-01  1.1100e+00  4.6748e-01
 -2.4673e-02  4.7179e-03  6.9761e-01 -2.2975e-01  6.4385e-01 -8.9847e-02
  7.8711e-02 -4.1255e-02  5.3239e-01 -3.9945e-01 -4.6565e-01 -2.7601e-

In [17]:
print(f"Each word vector has shape: {test_vector.shape}")

Each word vector has shape: (100,)


In [18]:
EMBEDDING_DIM = 100

# Initialize an empty numpy array with the appropriate size
EMBEDDINGS_MATRIX = np.zeros((VOCAB_SIZE+1, EMBEDDING_DIM))

# Iterate all of the words in the vocabulary and if the vector representation for
# each word exists within GloVe's representations, save it in the EMBEDDINGS_MATRIX array
for word, i in word_index.items():
    embedding_vector = GLOVE_EMBEDDINGS.get(word)
    if embedding_vector is not None:
        EMBEDDINGS_MATRIX[i] = embedding_vector

In [20]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Concatenate

# Define input layer for description
desc_input = Input(shape=(max_len_name,), name='desc_input')

# Define embedding layer for description
## look the difference when we train our own embedding
desc_embedding = Embedding(input_dim=VOCAB_SIZE+1, output_dim=EMBEDDING_DIM, input_length=max_len_name, weights=[EMBEDDINGS_MATRIX], trainable=False)(desc_input)

# Bidirectional LSTM layer
lstm_output = Bidirectional(LSTM(64))(desc_embedding)

# Output layer
output = Dense(1, activation='sigmoid')(lstm_output)

# Create the model
model = Model(inputs=desc_input, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(name_padded_train, y_train, epochs=10, batch_size=32, validation_data=(name_padded_test, y_test))



Epoch 1/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.8476 - loss: 0.3535 - val_accuracy: 0.9584 - val_loss: 0.1263
Epoch 2/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9681 - loss: 0.1010 - val_accuracy: 0.9700 - val_loss: 0.0928
Epoch 3/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9746 - loss: 0.0785 - val_accuracy: 0.9691 - val_loss: 0.0890
Epoch 4/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9806 - loss: 0.0656 - val_accuracy: 0.9662 - val_loss: 0.0841
Epoch 5/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9850 - loss: 0.0551 - val_accuracy: 0.9623 - val_loss: 0.0993
Epoch 6/10
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9857 - loss: 0.0530 - val_accuracy: 0.9632 - val_loss: 0.1041
Epoch 7/10
[1m130/130

<keras.src.callbacks.history.History at 0x7cee636bbf70>

##### Save the model weights and model architecture.

on this case we dont need to save the tokenizer, because it we use same tokenizer.

In [22]:
# save the model architecture
model_json = model.to_json()
with open('model_architecture_glove.json', 'w') as json_file:
    json_file.write(model_json)

# Save the model weights
model.save_weights('model_glove.weights.h5')

##### Test the model

In [25]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import model_from_json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

max_len_desc = 200

# Load the model architecture from JSON file
with open('/content/model_architecture_glove.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# Load the model
loaded_model = model_from_json(loaded_model_json)

# Load the model weights
loaded_model.load_weights('/content/model_glove.weights.h5')

new_text = ["Your number won 5000 dollar, go to our website now", "Keep your clothes clean from stains when you're cooking"]

# Load the saved tokenizer
with open('/content/gdrive/MyDrive/SMS Classifier/sms_classifier/tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

for text in new_text:
  # Tokenize the new text using the loaded tokenizer
  new_text_sequence = loaded_tokenizer.texts_to_sequences([text])
  new_text_padded = pad_sequences(new_text_sequence, maxlen=max_len_desc, padding='post')

  # mapping the predicted
  class_category = {
      0 : 'Ham',
      1 : 'Spam'
  }

  # Make predictions on the new text
  predictions = loaded_model.predict(new_text_padded)
  predicted_class = np.round(predictions).astype(int)[0][0]
  predicted_class = class_category[predicted_class]
  print(f'for input : {text} is have class : {predicted_class}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 240ms/step
for input : Your number won 5000 dollar, go to our website now is have class : Spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
for input : Keep your clothes clean from stains when you're cooking is have class : Ham
