In [1]:
#import the libraries needed
import numpy as np      #used for algebra
import pandas as pd     #used for reading csv files
import nltk             #used to apply statistics to language
import tensorflow as tf #used for machine learning

In [2]:
#load the labelled messages used for training and the separate file whose messages are classified at the end
datatrain = pd.read_csv('spam.csv', encoding='latin-1')
datatest = pd.read_csv('output_spam.csv')
#drop the three "Unnamed" columns, then give the two remaining columns readable names
datatrain = (
    datatrain
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": 'classification', "v2": 'message'})
)
#keep direct handles on the label column and the text column
tags = datatrain["classification"]
texts = datatrain["message"]

print(datatrain) #this shows us what is inside the dataset

     classification                                            message
0               ham  Go until jurong point, crazy.. Available only ...
1               ham                      Ok lar... Joking wif u oni...
2              spam  Free entry in 2 a wkly comp to win FA Cup fina...
3               ham  U dun say so early hor... U c already then say...
4               ham  Nah I don't think he goes to usf, he lives aro...
...             ...                                                ...
5567           spam  This is the 2nd time we have tried 2 contact u...
5568            ham              Will ÃŒ_ b going to esplanade fr home?
5569            ham  Pity, * was in mood for that. So...any other s...
5570            ham  The guy did some bitching but I acted like i'd...
5571            ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [3]:
datatrain.describe() #per-column summary: row count, number of distinct values, most frequent value and how often it occurs

Unnamed: 0,classification,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
#select the rows that exactly repeat an earlier row (the first occurrence is not flagged) and show them
dupe = datatrain.loc[datatrain.duplicated()]
print(dupe)

     classification                                            message
102             ham  As per your request 'Melle Melle (Oru Minnamin...
153             ham  As per your request 'Melle Melle (Oru Minnamin...
206             ham  As I entered my cabin my PA said, '' Happy B'd...
222             ham                             Sorry, I'll call later
325             ham                   No calls..messages..missed calls
...             ...                                                ...
5524           spam  You are awarded a SiPix Digital Camera! call 0...
5535            ham  I know you are thinkin malaria. But relax, chi...
5539            ham                         Just sleeping..and surfing
5553            ham                        Hahaha..use your brain dear
5558            ham                             Sorry, I'll call later

[403 rows x 2 columns]


In [5]:
#rebind datatrain to a frame without the repeated rows
datatrain = datatrain.drop_duplicates()
datatrain.describe() #summary again: the count is lower than before because the duplicates are gone

Unnamed: 0,classification,message
count,5169,5169
unique,2,5169
top,ham,"Go until jurong point, crazy.. Available only ..."
freq,4516,1


In [6]:
datatrain.isna().sum() #number of missing (null/NaN) cells in each column

classification    0
message           0
dtype: int64

In [7]:
datatrain.shape #(number of rows, number of columns) of the dataset

(5169, 2)

In [8]:
from wordcloud import WordCloud, STOPWORDS #word-cloud generator and the stop-word collection passed to it below
#split the rows by their label
hams = datatrain[datatrain["classification"] == "ham"]
spams = datatrain[datatrain["classification"] == "spam"]
#join every message of each class into one space-separated string (the input the word clouds are generated from)
hams_new = " ".join(hams["message"].tolist())
spams_new = " ".join(spams["message"].tolist())

In [9]:
#both clouds are built with identical settings, so the settings are written once
_cloud_options = dict(
    width=520,
    height=260,
    stopwords=STOPWORDS,
    max_font_size=50,
    background_color="black",
    colormap='Blues',
)
#wordcloud of ham messages
hams_cloud = WordCloud(**_cloud_options).generate(hams_new)
#wordcloud of spam messages
spams_cloud = WordCloud(**_cloud_options).generate(spams_new)

In [10]:
#downsample the ham rows so that both classes contribute the same number of rows
hams_msg = hams.sample(n = len(spams), random_state = 44)
spams_msg = spams
#DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat stacks the two frames the same way
cm = pd.concat([hams_msg, spams_msg]).reset_index(drop=True)

  cm = hams_msg.append(spams_msg).reset_index(drop=True)


#downsample the ham rows so that both classes contribute the same number of rows
hams_msg = hams.sample(n=len(spams), random_state = 44)
spams_msg = spams
#DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat stacks the two frames the same way
cm = pd.concat([hams_msg, spams_msg]).reset_index(drop=True)

In [11]:
#encode the text labels as integers (ham -> 0, spam -> 1) in a new column
cm["msg_class"] = cm["classification"].map(
    {
        "ham": 0,
        "spam": 1,
    }
)
#pull the encoded labels out as an array to use as the training targets
cm_class = cm["msg_class"].to_numpy()

In [12]:
#train_test_split divides the messages and their labels into a training part and a held-out test part
from sklearn.model_selection import train_test_split
#20% of the rows are held out; the fixed seed (same value as the sampling seed) makes the split identical on every run,
#whereas random_state=None produced a different split each time the notebook was executed
X_train, X_test, y_train, y_test = train_test_split(cm["message"], cm_class, test_size=0.2, random_state=44)

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer #turns text into sequences of integer word ids
from tensorflow.keras.preprocessing.sequence import pad_sequences #pads/truncates sequences to a common length
#the vocabulary is learned from the training messages only
token = Tokenizer(
    num_words=500,      #cap on the vocabulary used when texts are converted to sequences
    char_level=False,   #tokens are words, not single characters
    oov_token="<OOV>",  #stand-in token for words outside the vocabulary
)
token.fit_on_texts(X_train)
#word -> integer id mapping built by fit_on_texts
index = token.word_index

In [14]:
#every message becomes exactly 50 token ids: shorter ones are padded at the end, longer ones are cut at the end
_pad_options = dict(maxlen=50, padding="post", truncating="post")
#training messages -> integer sequences -> fixed-length rows
train_sequence = token.texts_to_sequences(X_train)
train_pad = pad_sequences(train_sequence, **_pad_options)
#test messages go through the same tokenizer and the same padding
test_sequence = token.texts_to_sequences(X_test)
test_pad = pad_sequences(test_sequence, **_pad_options)

In [15]:
from tensorflow.keras.models import Sequential #container that stacks layers one after another
from tensorflow.keras.layers import Flatten, Dense, Activation, Dropout, Embedding, GlobalAveragePooling1D #layer types made available to this notebook
#the network, listed from the input layer to the output layer
model = Sequential([
    Embedding(500, 16, input_length=50),  #one 16-dimensional vector per token id; 500 ids, 50 tokens per message
    GlobalAveragePooling1D(),             #averages the token vectors of a message into a single vector (unlike Flatten, which concatenates them)
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(1, activation="sigmoid"),       #single output between 0 and 1
])
#set the optimizer, the loss and the reported metric before training
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
#train for 12 epochs; the held-out split is evaluated after every epoch
model.fit(train_pad, y_train, epochs=12, validation_data=(test_pad, y_test))

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x1a396290a60>

In [16]:
test_msg = datatest.iloc[:,1] #second column of the test CSV, selected by position; presumably the message text - confirm against output_spam.csv
def predict_spam(test_msg): #defining the spam prediction function
    """Run the trained model on one message or on a collection of messages.

    The messages are converted to integer sequences with the notebook-level
    ``token``, padded/truncated at the end to 50 ids (the same settings used
    for the training data), and passed to the notebook-level ``model``.

    Parameters
    ----------
    test_msg : str or iterable of str
        A single message, or several messages (list, pandas Series, ...).

    Returns
    -------
    Whatever ``model.predict`` returns for the padded input; one entry per
    message.
    """
    if isinstance(test_msg, str):
        #a bare string would be iterated character by character, giving one "message" per character
        test_msg = [test_msg]
    new_seq = token.texts_to_sequences(test_msg) #integer ids for every message
    padded = pad_sequences(new_seq, maxlen=50, padding="post", truncating="post") #same length and padding as the training data
    return model.predict(padded)

prediction_final = predict_spam(test_msg) #model outputs for every message taken from the test dataset



In [17]:
#round every predicted value to the nearest whole number: 1.0 for spam, 0.0 for ham (the values stay floats)
#np.round_ is deprecated and was removed in NumPy 2.0, so np.round is used instead
pred = np.round(prediction_final, decimals = 0)
pred

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)