In [1]:
# You may need to install libraries
! pip install pandas
! pip install nltk
! pip install scikit-learn



In [2]:
# Import libraries
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [3]:
# Load the SMS dataset
# Never truncate long cell values, so whole messages are visible when a frame is displayed
pd.set_option("display.max_colwidth", None)

# `encoding="latin-1"` tells pandas how to decode the file's bytes into characters.
# Only the v1/v2 columns are read, then renamed to descriptive names.
messages = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
    header=0,
    usecols=["v1", "v2"],
).rename(columns={"v1": "label", "v2": "message"})
messages.head(3)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


In [4]:
# Next, we define a text_preprocess method that removes punctuations, stop-words, and non-alphabets.

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Lazily-built cache of NLTK's English stop-words (filled by _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop-words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loading the list once (instead of once per word) and storing it in a
        # set makes each membership test constant-time.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def text_preprocess(message, stop_words=None):
    """Turn a raw message into a list of lowercase, alphabetic, non-stop-word tokens.

    Args:
        message: The raw text to clean.
        stop_words: Optional collection of lowercase words to drop. When omitted,
            NLTK's English stop-word list is used (the "stopwords" corpus must
            have been downloaded before the first such call).

    Returns:
        A list of tokens, in their original order, with punctuation removed,
        text lowercased, and stop-words and non-alphabetic tokens dropped.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # Punctuation is deleted, not replaced by a space, so "T&C's" becomes "tcs".
    nopunc = message.translate(_PUNCTUATION_TABLE).lower()

    # Keep purely alphabetic tokens that are not stop-words
    return [
        word
        for word in nopunc.split()
        if word not in stop_words and word.isalpha()
    ]

In [5]:
# Split the message texts by class to see how (im)balanced the dataset is.
spam_messages = messages.loc[messages["label"] == "spam", "message"]
ham_messages = messages.loc[messages["label"] == "ham", "message"]

print(f"Number of spam messages: {len(spam_messages)}")
print(f"Number of ham messages: {len(ham_messages)}")

Number of spam messages: 747
Number of ham messages: 4825


In [6]:
# Find the ten most frequent words in spam messages.
# text_preprocess looks up NLTK's English stop-words, so fetch that corpus first.
nltk.download("stopwords")

# Flatten the per-message token lists into one list of spam words
spam_words = [
    word
    for each_message in spam_messages
    for word in text_preprocess(each_message)
]

print(f"Top 10 spam words are:\n {pd.Series(spam_words).value_counts().head(10)}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vista\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 10 spam words are:
 call      347
free      216
txt       150
u         147
ur        144
mobile    123
text      120
claim     113
stop      113
reply     101
Name: count, dtype: int64


In [7]:
# Find the ten most frequent words in ham messages.
# Flatten the per-message token lists into one list of ham words
ham_words = [
    word
    for each_message in ham_messages
    for word in text_preprocess(each_message)
]

print(f"Top 10 ham words are:\n {pd.Series(ham_words).value_counts().head(10)}")

Top 10 ham words are:
 u       972
im      449
get     303
ltgt    276
ok      272
dont    257
go      247
ur      240
ill     236
know    232
Name: count, dtype: int64


These word counts are not used directly by the model; however, exploratory data analysis like this is important because it helps inform our modeling choices.

Here comes the crucial step: we apply `text_preprocess` to every message.

In [9]:
# Replace every raw message with its list of cleaned tokens
# (punctuation, stop-words and non-alphabetic tokens removed).
# NOTE: this overwrites the raw text in place, so the cell expects
# messages["message"] to still hold raw strings; reload the dataset before re-running it.
messages["message"] = messages["message"].map(text_preprocess)
messages.head()

Unnamed: 0,label,message
0,ham,"[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, wkly, comp, win, fa, cup, final, tkts, may, text, fa, receive, entry, questionstd, txt, ratetcs, apply]"
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, dont, think, goes, usf, lives, around, though]"


Each message is now a list of tokens. However, the vectorizer used in the next step expects every document to be a single string, not a list of tokens. Hence, we join each list of tokens back into a space-separated string.

In [11]:
# Join each message's token list back into a single space-separated string
messages["message"] = messages["message"].apply(
    lambda tokens: " ".join(str(token) for token in tokens)
)
messages.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts may text fa receive entry questionstd txt ratetcs apply
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


The CountVectorizer() class in the scikit-learn library is useful in defining the BoW approach. We first fit the vectorizer to the messages to fetch the whole vocabulary.

In [13]:
# Initialize count vectorizer
# The vectorizer is created with its default settings and fitted on the cleaned
# message strings so that it learns the corpus vocabulary.
vectorizer = CountVectorizer()
# `bow_transformer` keeps the value returned by fit(); later cells use it to
# turn message strings into bag-of-words count vectors.
bow_transformer = vectorizer.fit(messages["message"])

In [14]:
# get the feature names
# (the vocabulary terms of the fitted vectorizer; a sample is printed in the next cell)
features = vectorizer.get_feature_names_out()

In [15]:
# Fetch the vocabulary set
# Show the first 30 entries of `features` as a sample of the vocabulary
print(f"30 BOW Features: {features[0:30]}")
# The number of entries in the vectorizer's `vocabulary_` is the vocabulary size
print(f"Total number of vocab words: {len(vectorizer.vocabulary_)}")

30 BOW Features: ['aa' 'aah' 'aaniye' 'aaooooright' 'aathilove' 'aathiwhere' 'ab' 'abbey'
 'abdomen' 'abeg' 'abel' 'aberdeen' 'abi' 'ability' 'abiola' 'abj' 'able'
 'abnormally' 'aboutas' 'abroad' 'absence' 'absolutely' 'abstract' 'abt'
 'abta' 'aburo' 'abuse' 'abusers' 'ac' 'academic']
Total number of vocab words: 8084


As the output shows, the vocabulary learned from the text corpus contains 8,084 unique words.

We transform the string messages to numerical vectors to simplify the model-building and training process.

In [17]:
# Convert strings to vectors using BoW
# Each cleaned message string becomes one row of the resulting matrix
messages_bow = bow_transformer.transform(messages["message"])

# Print the shape of the sparse matrix and count the number of non-zero occurrences
# `.shape` is (number of messages, vocabulary size); `.nnz` is the number of stored non-zero entries
print(f"Shape of sparse matrix: {messages_bow.shape}")
print(f"Amount of non-zero occurrences: {messages_bow.nnz}")

Shape of sparse matrix: (5572, 8084)
Amount of non-zero occurrences: 44211


BoW builds a matrix that records, for every message, how many times each word of the corpus vocabulary occurs in it. Because any single message contains only a handful of the vocabulary's words, the result is a sparse matrix, that is, a matrix that is mostly comprised of zeros. This format converts the text into a numerical encoding of linguistic information that a model can make use of.

In the Bag of Words (BoW) section, we learned how the BoW technique can be enhanced by combining it with TF-IDF. Here, we run our BoW vectors through TF-IDF.

In [20]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the transformer on the BoW counts of the whole corpus
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)

# Re-weight the entire BoW matrix into its TF-IDF representation
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(5572, 8084)


## XGBoost is a gradient boosting technique that can do both regression and classification. In this case, we will be using an XGBClassifier to classify our text as either "ham" or "spam".

First, we convert the “spam” and “ham” labels to 0 and 1 (or vice-versa) as XGBoost accepts only numerics.

In [23]:
# Encode the labels as integers with an explicit mapping: ham -> 0, spam -> 1.
# pd.factorize numbers labels by order of first appearance, so the encoding would
# silently flip if the first row of the dataset happened to be a spam message.
LABEL_TO_ID = {"ham": 0, "spam": 1}

# Values that are already encoded pass through unchanged, so re-running this cell
# is safe; any unexpected label makes astype() fail loudly instead of being mis-coded.
messages["label"] = (
    messages["label"]
    .map(lambda label: LABEL_TO_ID.get(label, label))
    .astype("int64")
)
messages.head()

Unnamed: 0,label,message
0,0,go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts may text fa receive entry questionstd txt ratetcs apply
3,0,u dun say early hor u c already say
4,0,nah dont think goes usf lives around though


Next, we split the data to train and test datasets.

In [25]:
# Split the dataset to train and test sets
# A fixed random_state makes the split, and every score computed from it,
# reproducible across runs. Stratifying on the label keeps the ham/spam ratio the
# same in both subsets, which matters because spam is the minority class.
# NOTE(review): the vocabulary and IDF weights were fitted on the full dataset
# before this split, so test rows influenced the features; fit them on the
# training rows only if a leakage-free estimate is required.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages_tfidf,
    messages["label"],
    test_size=0.2,
    random_state=42,
    stratify=messages["label"],
)

print(f"train dataset features size: {msg_train.shape}")
print(f"train dataset label size: {label_train.shape}")

print(f"test dataset features size: {msg_test.shape}")
print(f"test dataset label size: {label_test.shape}")

train dataset features size: (4457, 8084)
train dataset label size: (4457,)
test dataset features size: (1115, 8084)
test dataset label size: (1115,)


## To train the model, we first install the XGBoost library.

In [27]:
# Install xgboost library
! pip install xgboost



In [28]:
# We train the classifier.
# Train an xgboost classifier
# (imported here, right after the cell that installs xgboost)
from xgboost import XGBClassifier

# Instantiate our model
# No arguments are passed, so the library's default hyperparameters are used
clf = XGBClassifier()

# Fit the model to the training data
# msg_train holds the TF-IDF features and label_train the integer-encoded labels
clf.fit(msg_train, label_train)

In [29]:
# Next, we make predictions on the training dataset.
# Make predictions
# These are predictions for the same rows the model was fitted on, so the score
# below measures fit to the training data, not generalization.
predict_train = clf.predict(msg_train)

# Fraction of training rows whose predicted label equals the true label (3 decimals)
print(
    f"Accuracy of Train dataset: {metrics.accuracy_score(label_train, predict_train):0.3f}"
)

Accuracy of Train dataset: 0.989


## To get a sense of how our model fared, let’s do an example prediction.

In [31]:
# an example prediction
# The message at index 7 is pushed through the same fitted BoW -> TF-IDF
# transformers used for training and then classified.
# NOTE(review): this row is taken from the full dataset, so it may have been part
# of the training split; treat this as a sanity check, not an evaluation.
print(
    "predicted:",
    clf.predict(
        tfidf_transformer.transform(bow_transformer.transform([messages["message"][7]]))
    )[0],
)
# The true (integer-encoded) label of the same row, for comparison
print("expected:", messages["label"][7])
### Recall Spam is "1", Ham is "0".

predicted: 0
expected: 0


In [32]:
# Show the (already preprocessed) text of the message classified above
print(messages["message"][7])

per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune


And yes, it worked!

Finally, we find the overall accuracy of the model on the test data.

In [34]:
# print the overall accuracy of the model
# Predict labels for the held-out test features and compare them with the true test labels
label_predictions = clf.predict(msg_test)
print(f"Accuracy of the model: {metrics.accuracy_score(label_test, label_predictions):0.3f}")

Accuracy of the model: 0.966


In [35]:
# here is an out-of-sample generalized prediction
# The model was trained on text cleaned by text_preprocess, so new text must go
# through the same cleaning before it is vectorized.
sample_message = "Hullo, claim your free luck draw by sending some money"
sample_clean = " ".join(text_preprocess(sample_message))

print(
    "predicted:",
    clf.predict(
        tfidf_transformer.transform(bow_transformer.transform([sample_clean]))
    )[0],
)


predicted: 1


In [36]:
# Inspect the TF-IDF entries stored for the first three messages;
# each printed line is "(row, column)  weight"
print(messages_tfidf[0:3])

  (0, 7849)	0.23822187576133733
  (0, 7624)	0.19696340342726038
  (0, 5218)	0.2701917502132373
  (0, 3740)	0.2918335915067389
  (0, 3591)	0.3454453072521403
  (0, 2824)	0.19227005405586492
  (0, 2787)	0.16228382867264401
  (0, 2733)	0.15705625796538664
  (0, 1483)	0.26752361109476797
  (0, 1208)	0.2918335915067389
  (0, 897)	0.2918335915067389
  (0, 895)	0.3297648855969494
  (0, 464)	0.2626832171423389
  (0, 233)	0.3454453072521403
  (1, 7750)	0.4313385731102942
  (1, 4838)	0.5462557824449175
  (1, 4806)	0.2741803362458879
  (1, 3776)	0.4080505634471753
  (1, 3559)	0.5233273701797004
  (2, 7803)	0.21180477976633422
  (2, 7764)	0.1655268599567031
  (2, 7295)	0.14030009452821
  (2, 7092)	0.2454421736459655
  (2, 6937)	0.1372922225787766
  (2, 5602)	0.18586214999858788
  (2, 5548)	0.2610826711059533
  (2, 5497)	0.2610826711059533
  (2, 4194)	0.17816738588670292
  (2, 2533)	0.1288894945470838
  (2, 2380)	0.2045670103380392
  (2, 2256)	0.5221653422119066
  (2, 2124)	0.40015658822537653
  (2

In [37]:
# Inspect the raw BoW entries stored for the same three messages;
# each printed line is "(row, column)  count"
print(messages_bow[0:3])

  (0, 233)	1
  (0, 464)	1
  (0, 895)	1
  (0, 897)	1
  (0, 1208)	1
  (0, 1483)	1
  (0, 2733)	1
  (0, 2787)	1
  (0, 2824)	1
  (0, 3591)	1
  (0, 3740)	1
  (0, 5218)	1
  (0, 7624)	1
  (0, 7849)	1
  (1, 3559)	1
  (1, 3776)	1
  (1, 4806)	1
  (1, 4838)	1
  (1, 7750)	1
  (2, 319)	1
  (2, 1319)	1
  (2, 1536)	1
  (2, 2124)	2
  (2, 2256)	2
  (2, 2380)	1
  (2, 2533)	1
  (2, 4194)	1
  (2, 5497)	1
  (2, 5548)	1
  (2, 5602)	1
  (2, 6937)	1
  (2, 7092)	1
  (2, 7295)	1
  (2, 7764)	1
  (2, 7803)	1
