# Importing the Libraries

In [4]:
import sys
import nltk
import sklearn
import pandas
import numpy

# Loading the Data set

In [5]:
import pandas as pd

df = pd.read_csv('spam.csv', header=None, encoding='ISO-8859-1')


In [6]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5573 non-null   object
 1   1       5573 non-null   object
 2   2       50 non-null     object
 3   3       12 non-null     object
 4   4       6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB
None
      0                                                  1    2    3    4
0    v1                                            message  NaN  NaN  NaN
1   ham  Go until jurong point, crazy.. Available only ...  NaN  NaN  NaN
2   ham                      Ok lar... Joking wif u oni...  NaN  NaN  NaN
3  spam  Free entry in 2 a wkly comp to win FA Cup fina...  NaN  NaN  NaN
4   ham  U dun say so early hor... U c already then say...  NaN  NaN  NaN


In [7]:
# check class distribution
classes = df[0]
print(classes.value_counts())

0
ham     4825
spam     747
v1         1
Name: count, dtype: int64


# Preprocess the Data

In [8]:
from sklearn.preprocessing import LabelEncoder

# convert class labels to binary values, 0 = ham and 1 = spam
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(Y[:10])

# Print the column names to find the correct column name for text messages
print(df.columns)


[2 0 0 1 0 0 1 0 0 1]
Index([0, 1, 2, 3, 4], dtype='int64')


In [9]:
text_messages = df[1]
print(text_messages[:10])

0                                              message
1    Go until jurong point, crazy.. Available only ...
2                        Ok lar... Joking wif u oni...
3    Free entry in 2 a wkly comp to win FA Cup fina...
4    U dun say so early hor... U c already then say...
5    Nah I don't think he goes to usf, he lives aro...
6    FreeMsg Hey there darling it's been 3 week's n...
7    Even my brother is not like to speak with me. ...
8    As per your request 'Melle Melle (Oru Minnamin...
9    WINNER!! As a valued network customer you have...
Name: 1, dtype: object


In [10]:
#using regular expressions to replace email addresses, URLs, phone numbers, other numbers

#Replacing email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

#Replacing URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

#Replacing money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
#Replacing 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
#Replacing numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [11]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [12]:
# change words to lower case - Hello, HELLO, hello are all the same word
processed = processed.str.lower()
print(processed)

0                                                 message
1       go until jurong point, crazy.. available only ...
2                           ok lar... joking wif u oni...
3       free entry in 2 a wkly comp to win fa cup fina...
4       u dun say so early hor... u c already then say...
                              ...                        
5568    this is the 2nd time we have tried 2 contact u...
5569                will ì_ b going to esplanade fr home?
5570    pity, * was in mood for that. so...any other s...
5571    the guy did some bitching but i acted like i'd...
5572                           rofl. its true to its name
Name: 1, Length: 5573, dtype: object


In [13]:
from nltk.corpus import stopwords

# remove stop words from text messages

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [14]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

# Generating Function

In [15]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [16]:
# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 8951
Most common words: [('.', 4713), (',', 1872), ('?', 1541), ('!', 1381), ('...', 1131), ('u', 1121), ('&', 916), (';', 764), (':', 717), ('i', 695), ('..', 681), ('call', 642), ("'", 533), (')', 494), ('2', 474)]


In [17]:
# use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [18]:
# The find_features function will determine which of the 1500 word features are contained in the review
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Example: Find features in the first processed message
features = find_features(processed[0])

# Print words that are features
for key, value in features.items():
    if value:
        print(key)


messag


In [19]:
import numpy as np


# Now lets do it for all the messages
messages = zip(processed, Y)

# Define a seed for reproducibility
seed = 1
np.random.seed(seed)  # Use np.random.seed(seed) to set the random seed


# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [20]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [21]:
print(len(training))
print(len(testing))

4179
1394


# Scikit-Learn Classifiers with NLTK

In [22]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.49354375896701


In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from nltk.classify import SklearnClassifier  # Import SklearnClassifier

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = list(zip(names, classifiers))  # Use list() to convert the zip to a list

# Assuming you have your 'training' and 'testing' datasets prepared correctly
# You should have something like 'training' as a list of tuples: [(featureset1, label1), (featureset2, label2), ...]

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print("{} Accuracy: {:.2f}%".format(name, accuracy))


K Nearest Neighbors Accuracy: 92.47%
Decision Tree Accuracy: 96.05%
Random Forest Accuracy: 97.99%
Logistic Regression Accuracy: 98.42%
SGD Classifier Accuracy: 97.99%
Naive Bayes Accuracy: 98.35%
SVM Linear Accuracy: 98.49%


In [25]:
import nltk  # Import NLTK or any other required libraries
from nltk.classify import NaiveBayesClassifier  # Import the classifier you're using

# Training your ensemble classifier with the training data
nltk_ensemble = NaiveBayesClassifier.train(training)

# Making class label predictions for the testing set
txt_features, labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)


In [26]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1214
           1       0.98      0.91      0.94       180

    accuracy                           0.99      1394
   macro avg       0.98      0.95      0.97      1394
weighted avg       0.99      0.99      0.99      1394



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1210,4
actual,spam,16,164
