In [1]:
import pandas as pd

## Data Load

In [2]:
df = pd.read_csv("dataset/SMSSpamCollection.csv", delimiter="\t", header=None, names=["label", "text"])
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
df.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
df.isnull().sum()

label    0
text     0
dtype: int64

In [5]:
df.duplicated().sum()

403

In [6]:
df = df.drop_duplicates()
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
df["label"].value_counts()

label
ham     4516
spam     653
Name: count, dtype: int64

## Preprocessing

## Stemming

In [None]:
!python3 -c "import nltk; nltk.download('stopwords'); nltk.download('punkt')"

In [13]:
import nltk
# nltk.download("punkt")
# nltk.download("stopwords")
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import word_tokenize


stemmer = PorterStemmer()


def word_stemmer(words):
    stems = [stemmer.stem(word) for word in words]
    return stems


def clean_text(text):
    text = word_tokenize(text.lower())
    
    # Remove punctuation and stop words
    new = []
    for token in text:
        if token in stopwords.words("english") or token in string.punctuation:
            continue
        
        new.append(token)

    # Stemming
    text = word_stemmer(new)

    return " ".join(text)


In [14]:
df["cleaned_text"] = df["text"].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cleaned_text"] = df["text"].apply(clean_text)


In [15]:
df

Unnamed: 0,label,text,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi .. avail bugi n great wo...
1,ham,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor ... u c alreadi say ...
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah n't think goe usf live around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u. u £750 pound prize 2...
5568,ham,Will ü b going to esplanade fr home?,ü b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",piti mood ... suggest
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like 'd interest buy someth els ...


## Vectorization

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = None
vectorizer = TfidfVectorizer(max_features=3000)

x = vectorizer.fit_transform(df["cleaned_text"]).toarray()
y = df["label"].values

## Train Text Split

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

positive_class = "spam"

In [19]:
x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Implementation

### Metrics

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def calculate_metrics(_y_test, _y_prediction):
    print("Accuracy, Precision, Recall, F1-Score", sep="\t")

    accuracy = accuracy_score(_y_test, _y_prediction)
    precision = precision_score(_y_test, _y_prediction, pos_label=positive_class)
    recall = recall_score(_y_test, _y_prediction, pos_label=positive_class)
    f1 = f1_score(_y_test, _y_prediction, pos_label=positive_class)

    print("accuracy: ", accuracy)
    print("precision: ", precision)
    print("recall: ", recall)
    print("f1: ", f1)
    
    confusion = confusion_matrix(_y_test, y_pred=_y_prediction)
    print(confusion)

## Comparing different Naive Bayes models

In [21]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB, MultinomialNB

models_to_compare = {
    "GaussianNB": GaussianNB,
    "BernoulliNB": BernoulliNB,
    "ComplementNB": ComplementNB,
    "MultinomialNB": MultinomialNB,
}

def train_and_test(model_class):
    model = model_class()
    model.fit(x_train, y_train)

    y_prediction = model.predict(x_test)
    calculate_metrics(y_test, y_prediction)


for model_name, model_class in models_to_compare.items():
    print("-" * 50)
    print(model_name)
    print("-" * 50)

    train_and_test(model_class=model_class)

    print()
    


--------------------------------------------------
GaussianNB
--------------------------------------------------
Accuracy, Precision, Recall, F1-Score
accuracy:  0.8481624758220503
precision:  0.4125
recall:  0.8608695652173913
f1:  0.5577464788732394
[[778 141]
 [ 16  99]]

--------------------------------------------------
BernoulliNB
--------------------------------------------------
Accuracy, Precision, Recall, F1-Score
accuracy:  0.9912959381044487
precision:  1.0
recall:  0.9217391304347826
f1:  0.9592760180995475
[[919   0]
 [  9 106]]

--------------------------------------------------
ComplementNB
--------------------------------------------------
Accuracy, Precision, Recall, F1-Score
accuracy:  0.9555125725338491
precision:  0.7254901960784313
recall:  0.9652173913043478
f1:  0.8283582089552239
[[877  42]
 [  4 111]]

--------------------------------------------------
MultinomialNB
--------------------------------------------------
Accuracy, Precision, Recall, F1-Score
accura

## Storing the model

From the above output, the best model seems to be BernoulliNB

In [22]:
best_model = BernoulliNB()
best_model.fit(x_train, y_train)

y_prediction = best_model.predict(x_test)
calculate_metrics(y_test, y_prediction)

Accuracy, Precision, Recall, F1-Score
accuracy:  0.9912959381044487
precision:  1.0
recall:  0.9217391304347826
f1:  0.9592760180995475
[[919   0]
 [  9 106]]


In [29]:
import pickle

model_filename = "spam_classification_model.pkl"
pickle.dump(best_model, open(model_filename, "wb"))
vectorizer_filename = "vectorizer.pkl"
pickle.dump(vectorizer, open(vectorizer_filename, "wb"))

## Testing on input

In [33]:
input = "Last weekends draw shows that you have won a nine hundred pounds prize GUARANTEED"
cleaned_input = clean_text(input)
cleaned_input_vec = vectorizer.transform([cleaned_input])

prediction = best_model.predict(cleaned_input_vec)
print(prediction[0])

spam
