## 1. Download dataset

In [1]:
# Download the dataset CSV from Google Drive by file ID using the gdown CLI.
!gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 62.5MB/s]


## 2. Import libraries

In [2]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 3. Read data

In [3]:
# Location of the CSV fetched in the download step.
DATASET_PATH = '/content/2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)

# Pull the text and label columns out as plain Python lists for the
# preprocessing steps that follow.
messages, labels = (
    df[column].values.tolist() for column in ('Message', 'Category')
)

# Last expression of the cell: show the first rows as a quick sanity check.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## 4. Preprocessing data


### 4.1 Preprocessing features
Convert all text to lowercase -> remove all punctuation marks -> split the text into individual words (tokens) -> filter out common words that don't carry significant meaning (stopwords) -> reduce words to their root form (stemming), grouping similar words together

In [4]:
def lowercase(text):
  """Return a lowercased copy of ``text``."""
  lowered = text.lower()
  return lowered

def punctuation_removal(text):
  """Return ``text`` with every character in ``string.punctuation`` deleted."""
  # The third argument of maketrans lists the characters to drop.
  return text.translate(str.maketrans('', '', string.punctuation))

def tokenize(text):
  """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
  tokens = nltk.word_tokenize(text)
  return tokens

# Stopword sets already requested from NLTK, keyed by language name.
_STOPWORDS_BY_LANGUAGE = {}

def _load_stopwords(language):
  """Return NLTK's stopword list for ``language`` as a frozenset, fetching it at most once."""
  if language not in _STOPWORDS_BY_LANGUAGE:
    _STOPWORDS_BY_LANGUAGE[language] = frozenset(
        nltk.corpus.stopwords.words(language))
  return _STOPWORDS_BY_LANGUAGE[language]

def remove_stopwords(tokens):
  """Drop English stopwords from a list of tokens.

  Args:
    tokens: iterable of string tokens (expected to be lowercased already,
      since the comparison is exact).

  Returns:
    A new list with the tokens that are not English stopwords, in their
    original order.
  """
  # A set gives constant-time membership tests (the list was scanned linearly
  # for every token), and caching avoids asking NLTK for the list on each call.
  stopword_set = _load_stopwords('english')

  return [token for token in tokens if token not in stopword_set]

def stemming(tokens):
  """Return the Porter stem of each token, preserving order."""
  stem = nltk.PorterStemmer().stem

  return list(map(stem, tokens))

def preprocess_text(text):
  """Turn a raw message into a list of normalized tokens.

  Pipeline: lowercase -> strip punctuation -> tokenize -> drop stopwords
  -> stem. Returns the resulting list of tokens.
  """
  # String-level cleanup happens first, then the token-level steps.
  cleaned = punctuation_removal(lowercase(text))

  return stemming(remove_stopwords(tokenize(cleaned)))

# Always start from the raw 'Message' column rather than from `messages`
# itself: the previous form overwrote its own input, so re-running this cell
# fed token lists back into preprocess_text and failed on `.lower()`.
messages = [preprocess_text(message) for message in df['Message'].values.tolist()]

**Create a dictionary from all unique tokens across all messages**

In [5]:
def create_dictionary(messages):
  """Collect the unique tokens across all messages.

  Args:
    messages: iterable of token lists (one list per message).

  Returns:
    A list of the distinct tokens in order of first appearance. The position
    of a token in this list is its column index in the feature vectors.
  """
  dictionary = []
  # Membership is checked against a set: testing `token not in dictionary`
  # on the growing list made the loop quadratic in the vocabulary size.
  seen = set()

  for tokens in messages:
    for token in tokens:
      if token not in seen:
        seen.add(token)
        dictionary.append(token)

  return dictionary

# Build the vocabulary from the preprocessed messages; a token's position in
# this list is the feature column that create_features increments for it.
dictionary = create_dictionary(messages)

**Create features (count how many times each word from the dictionary appears in a message)**

In [6]:
def create_features(tokens, dictionary):
  """Build a bag-of-words count vector for one message.

  Args:
    tokens: iterable of tokens from a single preprocessed message.
    dictionary: the vocabulary. Either a sequence of tokens (a token's
      position is its column) or a prebuilt ``dict`` mapping token -> column
      index in ``range(len(dictionary))``. Passing the dict form avoids
      rebuilding the lookup table on every call.

  Returns:
    A 1-D float NumPy array of length ``len(dictionary)`` where entry ``i``
    is the number of occurrences of vocabulary token ``i`` in ``tokens``.
    Tokens that are not in the vocabulary are ignored.
  """
  if isinstance(dictionary, dict):
    column_of = dictionary
  else:
    # Build the token -> column lookup once instead of scanning the list
    # twice per token (`in` followed by `.index`). setdefault keeps the first
    # position for a repeated entry, which is what `list.index` returned.
    column_of = {}
    for column, word in enumerate(dictionary):
      column_of.setdefault(word, column)

  features = np.zeros(len(dictionary))

  for token in tokens:
    column = column_of.get(token)
    if column is not None:
      features[column] += 1

  return features

# One count vector per message, each of length len(dictionary), collected
# into a single array (rows = messages, columns = dictionary entries).
X = np.array([create_features(tokens, dictionary) for tokens in messages])

### 4.2 Preprocessing label data

In [7]:
# Encode the string labels as integers. `le` is kept at notebook level because
# predict() uses le.inverse_transform to map predictions back to label strings.
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


## 5. Split the dataset into train/val/test

In [8]:
VAL_SIZE = 0.2     # fraction of ALL samples held out for validation
TEST_SIZE = 0.125  # fraction of the REMAINING samples held out for testing
SEED = 0

# Step 1: hold out the validation set from the full data (80% / 20%).
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=VAL_SIZE,
                                                  random_state=SEED)

# Step 2: hold out the test set from the remaining training portion, not from
# the validation set. 0.125 of the remaining 80% is 10% of the data, giving a
# 70/20/10 train/val/test split. Splitting X_val here instead shrank the
# validation set and left a test set of only about 2.5% of the samples.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                    test_size=TEST_SIZE,
                                                    random_state=SEED)

## 6. Train model

In [None]:
  # Fit a Gaussian Naive Bayes classifier on the training split; the two
  # prints bracket the fit so progress is visible in the cell output.
  print('Training model...')
  model = GaussianNB().fit(X_train, y_train)
  print('Done!')

Training model...


## 7. Evaluate model

In [None]:
# Validation split: predict, then score against the true labels.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Test split: same procedure on the held-out test data.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Validation accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

## 8. Predict

In [None]:
def predict(text, model, dictionary, label_encoder=None):
  """Classify a single raw message.

  Args:
    text: the raw message string.
    model: fitted classifier exposing ``predict``.
    dictionary: vocabulary passed to ``create_features``; must be the one the
      model's training features were built with.
    label_encoder: fitted encoder exposing ``inverse_transform``. Defaults to
      the notebook-level ``le``, so existing three-argument calls keep working.

  Returns:
    The predicted class label in its original (decoded) form.
  """
  if label_encoder is None:
    # Fall back to the encoder fitted in the label-preprocessing cell.
    label_encoder = le

  tokens = preprocess_text(text)
  # create_features returns a 1-D vector; the model expects a 2-D batch, so
  # reshape to a single row (no extra np.array copy is needed).
  features = create_features(tokens, dictionary).reshape(1, -1)
  prediction = model.predict(features)

  return label_encoder.inverse_transform(prediction)[0]

# Two hand-picked examples: one spam-like message and one ordinary message.
spam_test_input = 'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C'
ham_test_input = 'I am actually thinking a way of doing something useful'

# Run both through the same predict/print step, spam example first.
for test_input in (spam_test_input, ham_test_input):
  prediction = predict(test_input, model, dictionary)
  print(f'Prediction: {prediction}')