In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_data = pd.read_csv('data/sms-spam-data.csv', encoding = "ISO-8859-1")

In [3]:
data = raw_data[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data = pd.get_dummies(data, columns=["label"], drop_first=True).rename(columns={"label_spam": "label"})

In [5]:
data.value_counts("label")

label
0    4825
1     747
dtype: int64

In [6]:
def count_words(string):
    return len(string.split(" "))

In [7]:
def get_sample_count_ratio(df):
    n = df.shape[0]
    word_counts = []
    for row in df["text"]:
        word_counts.append(count_words(row))
    avg_word_count = sum(word_counts)/len(word_counts)
    return n/avg_word_count

In [8]:
get_sample_count_ratio(data)

357.0242292521935

According to https://developers.google.com/machine-learning/guides/text-classification/step-2-5 this indicates that we should go with an n-gram preprocessing step couple with a simple MLP model

We proceed to experimenting with an n-gram based model. The pre-processing steps are as follows:
- Tokenize text-sample into 1 and 2 word n-grams. This mean extracing each individual word as well as each pair of consecutive words. The combination of both steps
- Vectorize the samples using a TF-IDF encoding scheme. Each piece of text is converted into a vector capturing which n-grams are present in it.
- Drop the least common n-gram tokens by discarding those that occur fewer than two times and and using statistical tests to determine feature importance

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [10]:
# n-gram sizes to compute
NGRAM_RANGE = (1, 2)
# Limit on the number of features
TOP_K = 10000
# Whether text should be split into word or character n-grams
TOKEN_MODE = 'word'
# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

In [11]:
from typing import List
def ngram_vectorize(train_text: List[str], train_labels: np.ndarray, test_text: List[str]):
    # Arguments for vectorizor
    kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
    }

    vectorizer = TfidfVectorizer(**kwargs)
    
    # Vectorize training text
    x_train = vectorizer.fit_transform(train_text)
    # Vectorize test text
    x_test = vectorizer.transform(test_text)

    # Select top k features
    selector = SelectKBest(f_classif, k=TOP_K)
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_test = selector.transform(x_test).astype('float32')

    return x_train, x_test

Now we need to prepare the data. First we create a train and test split and then wrangle the results into the appropriate format for our `ngram_vectorize` function

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train, x_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=99)

In [14]:
x_train = list(x_train)
y_train = y_train.to_numpy()
x_test = list(x_test)
y_test = y_test.to_numpy()

In [15]:
x_train, x_test = ngram_vectorize(x_train, y_train, x_test)



In [16]:
import tensorflow as tf

In [17]:
from typing import Tuple
def build_model(layers: int, units: int, dropout_rate: float, input_shape: Tuple):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(units, input_shape=input_shape))
    model.add(tf.keras.layers.Dropout(rate=dropout_rate))
    for _ in range(layers-1):
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(rate=dropout_rate))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    return model

In [22]:
def train_model(train_data,
                train_labels,
                test_data,
                test_labels,
                learning_rate=1e-3,
                epochs=100,
                batch_size=32,
                layers=2,
                units=64,
                dropout_rate=0.2):
    # Create model
    model = build_model(layers=layers,units=units, dropout_rate=dropout_rate, input_shape=x_train.shape[1:])

    # Compile model
    optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
    loss = 'binary_crossentropy'
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc', tf.keras.metrics.FalseNegatives(name="fn")])

    # Create early stopping callback
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]

    # Train model
    history = model.fit(train_data, 
              train_labels, 
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(test_data, test_labels),
              verbose=2,
              batch_size=batch_size)

    # Print results
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}, false negatives: {fn}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1], fn=history['fn'][-1]))

In [23]:
train_model(x_train, y_train, x_test, y_test)



Epoch 1/100
140/140 - 1s - loss: 0.2740 - acc: 0.9100 - fn: 382.0000 - val_loss: 0.0614 - val_acc: 0.9839 - val_fn: 17.0000
Epoch 2/100
140/140 - 1s - loss: 0.0318 - acc: 0.9917 - fn: 29.0000 - val_loss: 0.0369 - val_acc: 0.9892 - val_fn: 7.0000
Epoch 3/100
140/140 - 1s - loss: 0.0126 - acc: 0.9964 - fn: 11.0000 - val_loss: 0.0380 - val_acc: 0.9892 - val_fn: 10.0000
Epoch 4/100
140/140 - 1s - loss: 0.0048 - acc: 0.9984 - fn: 5.0000 - val_loss: 0.0402 - val_acc: 0.9901 - val_fn: 10.0000
Epoch 5/100
140/140 - 1s - loss: 0.0026 - acc: 0.9996 - fn: 2.0000 - val_loss: 0.0418 - val_acc: 0.9901 - val_fn: 10.0000
Validation accuracy: 0.9901345372200012, loss: 0.04184374958276749, false negatives: 2.0


This model seems to be doing remarkably well, only misclassifying 2 spam messages