# SPAM DETECTION GUIDE


## Preparation

In [None]:
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Location of the tab-separated dataset file (one "<label>\t<text>" record per line).
DATA_PATH = "SMS_dataset_preparation.txt"

# Names assigned to the label column and the message-text column when loading.
COLUMN_LABEL = "lable"
COLUMN_TEXT = "text"

# Label values that mark the type of each message.
LABEL_LEGIT = "LEGI"
LABEL_SPAM = "SPAM"
LABEL_SMISHING = "SMIS"

## Dataset 

In [None]:
# Load the headerless, tab-separated file and name its two columns.
dataset = pd.read_csv(
    DATA_PATH,
    sep='\t',
    header=None,
    names=[COLUMN_LABEL, COLUMN_TEXT],
)

# Report the overall size and how many rows carry each label.
print('Total size:', len(dataset))
print('Legit messages:', (dataset[COLUMN_LABEL] == LABEL_LEGIT).sum())
print('Spam messages:', (dataset[COLUMN_LABEL] == LABEL_SPAM).sum())
print('Smishing messages:', (dataset[COLUMN_LABEL] == LABEL_SMISHING).sum())

Total size: 1509
Legit messages: 1060
Spam messages: 163
Smishing messages: 286


In [None]:
# Keep only legit and spam rows; every other label (e.g. smishing) is dropped.
dataset = dataset[dataset[COLUMN_LABEL].isin([LABEL_LEGIT, LABEL_SPAM])]

# Sanity check: no smishing rows should remain after the filter.
print('Smishing messages:', (dataset[COLUMN_LABEL] == LABEL_SMISHING).sum())

Smishing messages: 0


## Data preprocessing 

In [None]:
def messages2vectors(messages, batch_size=100):
    '''
    Transforms messages into ELMo feature-vectors;
    Parameters:
        messages    -   array of strings;
        batch_size  -   number of messages embedded per session run (default 100);
    Returns:
        features    -   array of feature-vectors of shape (len(messages), 1024),
                        row i being the embedding of messages[i];
    '''

    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Work on a plain list so that slicing is purely positional, whatever
    # index the incoming pandas Series (or other sequence) carries.
    messages = list(messages)

    features = np.zeros((0, 1024))
    if not messages:
        return features

    # Build the embedding graph once, in its own graph, so repeated calls do
    # not keep adding ELMo copies to the default graph.
    with tf.Graph().as_default():
        elmo = hub.Module("https://tfhub.dev/google/elmo/1")
        batch_input = tf.placeholder(tf.string, shape=[None])
        embedded = elmo(batch_input, signature="default", as_dict=True)["default"]

        with tf.Session() as sess:
            # Initialise the module once instead of once per batch.
            sess.run([tf.global_variables_initializer(), tf.tables_initializer()])

            # Consecutive, non-overlapping slices cover every message exactly
            # once and in order, including a final partial batch.
            batches = [
                sess.run(embedded, feed_dict={batch_input: messages[start:start + batch_size]})
                for start in range(0, len(messages), batch_size)
            ]

    # Starting from the empty float64 array keeps the result dtype float64.
    return np.concatenate([features] + batches)

In [None]:
def convert_labels(labels_raw):
    '''
    Transforms labels into numerical values;
    Parameters:
        labels_raw    -   array of text-labels ('LEGI' or 'SPAM');
    Returns:
        labels    -   array of numerical labels (0 for 'LEGI', 1 for 'SPAM'),
                      same length and order as labels_raw;
    Raises:
        ValueError    -   if a label is neither 'LEGI' nor 'SPAM';
    '''

    encoding = {'LEGI': 0, 'SPAM': 1}

    label = []
    for lab in labels_raw:
        # Skipping an unknown label would make the result shorter than the
        # input and shift every following label against its feature-vector.
        if lab not in encoding:
            raise ValueError('Unknown label: %r' % (lab,))
        label.append(encoding[lab])

    # Explicit integer dtype so an empty input still yields an integer array.
    labels = np.array(label, dtype=int)

    return labels

In [None]:
# Embed every message text and encode its label as a number.
features = messages2vectors(dataset[COLUMN_TEXT])
labels = convert_labels(dataset[COLUMN_LABEL])

# Show both shapes, one per line.
print(features.shape, labels.shape, sep='\n')

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


(1223, 1024)
(1223,)


In [None]:
def split_data(features, labels, ratio=0.7, seed=None):
    '''
    Splits dataset into train/test parts using given ratio, separately for
    each class so both parts keep the class proportions;
    Parameters:
        features -  array of features;
        labels  -   array of corresponding labels (1 = spam, 0 = legit);
        ratio   -   fraction of each class that goes into the training part (0..1);
        seed    -   optional seed for a reproducible shuffle; with None the
                    global NumPy random state is used;
    Returns:
        train_data      -   array of training features;
        train_labels    -   array of training labels;
        test_data       -   array of testing features;
        test_labels     -   array of testing labels;
    Raises:
        ValueError      -   if ratio is outside [0, 1];
    '''

    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1')

    features = np.asarray(features)
    labels = np.asarray(labels)

    positive_data = features[labels == 1] # all spam features
    negative_data = features[labels == 0] # all legit features

    n_positive = positive_data.shape[0]
    n_negative = negative_data.shape[0]

    # A dedicated generator is used only when a seed is given; otherwise the
    # global state is used, so callers relying on np.random.seed still work.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Random order of the samples inside each class
    positive_order = rng.permutation(n_positive)
    negative_order = rng.permutation(n_negative)

    n_positive_train = int(n_positive * ratio)
    n_negative_train = int(n_negative * ratio)

    # Training data are the first 'ratio' part of the shuffled indices
    train_data = np.concatenate([positive_data[positive_order[:n_positive_train]],
                                 negative_data[negative_order[:n_negative_train]]])
    train_labels = np.concatenate([np.ones(n_positive_train, dtype=int),
                                   np.zeros(n_negative_train, dtype=int)])

    # Testing data are all indices that remain
    test_data = np.concatenate([positive_data[positive_order[n_positive_train:]],
                                negative_data[negative_order[n_negative_train:]]])
    test_labels = np.concatenate([np.ones(n_positive - n_positive_train, dtype=int),
                                  np.zeros(n_negative - n_negative_train, dtype=int)])

    return train_data, train_labels, test_data, test_labels



## Metrics

In [None]:
def get_confusion_matrix_values(y_true, y_pred):
    '''
    Returns the four cells of the top-left 2x2 corner of the confusion matrix;
    Parameters:
        y_true    -   array of true labels;
        y_pred    -   array of predicted labels;
    Returns:
        tuple (cm[0][0], cm[0][1], cm[1][0], cm[1][1]); for 0/1 labels
        scikit-learn documents these as TN, FP, FN, TP;
    '''
    matrix = confusion_matrix(y_true, y_pred)
    top_left, top_right = matrix[0, 0], matrix[0, 1]
    bottom_left, bottom_right = matrix[1, 0], matrix[1, 1]
    return (top_left, top_right, bottom_left, bottom_right)


In [None]:
def get_metrics(labels, predictions):
    '''
    Computes metrics;
    Parameters:
        labels    -   array of labels (0 = legit, 1 = spam);
        predictions  -   array of predictions (0 = legit, 1 = spam);
    Returns:
        FAR -   False Acceptance Rate, FP / (FP + TN);
        FRR -   False Rejection Rate, FN / (FN + TP);
        A rate is NaN when its denominator is zero (no samples of that class);
    '''
    # labels=[0, 1] forces a 2x2 matrix even if one class is absent, and
    # ravel() flattens it row by row into the four scalars TN, FP, FN, TP.
    # Unpacking the 2x2 matrix itself would fail: it has two rows, not four values.
    TN, FP, FN, TP = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()

    FAR = FP / (FP + TN) if (FP + TN) else float('nan')
    FRR = FN / (FN + TP) if (FN + TP) else float('nan')
    return FAR, FRR

## Model initialization

## Model training and evaluation

In [None]:
def evaluate(classifierType, hyperparameters, features, labels):
    '''
    Trains a classifier on a train split and measures its error rates
    on both the train and the test split;
    Parameters:
        classifierType      -   type of ML algorithm to use;
        hyperparameters     -   dictionary of model's parameters;
        features            -   array of features;
        labels              -   array of labels
    Returns:
        trainFAR    -   False Acceptance Rate for train dataset;
        trainFRR    -   False Rejection Rate for train dataset;
        testFAR     -   False Acceptance Rate for test dataset;
        testFRR     -   False Rejection Rate for test dataset;
    '''

    # Build the (still untrained) classifier from its type and parameters
    classifier = classifierType(**hyperparameters)

    # Split data
    X_train, y_train, X_test, y_test = split_data(features, labels)

    for caption, array in (('Train set shape:', X_train),
                           ('Train labels shape:', y_train),
                           ('Test set shape:', X_test),
                           ('Test labels shape:', y_test)):
        print(caption, array.shape)

    # Fit the model; the object returned by fit() is used for predictions
    trained = classifier.fit(X_train, y_train)

    # Predict and score the train part first, then the test part
    rates = []
    for inputs, truth in ((X_train, y_train), (X_test, y_test)):
        far, frr = get_metrics(truth, trained.predict(inputs))
        rates += [far, frr]

    return tuple(rates)

**Random Forest**

In [None]:
# Random Forest: classifier class and the keyword arguments it is built with.
classifierType = RandomForestClassifier
hyperparameters = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
)

In [None]:
# Smoke test: train the configured classifier and report its error rates.
trainFAR, trainFRR, testFAR, testFRR = evaluate(
    classifierType, hyperparameters, features, labels
)

# Blank lines between the shape printout of evaluate() and the metrics.
print('\n')
for split_name, far, frr in (('Train:', trainFAR, trainFRR),
                             ('Test:', testFAR, testFRR)):
    print(split_name)
    print('\tFAR:', far)
    print('\tFRR:', frr)

Train set shape: (856, 1024)
Train labels shape: (856,)
Test set shape: (367, 1024)
Test labels shape: (367,)


Train:
	FAR: 0.009523809523809525
	FRR: 0.013315579227696404
Test:
	FAR: 0.16216216216216217
	FRR: 0.05454545454545454


## Final Task

**SVM**

In [None]:
# SVM: classifier class and the keyword arguments it is built with.
classifierType = sklearn.svm.SVC
hyperparameters = {'C': 1.0,
                   'kernel': 'rbf',
                   # 'degree' is the polynomial degree and must be a non-negative
                   # integer; it only affects the 'poly' kernel, so the library
                   # default of 3 leaves the RBF model unchanged while passing
                   # scikit-learn's parameter validation.
                   'degree': 3,
                   'gamma': 'scale'}

In [None]:
# Train the configured classifier and report its error rates.
trainFAR, trainFRR, testFAR, testFRR = evaluate(
    classifierType, hyperparameters, features, labels
)

# Blank lines between the shape printout of evaluate() and the metrics.
print('\n')
for split_name, far, frr in (('Train:', trainFAR, trainFRR),
                             ('Test:', testFAR, testFRR)):
    print(split_name)
    print('\tFAR:', far)
    print('\tFRR:', frr)

Train set shape: (856, 1024)
Train labels shape: (856,)
Test set shape: (367, 1024)
Test labels shape: (367,)


Train:
	FAR: 0.011363636363636364
	FRR: 0.03515625
Test:
	FAR: 0.10810810810810811
	FRR: 0.048484848484848485


**K Neighbors**

In [None]:
# K Neighbors: classifier class and the keyword arguments it is built with.
classifierType = KNeighborsClassifier
hyperparameters = dict(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    leaf_size=30,
    p=2,
)

In [None]:
# Train the configured classifier and report its error rates.
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType, hyperparameters, features, labels)

# Blank lines between the shape printout of evaluate() and the metrics,
# matching the Random Forest and SVM report cells.
print('\n')
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

Train set shape: (856, 1024)
Train labels shape: (856,)
Test set shape: (367, 1024)
Test labels shape: (367,)
Train:
	FAR: 0.023529411764705882
	FRR: 0.040207522697795074
Test:
	FAR: 0.02857142857142857
	FRR: 0.045180722891566265
