In [235]:
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import numpy as np
import pandas as pd
import sklearn
import re
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [236]:
# Path to the dataset file; the loading cell reads it with pandas as
# tab-separated text without a header row.
DATA_PATH = "SMS_dataset_preparation.txt"

# Names assigned to the two columns (label, message text) when the file is read.
# These are internal column names only; the file itself has no header.
COLUMN_LABEL = "lable"
COLUMN_TEXT = "text"

# Label values that mark the type of each message in the dataset.
LABEL_LEGIT = 'LEGI'
LABEL_SPAM = 'SPAM'
LABEL_SMISHING = 'SMIS'

In [237]:
# Read the tab-separated file (no header row) into a two-column frame and
# report how many messages carry each label.
dataset = pd.read_csv(DATA_PATH, sep='\t', names=[COLUMN_LABEL, COLUMN_TEXT], header=None)
print('Total size:', len(dataset))
for caption, label_value in (('Legit messages:', LABEL_LEGIT),
                             ('Spam messages:', LABEL_SPAM),
                             ('Smishing messages:', LABEL_SMISHING)):
    print(caption, len(dataset[dataset[COLUMN_LABEL] == label_value]))

Total size: 1509
Legit messages: 1060
Spam messages: 163
Smishing messages: 286


In [238]:
def find_url(message):
    """Return all URL-like substrings of ``message`` in order of appearance.

    Matching is case-insensitive and accepts ``http://`` / ``https://`` links,
    ``www.`` hosts and bare ``domain.tld/`` forms. Trailing punctuation such as
    a full stop or comma is not included in the returned URL.

    Parameters:
        message -   string to scan;
    Returns:
        list of matched URL strings (empty when nothing matches).
    """
    url_pattern = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
    # Group 1 spans the whole URL; the inner groups only serve the
    # balanced-parentheses handling and are not returned.
    return [match.group(1) for match in url_pattern.finditer(message)]

In [239]:
def find_numb(message):
    """Return all phone-number-like digit groups found in ``message``.

    Three shapes are recognised, each with an optional ``-``, ``.`` or
    whitespace separator between digit groups: ``ddd ddd dddd``,
    ``(ddd) ddd dddd`` and ``ddd dddd``.

    Parameters:
        message -   string to scan;
    Returns:
        list of matched substrings (empty when nothing matches).
    """
    phone_pattern = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')
    # The single capture group wraps the entire alternation.
    return [match.group(1) for match in phone_pattern.finditer(message)]

In [240]:
def find_email(message):
    """Return all e-mail addresses found in ``message``.

    An address is a local part (word characters, ``.``, ``+``, ``-``), a single
    ``@``, and a domain made of at least two dot-separated labels. The dot in
    the domain is matched literally, so a match stops at the end of the address
    instead of running on into the rest of the message, and trailing
    punctuation is not included.

    Parameters:
        message -   string to scan;
    Returns:
        list of matched address strings (empty when nothing matches).
    """
    # Only non-capturing groups are used so that findall yields whole matches.
    email_pattern = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')
    return email_pattern.findall(message)

In [241]:
def messages2vectors(messages, size):
    '''
    Transforms messages into binary feature-vectors;
    Parameters:
        messages    -   sequence of strings, accessed as messages[0..size-1];
        size        -   number of messages to transform;
    Returns:
        features    -   array of shape (size, 10); each column is 0 or 1:
            0 - no greeting keyword found;
            1 - no emotion keyword found;
            2 - a URL was found;
            3 - one of the special symbols was found;
            4 - the message is at least 150 characters long;
            5 - an action keyword (call, click, repli, ...) was found;
            6 - a currency marker was found;
            7 - a smishing keyword was found;
            8 - an e-mail address was found;
            9 - a phone number was found;
    '''

    greet_key = ['good morn', 'good afternoon', 'good even', 'good night',
                 'hi', 'hello', 'hey',
                 'gud nyt', 'good ni8', 'good nyt', 'goodnight']

    emotion_key = ['love', 'nice', 'sweet', 'happi',
                   'sad', 'angri', 'hurt', 'nasti', ':)', ':(']

    mat_symb = ['|', '{', '@', '[', '<', '!', '+', '(', '$', '/', '%', '^']

    sal = ['call', 'claim', 'reciev', 'click', 'enter',
           'visit', 'repli', 'send', 'contact', 'appli',
           'follow', 'subscrib', 'unsubscrib', 'answer']

    money_symb = ['UAH', '£', '$', '€']

    smis_key = ['award', 'congratul', 'winner', 'alert', 'claim',
                'activat', 'verifi', 'attempts', 'gift', 'voucher',
                'block', 'suspend', 'unlock', 'won', 'prize',
                'subscrib', 'activ', 'updat', 'coupon', 'refund']

    # Currency markers are compared in lower case: the text handed over by the
    # notebook has already been lower-cased, so 'UAH' could otherwise never match.
    money_lower = [symb.lower() for symb in money_symb]

    def has_any(text, needles):
        # Plain substring test, not whole-word matching.
        return any(needle in text for needle in needles)

    features = np.zeros((size, 10))

    for index in range(size):
        message = messages[index]
        # All keyword lists are lower-case, so match them case-insensitively.
        lowered = message.lower()
        row = features[index]

        # Columns 0 and 1 are inverted: 1 means the keyword group is ABSENT.
        row[0] = 0 if has_any(lowered, greet_key) else 1
        row[1] = 0 if has_any(lowered, emotion_key) else 1

        row[2] = 1 if find_url(message) else 0
        row[3] = 1 if has_any(message, mat_symb) else 0
        row[4] = 1 if len(message) >= 150 else 0
        row[5] = 1 if has_any(lowered, sal) else 0
        row[6] = 1 if has_any(lowered, money_lower) else 0
        row[7] = 1 if has_any(lowered, smis_key) else 0
        row[8] = 1 if find_email(message) else 0
        row[9] = 1 if find_numb(message) else 0

    return features

In [242]:
def convert_messages(messages):
    '''
    Normalises messages: lower-cases, strips URLs, tokenizes and stems;
    Parameters:
        messages    -   iterable of strings;
    Returns:
        converted   -   list of strings, one per message: the stemmed tokens
                        joined by single spaces, followed by the first URL of
                        the (lower-cased) message when it contained any;
    '''
    stemmer = PorterStemmer()
    url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

    converted = []
    for raw_message in messages:
        text = raw_message.lower()
        found_urls = find_url(text)
        if found_urls:
            # Remove every URL before tokenizing so it is not split or stemmed.
            text = re.sub(url_regex, '', text, flags=re.MULTILINE)

        stemmed = ' '.join(stemmer.stem(token) for token in word_tokenize(text))

        if found_urls:
            # Only the first URL is re-attached, unchanged, at the end.
            stemmed = stemmed + ' ' + found_urls[0]
        converted.append(stemmed)
    return converted

In [243]:
def convert_labels(labels_raw):
    '''
    Transforms labels into numerical values;
    'LEGI' and 'SPAM' become 0, 'SMIS' becomes 1.
    Parameters:
        labels_raw    -   iterable of text-labels;
    Returns:
        labels    -   integer array with one entry per input label;
    Raises:
        ValueError    -   if a label is not one of 'LEGI', 'SPAM', 'SMIS'.
                          Skipping such a label would shift every following
                          label against its feature row.
    '''

    mapping = {'LEGI': 0, 'SPAM': 0, 'SMIS': 1}

    label = []
    for position, lab in enumerate(labels_raw):
        if lab not in mapping:
            raise ValueError(
                'Unknown label %r at position %d; expected one of %s'
                % (lab, position, sorted(mapping)))
        label.append(mapping[lab])

    # Explicit dtype keeps the result integer-typed even for empty input.
    labels = np.array(label, dtype=int)

    return labels


In [244]:
# Stem the message texts, then build the feature matrix and the numeric labels.
dataset_steem = convert_messages(dataset[COLUMN_TEXT])
features = messages2vectors(dataset_steem, len(dataset))
labels = convert_labels(dataset[COLUMN_LABEL])
print(features.shape, labels.shape, sep='\n')

(1509, 10)
(1509,)


In [245]:
def split_data(features, labels, ratio=0.7, seed=None):
    '''
    Splits dataset into train/test parts using given ratio;
    The split is done per class, so both parts keep the class proportions.
    Parameters:
        features    -   array of features, one row per sample;
        labels      -   array of corresponding labels (0 or 1);
        ratio       -   fraction of each class that goes to the training part;
        seed        -   optional seed for a reproducible shuffle; when None the
                        global numpy random state is used;
    Returns:
        train_data      -   array of training features (label-1 rows first);
        train_labels    -   array of training labels;
        test_data       -   array of testing features (label-1 rows first);
        test_labels     -   array of testing labels;
    Raises:
        ValueError      -   if ratio is outside [0, 1];
    '''

    if not 0 <= ratio <= 1:
        # A ratio above 1 would produce more labels than data rows.
        raise ValueError('ratio must be between 0 and 1, got %r' % (ratio,))

    features = np.asarray(features)
    labels = np.asarray(labels)

    # Both the numpy.random module and a RandomState instance offer permutation().
    rng = np.random if seed is None else np.random.RandomState(seed)

    positive_data = features[labels == 1]  # rows labelled 1
    negative_data = features[labels == 0]  # rows labelled 0

    # Random order of row positions inside each class
    positive_order = rng.permutation(positive_data.shape[0])
    negative_order = rng.permutation(negative_data.shape[0])

    n_positive_train = int(positive_data.shape[0] * ratio)
    n_negative_train = int(negative_data.shape[0] * ratio)

    # Training data: the first 'ratio' part of each shuffled class
    train_data = np.concatenate([positive_data[positive_order[:n_positive_train]],
                                 negative_data[negative_order[:n_negative_train]]])
    train_labels = np.asarray([1] * n_positive_train + [0] * n_negative_train)

    # Testing data: everything that remains
    test_data = np.concatenate([positive_data[positive_order[n_positive_train:]],
                                negative_data[negative_order[n_negative_train:]]])
    test_labels = np.asarray([1] * (positive_data.shape[0] - n_positive_train)
                             + [0] * (negative_data.shape[0] - n_negative_train))

    return train_data, train_labels, test_data, test_labels

In [246]:
def get_metrics(labels, predictions):
    '''
    Computes metrics;
    Parameters:
        labels    -   array of true labels (0 or 1);
        predictions  -   array of predicted labels (0 or 1);
    Returns:
        FAR -   False Acceptance Rate in percent, FP / (FP + TN);
        FRR -   False Rejection Rate in percent, FN / (FN + TP);
        A rate is NaN when its denominator is zero, i.e. when the
        corresponding true class does not occur in labels.
    '''
    # labels=[0, 1] forces a 2x2 matrix even when only one class is present;
    # without it the four-way unpacking below fails on a 1x1 matrix.
    TN, FP, FN, TP = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()
    FAR = FP / (FP + TN) if (FP + TN) else float('nan')
    FRR = FN / (FN + TP) if (FN + TP) else float('nan')
    return FAR*100, FRR*100

In [247]:
def evaluate(classifierType, hyperparameters, features, labels):
    '''
    Trains a classifier on a fresh train/test split and measures its error rates;
    Parameters:
        classifierType      -   classifier class to instantiate;
        hyperparameters     -   dictionary of keyword arguments for that class;
        features            -   array of features;
        labels              -   array of labels
    Returns:
        trainFAR    -   False Acceptance Rate on the training part;
        trainFRR    -   False Rejection Rate on the training part;
        testFAR     -   False Acceptance Rate on the testing part;
        testFRR     -   False Rejection Rate on the testing part;
    '''

    classifier = classifierType(**hyperparameters)

    # split_data is called with its default ratio.
    train_data, train_labels, test_data, test_labels = split_data(features, labels)

    fitted = classifier.fit(train_data, train_labels)

    # Error rates on the data the model was trained on
    trainFAR, trainFRR = get_metrics(train_labels, fitted.predict(train_data))

    # Error rates on the held-out data
    testFAR, testFRR = get_metrics(test_labels, fitted.predict(test_data))

    return trainFAR, trainFRR, testFAR, testFRR

In [248]:
# Random Forest: 100 gini trees, unlimited depth, split on 2 or more samples.
classifierType1 = RandomForestClassifier
hyperparameters1 = dict(n_estimators=100,
                        criterion='gini',
                        max_depth=None,
                        min_samples_split=2)

In [249]:
# Check if it works :)
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType1, hyperparameters1, features, labels)
print("\tRandom Forest")
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)


print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

	Random Forest
Train:
	FAR: 3.2710280373831773
	FRR: 8.5
Test:
	FAR: 5.177111716621254
	FRR: 18.6046511627907


In [250]:
# SVM with an RBF kernel. 'degree' is documented as a non-negative integer that
# only the polynomial kernel uses, so it is set to the library default (3)
# rather than a fractional value that parameter validation can reject.
classifierType2 = sklearn.svm.SVC
hyperparameters2 = {'C': 1.0,
                   'kernel': 'rbf',
                   'degree': 3,
                   'gamma': 'scale'}

In [251]:
# Train and score the SVM, then print FAR/FRR per split.
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType2, hyperparameters2, features, labels)
print('\tSVM')
for split_title, far, frr in (('Train:', trainFAR, trainFRR),
                              ('Test:', testFAR, testFRR)):
    print(split_title)
    print('\tFAR:', far)
    print('\tFRR:', frr)

	SVM
Train:
	FAR: 3.9719626168224296
	FRR: 17.0
Test:
	FAR: 4.904632152588556
	FRR: 11.627906976744185


In [252]:
# k-nearest neighbours: 5 uniformly weighted neighbours, Minkowski p=2.
classifierType3 = KNeighborsClassifier
hyperparameters3 = dict(n_neighbors=5,
                        weights='uniform',
                        algorithm='auto',
                        leaf_size=30,
                        p=2)

In [253]:
# Train and score the k-nearest-neighbours model, then print FAR/FRR per split.
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType3, hyperparameters3, features, labels)
print('\tK Neighbors')
for split_title, far, frr in (('Train:', trainFAR, trainFRR),
                              ('Test:', testFAR, testFRR)):
    print(split_title)
    print('\tFAR:', far)
    print('\tFRR:', frr)

	K Neighbors
Train:
	FAR: 7.009345794392523
	FRR: 8.0
Test:
	FAR: 7.3569482288828345
	FRR: 15.11627906976744
