In [1]:
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 

In [2]:
DATA_PATH = "C:\\Elisabeth\\Универ\\Samsung_Smishing\\2021-knu-smish\\elizabeth_skvortsova\\data\\dataset.txt"

COLUMN_LABEL = "LABEL"
COLUMN_TEXT = "CONTEXT"

LABEL_LEGIT = 'LEGI'
LABEL_SPAM = 'SPAM'
LABEL_SMISHING = 'SMIS'

dataset = pd.read_csv(DATA_PATH, sep='\t', names=[COLUMN_LABEL, COLUMN_TEXT], header=None)
print('Total size:', dataset.shape[0])
print('Legit messages:', dataset[dataset[COLUMN_LABEL] == LABEL_LEGIT].shape[0])
print('Spam messages:', dataset[dataset[COLUMN_LABEL] == LABEL_SPAM].shape[0])
print('Smishing messages:', dataset[dataset[COLUMN_LABEL] == LABEL_SMISHING].shape[0])

dataset = dataset[((dataset[COLUMN_LABEL] == LABEL_LEGIT) | (dataset[COLUMN_LABEL] == LABEL_SPAM))]

# Let's check if they are gone
print('Smishing messages:', dataset[dataset[COLUMN_LABEL] == LABEL_SMISHING].shape[0])

Total size: 1571
Legit messages: 1051
Spam messages: 144
Smishing messages: 376
Smishing messages: 0


In [3]:
def messages2vectors(messages):
    '''
    Transforms single message into feature-vector;
    Parameters:
        messages    -   array of strings;
    Returns:
        features    -   array of feature-vectors;   
    '''

    elmo = hub.Module("https://tfhub.dev/google/elmo/1")

    features = np.zeros((0, 1024))
    n = 100
    l = int(len(messages) / n) + 1 if len(messages) % 2 != 0 else int(len(messages) / n)
    for i in range(l):
        if i * n == len(messages):
            break
        if (i + 1) * n < len(messages):
            right = (i + 1) * n
            embedds = elmo(messages[int(i * n) : right], signature="default", as_dict=True)["default"] 
        else:
            embedds = elmo(messages[:len(messages) - int(i * n)], signature="default", as_dict=True)["default"] 

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            embedds = sess.run(embedds)
            features = np.concatenate([features, embedds])

    return features

In [4]:
def convert_labels(labels_raw):

    # add your code here
    labels = np.zeros(len(labels_raw), 'int')
    for i in range(len(labels_raw)):
        if labels_raw[i] == 'SPAM':
            labels[i] = 1

    return labels

In [5]:
features = messages2vectors(dataset[COLUMN_TEXT])
labels = convert_labels(dataset[COLUMN_LABEL])
print(features.shape)
print(labels.shape)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver

In [6]:
def split_data(features, labels, ratio=0.7):
    '''
    Splits dataset into train/test parts using given ratio;
    Parameters:
        data    -   array of features;
        labels  -   array of corresponding labels;
        ratio   -   train/test size ratio;
    Returns:
        train_data      -   array of training features;   
        train_labels    -   array of training labels; 
        test_data       -   array of testing features; 
        test_labels     -   array of testing labels; 
    '''    


    positive_data = features[labels == 1] # all spam features
    negative_data = features[labels == 0] # all legit features

    # We shuffle arrays to get random samples later
    random_indecies_positive = np.arange(positive_data.shape[0])
    np.random.shuffle(random_indecies_positive)
    random_indecies_negative = np.arange(negative_data.shape[0])
    np.random.shuffle(random_indecies_negative)

    n_positive_train = int(positive_data.shape[0] * ratio)
    n_negative_train = int(negative_data.shape[0] * ratio)

    # Training data are all indecies in 'ratio' part of shuffled indecies
    train_data = np.concatenate([positive_data[random_indecies_positive[:n_positive_train]], 
                                negative_data[random_indecies_negative[:n_negative_train]]])
    
    train_labels = np.asarray([1] * n_positive_train + [0] * n_negative_train)

    # Testing data are all indecies that remain
    test_data = np.concatenate([positive_data[random_indecies_positive[n_positive_train:]], 
                                negative_data[random_indecies_negative[n_negative_train:]]])

    test_labels = np.asarray([1] * (positive_data.shape[0]  - n_positive_train) + [0] * (negative_data.shape[0] - n_negative_train))

    return train_data, train_labels, test_data, test_labels

In [7]:
def get_metrics(labels, predictions):
    '''
    Computes metrics;
    Parameters:
        labels    -   array of labels;
        predictions  -   array of predictions;
    Returns:
        FAR -   False Acceptance Rate;
        FRR -   False Rejection Rate;
    '''  
    # add your code here
    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()
    FRR = fp / (fp + tn) * 100
    FAR = fn / (tp + fn) * 100
    return FAR, FRR

In [8]:
def evaluate(classifierType, hyperparameters, features, labels):

    '''
    Splits dataset into train/test parts using given ratio;
    Parameters:
        classifierType      -   type of ML algorithm to use;
        hyperparameters     -   dictionary of model's parameters;
        features            -   array of features;
        labels              -   array of labels
    Returns:
        trainFAR    -   False Acceptance Rate for train dataset;
        trainFRR    -   False Rejection Rate for train dataset;
        testFAR     -   False Acceptance Rate for test dataset;
        testFRR    -   False Rejection Rate for test dataset;
    '''    

    model = GridSearchCV(classifierType(), hyperparameters, n_jobs = -1, refit = 'precision_score')

    # Split data
    train_data, train_labels, test_data, test_labels = split_data(features, labels, ratio=0.7) 

    # Fit your model
    # add your code here
    fitted_model = model.fit(train_data, train_labels)
    # Make predictions for training dataset
    # add your code here
    predict_train = fitted_model.predict(train_data)

    # Compute train FAR/FRR
    # add your code here
    trainFAR, trainFRR = get_metrics(train_labels, predict_train)

    # Make predictions for testing dataset
    # add your code here
    predictions_test = fitted_model.predict(test_data)  
    # Compute test FAR/FRR
    # add your code here
    testFAR, testFRR = get_metrics(test_labels, predictions_test)
    print('\tThe best parametrs are', fitted_model.best_params_)
    return trainFAR, trainFRR, testFAR, testFRR

First algorithm - RandomForest


In [32]:
classifierType = sklearn.ensemble.RandomForestClassifier
hyperparameters = {'n_estimators' : list(range(3, 20, 1)) + list(range(20, 400, 50)),
                'criterion' : ['gini'],
                'max_depth' : [None],
                'min_samples_split' : list(range(2, 3, 1)),
                'min_samples_leaf' : list(range(1, 3, 1))}

Сначала проверяла по большему разбросу, в итоге сошлаось к таким вариантам как выше  

In [33]:
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType, hyperparameters, features, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

	The best parametrs are {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 11}
Train:
	FAR: 2.585034013605442
	FRR: 30.0
Test:
	FAR: 5.69620253164557
	FRR: 93.18181818181817


Second algorithm - SVC

In [34]:
classifierType_2 = sklearn.svm.SVC
hyperparameters_2 = { 'C' : [ 1, 2, 3, 4, 5, 6, 7, 8, ], 
                'gamma' : [0.00313, 0.003, 0.0031, 0.0032, 0.029, 0.00328], 
                'kernel' : ['rbf', 'linear'],
                'class_weight' : ['balanced', None]}

Сначала проверяла по большему разбросу, в итоге сошлаось к таким вариантам как выше  

In [159]:
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType_2, hyperparameters_2, features, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

	The best parametrs are {'C': 1, 'class_weight': None, 'gamma': 0.029, 'kernel': 'rbf'}
Train:
	FAR: 0.0
	FRR: 67.0
Test:
	FAR: 1.2658227848101267
	FRR: 86.36363636363636
