In [1]:
import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 

In [2]:
DATA_PATH = "C:\\Elisabeth\\Универ\\Samsung_Smishing\\2021-knu-smish\\elizabeth_skvortsova\\data\\dataset.txt"

# give name to label-column and text-column
COLUMN_LABEL = "class"
COLUMN_TEXT = "context"

# these are labels that indicate the type of message.
LABEL_LEGIT = 'LEGI'
LABEL_SPAM = 'SPAM'
LABEL_SMISHING = 'SMIS'

dataset = pd.read_csv(DATA_PATH, sep='\t', names=[COLUMN_LABEL, COLUMN_TEXT], header=None)
print('Total size:', dataset.shape[0])
print('Legit messages:', dataset[dataset[COLUMN_LABEL] == LABEL_LEGIT].shape[0])
print('Spam messages:', dataset[dataset[COLUMN_LABEL] == LABEL_SPAM].shape[0])
print('Smishing messages:', dataset[dataset[COLUMN_LABEL] == LABEL_SMISHING].shape[0])


dataset = dataset[((dataset[COLUMN_LABEL] == LABEL_LEGIT) | (dataset[COLUMN_LABEL] == LABEL_SMISHING))]

# Let's check if they are gone
print('spam messages:', dataset[dataset[COLUMN_LABEL] == LABEL_SPAM].shape[0])

Total size: 1571
Legit messages: 1051
Spam messages: 144
Smishing messages: 376
spam messages: 0


In [3]:
def messages2vectors(messages):
    '''
    Transforms single message into feature-vector;
    Parameters:
        messages    -   array of strings;
    Returns:
        features    -   array of feature-vectors;   
    '''

    elmo = hub.Module("https://tfhub.dev/google/elmo/1")

    features = np.zeros((0, 1024))
    n = 100
    l = int(len(messages) / n) if len(messages) % n == 0 else int(len(messages) / n) + 1
    for i in range(l):

        if (i + 1) * n < len(messages):
            right = (i + 1) * n
            embedds = elmo(messages[int(i * n) : right], signature="default", as_dict=True)["default"] 
        else:
            embedds = elmo(messages[:len(messages) - int(i * n)], signature="default", as_dict=True)["default"] 

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            embedds = sess.run(embedds)
            features = np.concatenate([features, embedds])

    return features

In [4]:
def convert_labels(labels_raw):

    # add your code here
    labels = np.zeros(len(labels_raw), 'int')
    for i in range(len(labels_raw)):
        if labels_raw[i] == 'SMIS':
            labels[i] = 1

    return labels

In [5]:
features = messages2vectors(dataset[COLUMN_TEXT])
labels = convert_labels(dataset[COLUMN_LABEL])
print(features.shape)
print(labels.shape)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver

In [6]:
def split_data(features, labels, ratio=0.7):
    '''
    Splits dataset into train/test parts using given ratio;
    Parameters:
        data    -   array of features;
        labels  -   array of corresponding labels;
        ratio   -   train/test size ratio;
    Returns:
        train_data      -   array of training features;   
        train_labels    -   array of training labels; 
        test_data       -   array of testing features; 
        test_labels     -   array of testing labels; 
    '''    


    positive_data = features[labels == 1] # all smish features
    negative_data = features[labels == 0] # all legit features

    # We shuffle arrays to get random samples later
    random_indecies_positive = np.arange(positive_data.shape[0])
    np.random.shuffle(random_indecies_positive)
    random_indecies_negative = np.arange(negative_data.shape[0])
    np.random.shuffle(random_indecies_negative)

    n_positive_train = int(positive_data.shape[0] * ratio)
    n_negative_train = int(negative_data.shape[0] * ratio)

    # Training data are all indecies in 'ratio' part of shuffled indecies
    train_data = np.concatenate([positive_data[random_indecies_positive[:n_positive_train]], 
                                negative_data[random_indecies_negative[:n_negative_train]]])
    
    train_labels = np.asarray([1] * n_positive_train + [0] * n_negative_train)

    # Testing data are all indecies that remain
    test_data = np.concatenate([positive_data[random_indecies_positive[n_positive_train:]], 
                                negative_data[random_indecies_negative[n_negative_train:]]])

    test_labels = np.asarray([1] * (positive_data.shape[0]  - n_positive_train) + [0] * (negative_data.shape[0] - n_negative_train))

    return train_data, train_labels, test_data, test_labels

In [7]:
def get_metrics(labels, predictions):
    '''
    Computes metrics;
    Parameters:
        labels    -   array of labels;
        predictions  -   array of predictions;
    Returns:
        FAR -   False Acceptance Rate;
        FRR -   False Rejection Rate;
    '''  
    # add your code here
    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()
    FAR = fp / (fp + tn) * 100
    FRR = fn / (tp + fn) * 100
    return FAR, FRR

In [8]:
def evaluate(classifierType, hyperparameters, features, labels):

    '''
    Splits dataset into train/test parts using given ratio;
    Parameters:
        classifierType      -   type of ML algorithm to use;
        hyperparameters     -   dictionary of model's parameters;
        features            -   array of features;
        labels              -   array of labels
    Returns:
        trainFAR    -   False Acceptance Rate for train dataset;
        trainFRR    -   False Rejection Rate for train dataset;
        testFAR     -   False Acceptance Rate for test dataset;
        testFRR    -   False Rejection Rate for test dataset;
    '''    

    model = GridSearchCV(classifierType(), hyperparameters, n_jobs = -1, refit = 'precision_score')

    # Split data
    train_data, train_labels, test_data, test_labels = split_data(features, labels, ratio=0.7) 

    # Fit your model
    # add your code here
    fitted_model = model.fit(train_data, train_labels)
    # Make predictions for training dataset
    # add your code here
    predict_train = fitted_model.predict(train_data)

    # Compute train FAR/FRR
    # add your code here
    trainFAR, trainFRR = get_metrics(train_labels, predict_train)

    # Make predictions for testing dataset
    # add your code here
    predictions_test = fitted_model.predict(test_data)  
    # Compute test FAR/FRR
    # add your code here
    testFAR, testFRR = get_metrics(test_labels, predictions_test)
    print('\tThe best parametrs are', fitted_model.best_params_)
    return trainFAR, trainFRR, testFAR, testFRR

First algorithm - RandomForest


In [20]:
classifierType = sklearn.ensemble.RandomForestClassifier
hyperparameters = {'n_estimators' : list(range(80, 150, 10)),
                'criterion' : ['gini'],
                'max_depth' : [None],
                'min_samples_split' : [2],
                'min_samples_leaf' : [2]}

Сначала проверяла по большему разбросу, в итоге сошлаось к таким вариантам как выше  

In [21]:
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType, hyperparameters, features, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

	The best parametrs are {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 140}
Train:
	FAR: 0.0
	FRR: 0.38022813688212925
Test:
	FAR: 0.31645569620253167
	FRR: 24.778761061946902


Second algorithm - SVC

In [18]:
classifierType_2 = sklearn.svm.SVC
hyperparameters_2 = { 'C' : [ 10, 30, 40, 100, 500, 1000 ], 
                'gamma' : [0.00313, 0.003, 0.0031, 0.0032, 0.029, 0.00328, 0.001, 0.01, 0.001], 
                'kernel' : ['rbf'],
                'class_weight' : ['balanced', None]}

Сначала проверяла по большему разбросу, в итоге сошлаось к таким вариантам как выше  

In [19]:
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType_2, hyperparameters_2, features, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

	The best parametrs are {'C': 30, 'class_weight': None, 'gamma': 0.01, 'kernel': 'rbf'}
Train:
	FAR: 0.0
	FRR: 0.0
Test:
	FAR: 1.5822784810126582
	FRR: 13.274336283185843


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

In [76]:
#Count Vectorizer with multinomial Naive Bayes
def train_2(dataset, labels):  
    #train test split
    train_data, test_data, train_label, test_label = train_test_split(dataset['context'], 
                                                    labels, 
                                                    random_state = 1)
                                                    
    #fitting and transforming train_data using a Count Vectorizer with default parameters
    vect = CountVectorizer().fit(train_data)
    train_data_vectorized = vect.transform(train_data)
    #fitting a multinomial Naive Bayes Classifier Model with smoothing alpha=0.1
    model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)
    model_fit = model.fit(train_data_vectorized, train_label)
    predictions_train = model.predict(vect.transform(train_data))
    trainFAR, trainFRR = get_metrics(train_label, predictions_train)
    predictions = model.predict(vect.transform(test_data))
    testFAR, testFRR = get_metrics(test_label, predictions)
    aucscore = roc_auc_score(test_label, predictions)
    print(aucscore)
    return trainFAR, trainFRR, testFAR, testFRR

In [77]:
trainFAR, trainFRR, testFAR, testFRR = train_2(dataset, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

0.9600713719270421
Train:
	FAR: 0.6321112515802781
	FRR: 0.7168458781362007
Test:
	FAR: 0.7692307692307693
	FRR: 7.216494845360824


In [134]:
#TfidfVectorizer with multinomial Naive Bayes
def train_3(dataset, labels):  
    classifierType = sklearn.naive_bayes.MultinomialNB
    hyperparameters = {'alpha' : list(range(0.008, 0.2, 0.002))}
    #train test split
    train_data, test_data, train_label, test_label = train_test_split(dataset['context'], 
                                                    labels, 
                                                    random_state = 1)
                                                    
    #fitting and transforming train_data using a Count Vectorizer with default parameters
    vect = TfidfVectorizer(min_df = 1).fit(train_data)
    train_data_vectorized = vect.transform(train_data)
    #fitting a multinomial Naive Bayes Classifier Model with smoothing alpha=0.1
    model = GridSearchCV(classifierType(), hyperparameters, n_jobs = -1, refit = 'precision_score')
    model_fit = model.fit(train_data_vectorized, train_label)
    predictions_train = model.predict(vect.transform(train_data))
    trainFAR, trainFRR = get_metrics(train_label, predictions_train)
    predictions = model.predict(vect.transform(test_data))
    testFAR, testFRR = get_metrics(test_label, predictions)
    aucscore = roc_auc_score(test_label, predictions)
    print(aucscore)
    print('\tThe best parametrs are', model_fit.best_params_)
    return trainFAR, trainFRR, testFAR, testFRR

In [135]:
trainFAR, trainFRR, testFAR, testFRR = train_3(dataset, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

TypeError: 'float' object cannot be interpreted as an integer

In [111]:
#Count Vectorizer with RandomForestClassifier
def train_4(dataset, labels):  
    classifierType = sklearn.ensemble.RandomForestClassifier
    hyperparameters = {'n_estimators' : list(range(50, 200, 10)),
                'criterion' : ['gini'],
                'max_depth' : [None],
                'min_samples_split' : [2, 3, 4],
                'min_samples_leaf' : [2, 4, 3]}
    #train test split
    train_data, test_data, train_label, test_label = train_test_split(dataset['context'], 
                                                    labels, 
                                                    random_state = 1)
                                                    
    #fitting and transforming train_data using a Count Vectorizer with default parameters
    vect = CountVectorizer().fit(train_data)
    train_data_vectorized = vect.transform(train_data)
    model = GridSearchCV(classifierType(), hyperparameters, n_jobs = -1, refit = 'precision_score')
    model_fit = model.fit(train_data_vectorized, train_label)
    predictions_train = model.predict(vect.transform(train_data))
    trainFAR, trainFRR = get_metrics(train_label, predictions_train)
    predictions = model.predict(vect.transform(test_data))
    testFAR, testFRR = get_metrics(test_label, predictions)
    aucscore = roc_auc_score(test_label, predictions)
    print(aucscore)
    
    print('\tThe best parametrs are', model_fit.best_params_)
    return trainFAR, trainFRR, testFAR, testFRR

In [112]:
trainFAR, trainFRR, testFAR, testFRR = train_4(dataset, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

0.8195876288659794
	The best parametrs are {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Train:
	FAR: 0.0
	FRR: 11.469534050179211
Test:
	FAR: 0.0
	FRR: 36.08247422680412


In [116]:
#Count Vectorizer with SVC
def train_5(dataset, labels):  
    classifierType = sklearn.svm.SVC
    hyperparameters_2 = { 'C' : [ 10, 30, 40, 100, 500, 1000 ], 
                'gamma' : [0.00313, 0.003, 0.0031, 0.0032, 0.029, 0.00328, 0.001, 0.01, 0.001], 
                'kernel' : ['rbf'],
                'class_weight' : ['balanced', None]}
    #train test split
    train_data, test_data, train_label, test_label = train_test_split(dataset['context'], 
                                                    labels, 
                                                    random_state = 1)
                                                    
    #fitting and transforming train_data using a Count Vectorizer with default parameters
    vect = CountVectorizer().fit(train_data)
    train_data_vectorized = vect.transform(train_data)
    model = GridSearchCV(classifierType(), hyperparameters_2, n_jobs = -1, refit = 'precision_score')
    model_fit = model.fit(train_data_vectorized, train_label)
    predictions_train = model.predict(vect.transform(train_data))
    trainFAR, trainFRR = get_metrics(train_label, predictions_train)
    predictions = model.predict(vect.transform(test_data))
    testFAR, testFRR = get_metrics(test_label, predictions)
    aucscore = roc_auc_score(test_label, predictions)
    print(aucscore)
    
    print('\tThe best parametrs are', model_fit.best_params_)
    return trainFAR, trainFRR, testFAR, testFRR

In [117]:
trainFAR, trainFRR, testFAR, testFRR = train_5(dataset, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

0.9465305313243457
	The best parametrs are {'C': 10, 'class_weight': None, 'gamma': 0.01, 'kernel': 'rbf'}
Train:
	FAR: 0.0
	FRR: 0.7168458781362007
Test:
	FAR: 0.38461538461538464
	FRR: 10.309278350515463


In [120]:
#TfidfVectorizer with SVC
def train_6(dataset, labels):  
    classifierType = sklearn.svm.SVC
    hyperparameters_2 = { 'C' : [ 10, 30, 40, 100, 500, 1000 ], 
                'gamma' : [0.00313, 0.003, 0.0031, 0.0032, 0.029, 0.00328, 0.001, 0.01, 0.001], 
                'kernel' : ['rbf'],
                'class_weight' : ['balanced', None]}
    #train test split
    train_data, test_data, train_label, test_label = train_test_split(dataset['context'], 
                                                    labels, 
                                                    random_state = 1)
                                                    
    #fitting and transforming train_data using a Count Vectorizer with default parameters
    vect = TfidfVectorizer(min_df = 1).fit(train_data)
    train_data_vectorized = vect.transform(train_data)
    model = GridSearchCV(classifierType(), hyperparameters_2, n_jobs = -1, refit = 'precision_score')
    model_fit = model.fit(train_data_vectorized, train_label)
    predictions_train = model.predict(vect.transform(train_data))
    trainFAR, trainFRR = get_metrics(train_label, predictions_train)
    predictions = model.predict(vect.transform(test_data))
    testFAR, testFRR = get_metrics(test_label, predictions)
    aucscore = roc_auc_score(test_label, predictions)
    print(aucscore)
    
    print('\tThe best parametrs are', model_fit.best_params_)
    return trainFAR, trainFRR, testFAR, testFRR

In [121]:
trainFAR, trainFRR, testFAR, testFRR = train_6(dataset, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

0.9703806502775575
	The best parametrs are {'C': 1000, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'rbf'}
Train:
	FAR: 0.0
	FRR: 0.0
Test:
	FAR: 0.7692307692307693
	FRR: 5.154639175257731


In [122]:
#TfidfVectorizer with RandomForestClassifier
def train_7(dataset, labels):  
    classifierType = sklearn.ensemble.RandomForestClassifier
    hyperparameters = {'n_estimators' : list(range(50, 200, 10)),
                'criterion' : ['gini'],
                'max_depth' : [None],
                'min_samples_split' : [2, 3, 4],
                'min_samples_leaf' : [2, 4, 3]}
    #train test split
    train_data, test_data, train_label, test_label = train_test_split(dataset['context'], 
                                                    labels, 
                                                    random_state = 1)
                                                    
    #fitting and transforming train_data using a Count Vectorizer with default parameters
    vect = TfidfVectorizer(min_df = 1).fit(train_data)
    train_data_vectorized = vect.transform(train_data)
    model = GridSearchCV(classifierType(), hyperparameters, n_jobs = -1, refit = 'precision_score')
    model_fit = model.fit(train_data_vectorized, train_label)
    predictions_train = model.predict(vect.transform(train_data))
    trainFAR, trainFRR = get_metrics(train_label, predictions_train)
    predictions = model.predict(vect.transform(test_data))
    testFAR, testFRR = get_metrics(test_label, predictions)
    aucscore = roc_auc_score(test_label, predictions)
    print(aucscore)
    print('\tThe best parametrs are', model_fit.best_params_)
    return trainFAR, trainFRR, testFAR, testFRR

In [123]:
trainFAR, trainFRR, testFAR, testFRR = train_7(dataset, labels)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

0.8195876288659794
	The best parametrs are {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 100}
Train:
	FAR: 0.0
	FRR: 8.960573476702509
Test:
	FAR: 0.0
	FRR: 36.08247422680412
