In [19]:


import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression


In [20]:
DATA_PATH = "data.txt"
COLUMN_LABEL = "class"
COLUMN_TEXT = "context"

LABEL_LEGIT = 'LEGI'
LABEL_SPAM = 'SPAM'
LABEL_SMISHING = 'SMIS'

dataset = pd.read_csv(DATA_PATH, sep='\t', names=[COLUMN_LABEL, COLUMN_TEXT], header=None)
print('Total size:', dataset.shape[0])
print('Legit messages:', dataset[dataset[COLUMN_LABEL] == LABEL_LEGIT].shape[0])
print('Spam messages:', dataset[dataset[COLUMN_LABEL] == LABEL_SPAM].shape[0])
print('Smishing messages:', dataset[dataset[COLUMN_LABEL] == LABEL_SMISHING].shape[0])

dataset = dataset[((dataset[COLUMN_LABEL] == LABEL_LEGIT) | (dataset[COLUMN_LABEL] == LABEL_SMISHING))]

def messages2vectors(messages):

    elmo = hub.Module("https://tfhub.dev/google/elmo/1")
    features = np.zeros((0, 1024))
    n = 100
    l = int(len(messages) / n) if len(messages) % n == 0 else int(len(messages) / n) + 1

    for i in range(l):

        if (i + 1) * n < len(messages):
            right = (i + 1) * n
            embedds = elmo(messages[int(i * n) : right], signature="default", as_dict=True)["default"] 
        else:
            embedds = elmo(messages[:len(messages) - int(i * n)], signature="default", as_dict=True)["default"] 

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            embedds = sess.run(embedds)
            features = np.concatenate([features, embedds])

    return features

def convert_labels(labels_raw):

    labels = np.array([(0 if i=="LEGI" else 1) for i in labels_raw ])
    return labels

labels = convert_labels(dataset[COLUMN_LABEL])

Total size: 1540
Legit messages: 1050
Spam messages: 190
Smishing messages: 300


In [9]:
features = messages2vectors(dataset[COLUMN_TEXT])
print(features.shape)
print(labels.shape)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver

In [21]:
def split_data(features, labels, ratio=0.7):


    positive_data = features[labels == 1] 
    negative_data = features[labels == 0] 

   
    random_indecies_positive = np.arange(positive_data.shape[0])
    np.random.shuffle(random_indecies_positive)
    random_indecies_negative = np.arange(negative_data.shape[0])
    np.random.shuffle(random_indecies_negative)

    n_positive_train = int(positive_data.shape[0] * ratio)
    n_negative_train = int(negative_data.shape[0] * ratio)

    train_data = np.concatenate([positive_data[random_indecies_positive[:n_positive_train]], 
                                negative_data[random_indecies_negative[:n_negative_train]]])
    
    train_labels = np.asarray([1] * n_positive_train + [0] * n_negative_train)

    test_data = np.concatenate([positive_data[random_indecies_positive[n_positive_train:]], 
                                negative_data[random_indecies_negative[n_negative_train:]]])

    test_labels = np.asarray([1] * (positive_data.shape[0]  - n_positive_train) + [0] * (negative_data.shape[0] - n_negative_train))

    return train_data, train_labels, test_data, test_labels
def get_metrics(labels, predictions):

    cf = confusion_matrix(labels, predictions)
    FAR = cf[0][1]/(cf[0][1] + cf[0][0])
    FRR = cf[1][0]/(cf[1][0] + cf[1][1])
    return FAR*100, FRR*100


classifierType = [RandomForestClassifier, MultinomialNB,SVC]

def evaluate(classifierType, hyperparameters, features, labels,hp , cl = "not NB"): 

  
    train_data, train_labels, test_data, test_labels = split_data(features, labels, ratio=0.7)

    print('Train set shape:', train_data.shape)
    print('Train labels shape:', train_labels.shape)
    print('Test set shape:', test_data.shape)
    print('Test labels shape:', test_labels.shape)
    
    model  = GridSearchCV(classifierType(**hp),hyperparameters, n_jobs = -1,refit = "presicion_score")
   
    clf  = model.fit(train_data,train_labels)
   
    predictions_train = clf.predict(train_data)
    
    trainFAR, trainFRR = get_metrics(train_labels, predictions_train)

    predictions_test = clf.predict(test_data)

    testFAR, testFRR = get_metrics(test_labels,predictions_test)
    print("\tbest params are ",clf.best_params_)
    return trainFAR, trainFRR, testFAR, testFRR



#------------------------------------------------------------------------

    

In [5]:
print("   RFC :")
hp = {'n_estimators' : 70,
                'criterion' : 'gini',
                'max_depth' : None,
                'min_samples_split' : 2,
                'n_jobs' :-1}



hyperparameters = {'n_estimators' : list(range(75,200,5)),
                'criterion' : ['gini'],
                'max_depth' : [None],
                'min_samples_split' : [1,2,3,5]
                }
            
'''
#after a few sugestions with gridsearch e.g {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 5, 'n_estimators': 95}

hyperparameters = {'n_estimators' : [96,98,100,105,120,114,135,111,95],
                'criterion' : ['gini'],
                'max_depth' : [None],
                'min_samples_split' : [5]
                }
'''
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType[0], hyperparameters, features, labels,hp)
print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

   RFC :
Train set shape: (945, 1024)
Train labels shape: (945,)
Test set shape: (405, 1024)
Test labels shape: (405,)
	best params are  {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 85}
Train:
	FAR: 0.27210884353741494
	FRR: 10.952380952380953
Test:
	FAR: 4.761904761904762
	FRR: 18.88888888888889


In [11]:
print("   SVM: ")
hyperparameters=  [ 
                   {
                      'C': [10,20,15,5,9,13,25,50, 100,500, 1000], 
                      'gamma': [0.001,0.00075,0.0009,0.0005,0.0008,0.0015,0.0017,0.00173,0.0016,0.012,0.0175,0.002,0.0025,0.0015,0.003], 
                      'kernel': ['rbf']
                    }]
              
hp =    {             'C': 1, 
                      'kernel': 'linear'
                    }
trainFAR, trainFRR, testFAR, testFRR = evaluate(classifierType[2], hyperparameters, features, labels,hp)

print('Train:')
print('\tFAR:', trainFAR)
print('\tFRR:', trainFRR)

print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)

   SVM: 
Train set shape: (945, 1024)
Train labels shape: (945,)
Test set shape: (405, 1024)
Test labels shape: (405,)
	best params are  {'C': 20, 'gamma': 0.0015, 'kernel': 'rbf'}
Train:
	FAR: 0.27210884353741494
	FRR: 13.80952380952381
Test:
	FAR: 1.9047619047619049
	FRR: 17.77777777777778


In [22]:
#new data preprocessing
#new "split" for new data preprocessing !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

X_train, X_test, y_train, y_test = train_test_split(dataset['context'], 
                                                    labels, 
                                                    test_size=0.2,
                                                     random_state=42
                                                    )
print("train length: ", len(X_train))
print("test length: ", len(X_test))

train length:  1080
test length:  270


In [23]:
print(" NB (CountVectorizer)")

vect = CountVectorizer(ngram_range=[1,1]).fit(X_train)#it shows score 0.996 with 1-gram
X_train_vectorized = vect.transform(X_train)

model = MultinomialNB(alpha=0.1)
model_fit = model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))
testFAR, testFRR = get_metrics(y_test, predictions)
print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)
aucscore = roc_auc_score(y_test, predictions)

print(" general score:", aucscore)

tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f'\nTrue Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')



#apperantly it worked as good as logistic regression 

print("SVC n-grams")
clfsvc = SVC(**{'C': 10, 'gamma': 0.002, 'kernel': 'rbf'})
clfsvc.fit(X_train_vectorized, y_train)
predictions = clfsvc.predict(vect.transform(X_test))
testFAR, testFRR = get_metrics(y_test, predictions)
print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)
aucscore = roc_auc_score(y_test, predictions)

print(" general score:", aucscore)


 NB (CountVectorizer)
Test:
	FAR: 0.9615384615384616
	FRR: 0.0
 general score: 0.9951923076923077

True Positives: 62
False Positives: 2
True Negatives: 206
False Negatives: 0
SVC n-grams
Test:
	FAR: 1.4423076923076923
	FRR: 6.451612903225806
 general score: 0.9605303970223326


In [24]:
print(" NB (TfidfVectorizer)")
vect = TfidfVectorizer(min_df=2).fit(X_train)
X_train_vectorized = vect.transform(X_train)
model = sklearn.naive_bayes.MultinomialNB()
model_fit = model.fit(X_train_vectorized, y_train)

predictions = model_fit.predict(vect.transform(X_test))
testFAR, testFRR = get_metrics(y_test, predictions)
print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)
#proof
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f'\nTrue Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')





 NB (TfidfVectorizer)
Test:
	FAR: 0.0
	FRR: 14.516129032258066

True Positives: 53
False Positives: 0
True Negatives: 208
False Negatives: 9


In [25]:
#usage of the n-grams was the best intuitive idea due to the short form of masseges, adress form of text which can be 
# subjectively divided only with more then one word (click on,  you should, your bank..., the ... (talking about 3rd objects in 
# dialogue), ) but it didn't worked that good with nb. Everything got better when it's a weight-length part added to the features
def add_feature(X, feature_to_add):   
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')
print("LogisticRegression!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
vectorizer = TfidfVectorizer(min_df=2, ngram_range=[1,2])#it shows score <0.9 with 1-gram and 0.91 with adding 3-gram

X_train_transformed = vectorizer.fit_transform(X_train)
X_train_transformed_with_length = add_feature(X_train_transformed, [X_train.str.len(),
                                                                    X_train.apply(lambda x: len(''.join([a for a in x if a.isdigit()])))])

X_test_transformed = vectorizer.transform(X_test)
X_test_transformed_with_length = add_feature(X_test_transformed, [X_test.str.len(),
                                                                  X_test.apply(lambda x: len(''.join([a for a in x if a.isdigit()])))])

clf = LogisticRegression(penalty = "l2",C=100,solver = "lbfgs")

clf.fit(X_train_transformed_with_length, y_train)

predictions = clf.predict(X_test_transformed_with_length)
testFAR, testFRR = get_metrics(y_test, predictions)
print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)
aucscore = roc_auc_score(y_test, predictions)

print(" general score:", aucscore)
print("NB with n-grams :")
clf_main = MultinomialNB(alpha=0.1)
clf_main.fit(X_train_transformed_with_length, y_train)

predictions = clf_main.predict(X_test_transformed_with_length)
testFAR, testFRR = get_metrics(y_test, predictions)
print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)


LogisticRegression!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Test:
	FAR: 0.0
	FRR: 0.0
 general score: 1.0
NB with n-grams :
Test:
	FAR: 0.4807692307692308
	FRR: 6.451612903225806
