In [1]:


import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression


In [2]:
# --- Dataset location and column names ---
DATA_PATH = "data.txt"
COLUMN_LABEL = "class"
COLUMN_TEXT = "context"

# --- Class tags as they appear in the label column ---
LABEL_LEGIT = 'LEGI'
LABEL_SPAM = 'SPAM'
LABEL_SMISHING = 'SMIS'

# Tab-separated file without a header row: first column is the label, second the message text.
dataset = pd.read_csv(DATA_PATH, sep='\t', header=None, names=[COLUMN_LABEL, COLUMN_TEXT])

# Report the overall size and the per-class row counts before any filtering.
print('Total size:', len(dataset))
for caption, tag in (('Legit messages:', LABEL_LEGIT),
                     ('Spam messages:', LABEL_SPAM),
                     ('Smishing messages:', LABEL_SMISHING)):
    print(caption, int((dataset[COLUMN_LABEL] == tag).sum()))

# Keep only the legit and smishing rows; every other class (e.g. spam) is dropped.
dataset = dataset[dataset[COLUMN_LABEL].isin([LABEL_LEGIT, LABEL_SMISHING])]



def convert_labels(labels_raw, legit_label="LEGI"):
    """Encode raw class tags as a binary integer vector.

    Parameters
    ----------
    labels_raw : iterable of str
        Raw class tags (e.g. a pandas Series or a list).
    legit_label : str, default "LEGI"
        Tag that is mapped to 0; every other tag is mapped to 1.

    Returns
    -------
    numpy.ndarray
        Integer array with one 0/1 entry per input tag.
    """
    # dtype=int keeps the result integer-typed even when labels_raw is empty
    # (np.array([]) would otherwise default to float64).
    return np.array([0 if tag == legit_label else 1 for tag in labels_raw], dtype=int)

# Binary targets for the rows kept in `dataset`: 0 for the "LEGI" tag, 1 for any other tag.
labels = convert_labels(dataset[COLUMN_LABEL])

Total size: 1540
Legit messages: 1050
Spam messages: 190
Smishing messages: 300


In [3]:
# New data preprocessing: hold out 20% of the messages as a test set.
# The seed is fixed so that the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset[COLUMN_TEXT],
    labels,
    random_state=42,
    test_size=0.2,
)

print("train length: ", len(X_train))
print("test length: ", len(X_test))

train length:  1080
test length:  270


In [4]:


def get_metrics(labels, predictions):
    """Compute the two error rates of a binary 0/1 classifier, in percent.

    Parameters
    ----------
    labels : array-like of int
        True classes, encoded as 0 / 1.
    predictions : array-like of int
        Predicted classes, encoded as 0 / 1.

    Returns
    -------
    (FAR, FRR) : tuple of float
        FAR = fn / (fn + tp) * 100, the share of class-1 samples predicted as 0.
        FRR = fp / (fp + tn) * 100, the share of class-0 samples predicted as 1.
        A rate is NaN when its class does not occur in `labels`.
    """
    # labels=[0, 1] forces a 2x2 matrix; without it a single-class input yields a
    # 1x1 matrix and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()

    positives = fn + tp
    negatives = fp + tn
    # Report an undefined rate as an explicit NaN instead of relying on a 0/0 division.
    FAR = fn / positives if positives else float("nan")
    FRR = fp / negatives if negatives else float("nan")
    return FAR * 100, FRR * 100



    

In [11]:
def _print_test_scores(y_true, y_pred):
    """Print FAR, FRR and the ROC-AUC of hard predictions; return them as a tuple."""
    far, frr = get_metrics(y_true, y_pred)
    print('Test:')
    print('\tFAR:', far)
    print('\tFRR:', frr)
    # NOTE: y_pred holds hard 0/1 labels, so this value is the mean of TPR and TNR
    # (balanced accuracy) rather than a threshold-free AUC over scores.
    auc = roc_auc_score(y_true, y_pred)
    print(" general score:", auc)
    return far, frr, auc


print(" NB (CountVectorizer)")

# ngram_range is passed as a tuple (the documented type); recent scikit-learn
# releases reject a list here. It scored 0.996 with 1-grams only.
vect = CountVectorizer(ngram_range=(1, 2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
# Vectorize the test set once and reuse it for all three classifiers below.
X_test_vectorized = vect.transform(X_test)

model = MultinomialNB(alpha=0.1)
model_fit = model.fit(X_train_vectorized, y_train)
predictions = model.predict(X_test_vectorized)
testFAR, testFRR, aucscore = _print_test_scores(y_test, predictions)

# labels=[0, 1] keeps the matrix 2x2 even if a class is missing from the test split.
tn, fp, fn, tp = confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()
print(f'\nTrue Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')


# Apparently it worked as well as logistic regression.

print("\nSVC n-grams")
clfsvc = SVC(C=10, gamma=0.002, kernel='rbf')
clfsvc.fit(X_train_vectorized, y_train)
predictions = clfsvc.predict(X_test_vectorized)
testFAR, testFRR, aucscore = _print_test_scores(y_test, predictions)


print("\nRFC n-grams")
# Seeded so the forest (and therefore the reported scores) is reproducible across runs.
clfrfc = RandomForestClassifier(max_depth=None, n_estimators=160, random_state=42)
clfrfc.fit(X_train_vectorized, y_train)
predictions = clfrfc.predict(X_test_vectorized)
testFAR, testFRR, aucscore = _print_test_scores(y_test, predictions)


 NB (CountVectorizer)
Test:
	FAR: 0.0
	FRR: 0.0
 general score: 1.0

True Positives: 62
False Positives: 0
True Negatives: 208
False Negatives: 0

SVC n-grams
Test:
	FAR: 1.6129032258064515
	FRR: 0.0
 general score: 0.9919354838709677

RFC n-grams
Test:
	FAR: 9.67741935483871
	FRR: 0.0
 general score: 0.9516129032258065


In [12]:
print(" NB (TfidfVectorizer)")

# Learn the TF-IDF vocabulary from the training messages only; min_df=2 drops
# terms that appear in fewer than two training documents.
vect = TfidfVectorizer(min_df=2)
X_train_vectorized = vect.fit_transform(X_train)

# Multinomial naive Bayes with default settings; fit() returns the estimator itself.
model = MultinomialNB()
model_fit = model.fit(X_train_vectorized, y_train)

predictions = model_fit.predict(vect.transform(X_test))
testFAR, testFRR = get_metrics(y_test, predictions)
print('Test:')
for caption, rate in (('\tFAR:', testFAR), ('\tFRR:', testFRR)):
    print(caption, rate)

# Proof: the raw confusion-matrix counts behind the two rates above.
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f'\nTrue Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')





 NB (TfidfVectorizer)
Test:
	FAR: 14.516129032258066
	FRR: 0.0

True Positives: 53
False Positives: 0
True Negatives: 208
False Negatives: 9


In [13]:
# Using n-grams was the most intuitive idea because the messages are short and written as a direct address, so the text can
# only be meaningfully divided into groups of more than one word (click on, you should, your bank..., the ... (talking about
# third parties in the dialogue)), but it didn't work that well with NB. Everything got better once a weight/length part was added to the features.
def add_feature(X, feature_to_add):
    """Append extra feature column(s) to the right of a sparse matrix.

    `feature_to_add` is converted to a sparse matrix and transposed, so each
    row of it becomes one new column of the result. The result is returned
    in CSR format.
    """
    extra_columns = csr_matrix(feature_to_add).transpose()
    return hstack([X, extra_columns], format='csr')
def _length_and_digit_counts(texts):
    """Return [character count, digit count] per message, as two pandas Series."""
    digit_counts = texts.apply(lambda message: sum(map(str.isdigit, message)))
    return [texts.str.len(), digit_counts]


print("LogisticRegression!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
# ngram_range is passed as a tuple (the documented type); recent scikit-learn
# releases reject a list here.
# It shows a score < 0.9 with 1-grams only and 0.91 when 3-grams are added.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2))

# Fit the vocabulary on the training messages only, then append the two
# hand-crafted columns (message length, number of digits) to both splits.
X_train_transformed = vectorizer.fit_transform(X_train)
X_train_transformed_with_length = add_feature(X_train_transformed, _length_and_digit_counts(X_train))

X_test_transformed = vectorizer.transform(X_test)
X_test_transformed_with_length = add_feature(X_test_transformed, _length_and_digit_counts(X_test))

clf = LogisticRegression(penalty="l2", C=100, solver="lbfgs")

clf.fit(X_train_transformed_with_length, y_train)

predictions = clf.predict(X_test_transformed_with_length)
testFAR, testFRR = get_metrics(y_test, predictions)
print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)
# NOTE: computed from hard 0/1 predictions, so this is the mean of TPR and TNR
# rather than a threshold-free AUC over scores.
aucscore = roc_auc_score(y_test, predictions)

print(" general score:", aucscore)

print("\nNB with n-grams :")
clf_main = MultinomialNB(alpha=0.1)
clf_main.fit(X_train_transformed_with_length, y_train)

predictions = clf_main.predict(X_test_transformed_with_length)
testFAR, testFRR = get_metrics(y_test, predictions)
print('Test:')
print('\tFAR:', testFAR)
print('\tFRR:', testFRR)


LogisticRegression!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Test:
	FAR: 0.0
	FRR: 0.0
 general score: 1.0

NB with n-grams :
Test:
	FAR: 6.451612903225806
	FRR: 0.4807692307692308
