# Evaluation of Individual Word Risks

In [1]:
import numpy as np
import pandas as pd

# pre-processing
import string
import nltk
from nltk.corpus import stopwords # stopwords

# model loading packages
import pickle
import fasttext
from word2vec import get_embed_features
from simpletransformers.classification import ClassificationModel, ClassificationArgs # bert
from sklearn.metrics import classification_report # bert
from scipy.special import softmax # bert



In [13]:
def pre_processing(text):
    '''
    Accepts a text and processes text
    '''
    # strip accents
    text = text.encode('ascii', 'ignore')
    text = str(text.decode("utf-8"))

    # covert to lowercase
    text = text.lower()

    # remove punctuation
    text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))

    # remove unnecessary white spaces
    text = text.replace("\n", "")

    # tokenize
    text_words = nltk.word_tokenize(text)

    # remove stop words
    stop = list(stopwords.words('english'))
    keep_stopwords = ["no", "not", "nor"]
    for word in keep_stopwords:
        stop.remove(word)
        stop = set(stop)
    text_words = [x for x in text_words if not x in stop]

    return ' '.join(text_words)

In [14]:
# load data
all_train = pd.read_csv('data/all_train.csv', header = 0)[["date_time", "text", "label"]]
all_test = pd.read_csv('data/all_test.csv', header = 0)[["date_time", "text", "label"]]

# pre-process
all_train["text"] = all_train["text"].apply(lambda x: pre_processing(x))
all_test["text"] = all_test["text"].apply(lambda x: pre_processing(x))

# extract unique words
all_words = set()

for row in all_train.text:
    row_words = row.split(" ")
    for word in row_words:
        all_words.add(word)

for row in all_test.text:
    row_words = row.split(" ")
    for word in row_words:
        all_words.add(word)

In [27]:
word_embeddings = get_embed_features(pd.Series(["father recently hacked bittrex account drained bitfinex\
account father apparently head countless warnings enable 2fa bittrex 10k stolen someone hacked account \
looked receiving address etherscan shill address withdrawals another shill address one led bitfinex address \
https etherscan io address 0x876eabf441b2ee5b5b0554fd503412a8e0600950cfa tokentxns bitfinex account support says \
not priority expect get funds back wanted see could get account closed \
considering 15m help appreciated"]))

In [28]:
# load logistic regression model
model_word2vec_lr = pickle.load(open('./models/word2vec/lr.sav', 'rb'))
pred_word2vec_lr = model_word2vec_lr.predict_proba(word_embeddings)
# res_word2vec_lr = pd.DataFrame(pred_word2vec_lr)
# res_word2vec_lr.insert(0, "word", list(all_words))
# res_word2vec_lr.columns = ["word", "prob_0", "prob_1"]
pred_word2vec_lr

array([[0.81022378, 0.18977622]])

In [25]:
word_embeddings = get_embed_features(pd.Series(["father recently hacked \
bittrex account drained bitfinex account father apparently head countless warnings enable 2fa \
bittrex 10k stolen someone hacked account looked receiving address etherscan shill address withdrawals \
another shill address one led bitfinex address https etherscan io address \
tokentxns bitfinex account support says not priority expect get funds back wanted see could get account closed considering 15m help appreciated"]))

In [26]:
model_word2vec_lr = pickle.load(open('./models/word2vec/lr.sav', 'rb'))
pred_word2vec_lr = model_word2vec_lr.predict_proba(word_embeddings)
# res_word2vec_lr = pd.DataFrame(pred_word2vec_lr)
# res_word2vec_lr.insert(0, "word", list(all_words))
# res_word2vec_lr.columns = ["word", "prob_0", "prob_1"]
pred_word2vec_lr

array([[0.8150797, 0.1849203]])

## Word2Vec

In [4]:
# generate word embeddings
word_embeddings = get_embed_features(pd.Series(list(all_words)))

### Logistic Regression

In [5]:
# load logistic regression model
model_word2vec_lr = pickle.load(open('./models/word2vec/lr.sav', 'rb'))
pred_word2vec_lr = model_word2vec_lr.predict_proba(word_embeddings)
res_word2vec_lr = pd.DataFrame(pred_word2vec_lr)
res_word2vec_lr.insert(0, "word", list(all_words))
res_word2vec_lr.columns = ["word", "prob_0", "prob_1"]

In [6]:
res_word2vec_lr.sort_values(by=["prob_1"], ascending=False).head(20)

Unnamed: 0,word,prob_0,prob_1
5359,allegedly,2.3e-05,0.999977
1451,ss,4e-05,0.99996
10325,unidentified,5.3e-05,0.999947
12864,arrested,6.2e-05,0.999938
19248,unknown,8.3e-05,0.999917
6256,misconfigured,9.8e-05,0.999902
8513,raided,0.000116,0.999884
15370,indicted,0.000159,0.999841
16311,trades,0.000161,0.999839
14836,otc,0.000163,0.999837


In [7]:
# save results
res_word2vec_lr.to_csv('data/evaluation_wordrisk/word2vec_lr.csv', header=True, index=False)

### SVM

In [8]:
# load svm model
model_word2vec_svm = pickle.load(open('./models/word2vec/svm.txt', 'rb'))
pred_word2vec_svm = model_word2vec_svm.predict_proba(word_embeddings)
res_word2vec_svm = pd.DataFrame(pred_word2vec_svm)
res_word2vec_svm.insert(0, "word", list(all_words))
res_word2vec_svm.columns = ["word", "prob_0", "prob_1"]

In [9]:
res_word2vec_svm.sort_values(by=["prob_1"], ascending=False).head(20)

Unnamed: 0,word,prob_0,prob_1
11232,hacked,0.077956,0.922044
16594,20hacked,0.077956,0.922044
2365,hack1,0.333951,0.666049
9251,hack3,0.333951,0.666049
13740,hack4,0.333951,0.666049
5395,hack7,0.333951,0.666049
85,hack,0.333951,0.666049
6535,downton,0.460968,0.539032
7420,apparently,0.522425,0.477575
15195,siberian,0.530633,0.469367


In [10]:
# save results
res_word2vec_svm.to_csv('data/evaluation_wordrisk/word2vec_svm.csv', header=True, index=False)

### Random Forest

In [11]:
# load random forest model
model_word2vec_rf = pickle.load(open('./models/word2vec/rf.sav', 'rb'))
pred_word2vec_rf = model_word2vec_rf.predict_proba(word_embeddings)
res_word2vec_rf = pd.DataFrame(pred_word2vec_rf)
res_word2vec_rf.insert(0, "word", list(all_words))
res_word2vec_rf.columns = ["word", "prob_0", "prob_1"]

In [12]:
res_word2vec_rf.sort_values(by=["prob_1"], ascending=False).head(20)

Unnamed: 0,word,prob_0,prob_1
16594,20hacked,0.058455,0.941545
11232,hacked,0.058455,0.941545
10585,cusp,0.303333,0.696667
8194,crystal,0.33,0.67
5395,hack7,0.349131,0.650869
85,hack,0.349131,0.650869
2365,hack1,0.349131,0.650869
9251,hack3,0.349131,0.650869
13740,hack4,0.349131,0.650869
7541,healing,0.35,0.65


In [13]:
# save results
res_word2vec_rf.to_csv('data/evaluation_wordrisk/word2vec_rf.csv', header=True, index=False)

## Bert Models
### Bert

In [14]:
# load bert model
model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
model_bert = ClassificationModel(model_type = 'bert', model_name = 'models/bert/outputs_bert_base_cased/', args = model_args, use_cuda=False)

In [15]:
pred_bert, raw_output_bert = model_bert.predict(pd.Series(list(all_words)))

100%|██████████| 20270/20270 [00:10&lt;00:00, 1906.22it/s]
100%|██████████| 2534/2534 [1:24:58&lt;00:00,  2.01s/it]


In [29]:
probabilties_bert = softmax(raw_output_bert, axis=1)
res_bert = pd.DataFrame(probabilties_bert)
res_bert.insert(0, "word", list(all_words))
res_bert.columns = ["word", "prob_0", "prob_1"]

In [30]:
# save results
res_bert.to_csv('data/evaluation_wordrisk/bert.csv', header=True, index=False)

In [31]:
res_bert.sort_values(by=["prob_1"], ascending=False).head(20)

Unnamed: 0,word,prob_0,prob_1
20098,hackedi,0.003328,0.996672
12957,hackedall,0.003756,0.996244
8513,raided,0.005244,0.994756
6484,smashed,0.005429,0.994571
1687,destroyed,0.005519,0.994481
5110,damaged,0.005533,0.994467
10424,bullied,0.005691,0.994309
5048,undermanned,0.005708,0.994292
19593,harassed,0.005819,0.994181
12253,damaging,0.006151,0.993849


### Roberta

In [18]:
# load roberta model
model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
model_roberta = ClassificationModel(model_type = 'roberta', model_name = 'models/bert/outputs_roberta_base/', args = model_args, use_cuda=False)

In [19]:
pred_roberta, raw_output_roberta = model_roberta.predict(pd.Series(list(all_words)))

100%|██████████| 20270/20270 [00:07&lt;00:00, 2691.64it/s]
100%|██████████| 2534/2534 [1:10:55&lt;00:00,  1.68s/it]


In [32]:
probabilties_roberta = softmax(raw_output_bert, axis=1)
res_roberta = pd.DataFrame(probabilties_roberta)
res_roberta.insert(0, "word", list(all_words))
res_roberta.columns = ["word", "prob_0", "prob_1"]

In [33]:
# save results
res_roberta.to_csv('data/evaluation_wordrisk/roberta.csv', header=True, index=False)

In [34]:
res_roberta.sort_values(by=["prob_1"], ascending=False).head(20)

Unnamed: 0,word,prob_0,prob_1
20098,hackedi,0.003328,0.996672
12957,hackedall,0.003756,0.996244
8513,raided,0.005244,0.994756
6484,smashed,0.005429,0.994571
1687,destroyed,0.005519,0.994481
5110,damaged,0.005533,0.994467
10424,bullied,0.005691,0.994309
5048,undermanned,0.005708,0.994292
19593,harassed,0.005819,0.994181
12253,damaging,0.006151,0.993849
