# Evaluation of Text Processing Functions

In [1]:
import numpy as np
import pandas as pd

# pre-processing
import re
import string
import nltk
from nltk.corpus import stopwords # stopwords
from nltk.stem import WordNetLemmatizer # word lemmatizer
from nltk.stem.snowball import SnowballStemmer # stemmer

# model loading packages (for testing)
import pickle
import fasttext
from word2vec import get_embed_features
from simpletransformers.classification import ClassificationModel, ClassificationArgs # bert
from sklearn.metrics import classification_report # bert
from scipy.special import softmax # bert



In [2]:
def is_alphanumeric(text):
    """Return True when ``text`` contains at least one digit AND at least one letter.

    Note that this is stricter than ``str.isalnum``: a purely numeric or purely
    alphabetic string yields False, and other characters are simply ignored.
    """
    has_digit = False
    has_letter = False
    for char in text:
        if char.isdigit():
            has_digit = True
        if char.isalpha():
            has_letter = True
    return has_digit and has_letter

def is_transaction_hash(text):
    """Heuristic check for a transaction hash / wallet address.

    A token qualifies when it mixes digits and letters (see ``is_alphanumeric``)
    and is longer than 20 characters.
    """
    if not is_alphanumeric(text):
        return False
    return len(text) > 20

# Cache of lower-cased entity names keyed by CSV path, so the file is parsed
# once instead of on every text_processing() call. Re-running this cell
# resets the cache.
_ENTITY_SET_CACHE = {}


def _load_entity_set(path="data/entity_list.csv"):
    """Return the lower-cased values of the ``entity`` column of ``path`` as a set."""
    if path not in _ENTITY_SET_CACHE:
        entities = pd.read_csv(path, header=0)["entity"]
        _ENTITY_SET_CACHE[path] = {x.lower() for x in entities}
    return _ENTITY_SET_CACHE[path]


def text_processing(text, 
                    lower=True, 
                    remove_url=True, 
                    remove_punctuation=True, 
                    remove_stopwords=False, 
                    replace_entity=False, 
                    replace_hash=False,
                    split_alphanumeric=False,
                    lemmatize=False,
                    stem=False):
    '''
    Clean a single text and return it as one space-joined string of tokens.

    Non-ASCII characters are always dropped and whitespace is always
    collapsed; the remaining steps are optional and run in this order:

    lower              -- convert the text to lowercase
    remove_url         -- delete every token starting with "http"
    remove_punctuation -- replace each punctuation character with a space
    lemmatize          -- WordNet-lemmatize every token as a verb (pos="v")
    stem               -- apply the English Snowball stemmer to every token
    remove_stopwords   -- drop NLTK English stop words, but keep the
                          negations "no", "not" and "nor"
    replace_entity     -- replace tokens found in data/entity_list.csv
                          (case-insensitive) with the placeholder "entity"
    replace_hash       -- replace tokens that look like transaction hashes
                          (see is_transaction_hash) with the placeholder "hash"
    split_alphanumeric -- split tokens that start or end with a digit into
                          digit / non-digit runs ("40million" -> "40 million");
                          the token "2fa" is left intact
    '''
    # strip non-ascii characters
    text = text.encode('ascii', errors='ignore').decode('ascii')

    # convert to lowercase
    if lower:
        text = text.lower()

    # remove url 
    if remove_url:
        text = re.sub(r'http\S+', '', text)
    
    # remove punctuation
    if remove_punctuation:
        text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
        
    # collapse new lines and repeated whitespace into single spaces.
    # split() already treats "\n" as a separator; deleting new lines first
    # would glue the last word of one line to the first word of the next.
    text = ' '.join(text.split())
    
    # tokenize
    text_words = nltk.word_tokenize(text)

    # lemmatize
    if lemmatize:
        wordnet_lemmatizer = WordNetLemmatizer()
        text_words = [wordnet_lemmatizer.lemmatize(x, pos="v") for x in text_words]

    # stem
    if stem:
        stemmer = SnowballStemmer("english")
        text_words = [stemmer.stem(x) for x in text_words]

    # remove stop words (negations are kept because they can flip the meaning)
    if remove_stopwords:
        keep_stopwords = {"no", "not", "nor"}
        stop = set(stopwords.words('english')) - keep_stopwords
        text_words = [x for x in text_words if x.lower() not in stop]
    
    # replace entity (the entity list is only loaded when it is needed)
    if replace_entity:
        entity_list = _load_entity_set()
        text_words = ["entity" if x.lower() in entity_list else x for x in text_words]
        
    # replace transaction hashes
    if replace_hash:
        text_words = ["hash" if is_transaction_hash(x) else x for x in text_words]
    
    # split alphanumeric numbers
    if split_alphanumeric:
        updated_words = []
        for word in text_words:
            # only split when the first or last character is a digit
            if (word != "2fa") and is_alphanumeric(word) and (word[0].isdigit() or word[-1].isdigit()):
                updated_words.extend(re.findall(r'\d+|\D+', word))
            else:
                updated_words.append(word)
        text_words = updated_words
    
    return ' '.join(text_words)

In [9]:
# load the train / test splits, keeping only the timestamp, raw text and label columns
all_train = pd.read_csv(
    'data/all_train.csv',
    header=0,
)[["date_time", "text", "label"]]
all_test = pd.read_csv(
    'data/all_test.csv',
    header=0,
)[["date_time", "text", "label"]]

In [10]:
# boolean Series that is True for the training rows whose label equals 1
mask = all_train.loc[:, "label"] == 1

In [11]:
# display the boolean mask (True where label == 1) as the cell output
mask

0       False
1       False
2       False
3       False
4       False
        ...  
4288    False
4289    False
4290    False
4291    False
4292    False
Name: label, Length: 4293, dtype: bool

In [12]:
# show the training rows with label == 1; the boolean Series can be used
# directly as a row selector, so there is no need to copy it into a Python list
all_train.loc[mask]

Unnamed: 0,date_time,text,label
8,2018-01-04 00:00:00,Bittrex Wallets Are Taken Offline as Companies...,1
10,2018-01-04 10:56:00,all bittrex withdrawals disabled after breach:...,1
12,2018-01-05 00:00:00,Mt. Gox Bitcoin Missing In relation to Mysteri...,1
14,2018-01-05 19:34:00,beware of fake website as first search result/...,1
15,2018-01-05 23:02:00,warning: don't use bittrex and pull your money...,1
...,...,...,...
4168,2019-05-12 01:07:00,investigating the $40m binance hack.,1
4170,2019-05-12 01:54:00,"âwe got itâ, says binance ceo on stolen $4...",1
4173,2019-05-12 02:19:00,"crypto tidbits: binance bitcoin hack, buffett ...",1
4174,2019-05-12 02:33:00,"âwe got itâ, says binance ceo on stolen $4...",1


## Check: Transaction Hashes
Check effect of changing transaction hashes on prediction.

#### Conclusion
- Only small difference when we replace transaction hashes
- Discovered that text processing must also remove URLs; otherwise, once punctuation is removed, an `https:` link can look like a transaction hash

In [None]:
# pre-process every training text with the default options (hashes are kept as-is)
all_train["text_processed"] = all_train["text"].apply(text_processing, replace_hash=False)

In [None]:
# hash-like tokens that are inserted, one at a time, into a fixed sentence
trans_hashes = [
    "0xd0206f494e1834a0ad76b202bebfc916317884cb",
    "0x57bc5eb69c380df5b0b808c64833e53f50cbd2ae",
    "0x7a0d08c2a22178b3d5693b314c2fa9773e1bb1d0",
    "18pdbf6u2pbbrjygby5txlxmwpdmmrbvsg",
    "1aztusl2djz3skh5xxyftdv6pmchrv1awinot",
]

# fixed sentence context placed before / after each token
pre_text = "give away 5000 eth from bittrex scam wallet "
post_text = (
    " people need to get back 10x what they send i never had a problem can somebody tell me what "
    "is going on thanks"
)

# one probe sentence per token; only the token differs between sentences
text = [f"{pre_text}{x}{post_text}" for x in trans_hashes]

### Word2Vec

In [None]:
# embed the probe sentences
word_embeddings = get_embed_features(pd.Series(text))

# load logistic regression model
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
# The with-block closes the file handle once the model has been read.
with open('./models/word2vec/lr.sav', 'rb') as model_file:
    model_word2vec_lr = pickle.load(model_file)

# class probabilities per probe sentence, labelled with the inserted token
pred_word2vec_lr = model_word2vec_lr.predict_proba(word_embeddings)
res_word2vec_lr = pd.DataFrame(pred_word2vec_lr)
res_word2vec_lr.insert(0, "word", trans_hashes)
res_word2vec_lr.columns = ["word", "prob_0", "prob_1"]

res_word2vec_lr

In [None]:
# load svm model
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
# The with-block closes the file handle once the model has been read.
with open('./models/word2vec/svm.txt', 'rb') as model_file:
    model_word2vec_svm = pickle.load(model_file)

# class probabilities per probe sentence, labelled with the inserted token
pred_word2vec_svm = model_word2vec_svm.predict_proba(word_embeddings)
res_word2vec_svm = pd.DataFrame(pred_word2vec_svm)
res_word2vec_svm.insert(0, "word", trans_hashes)
res_word2vec_svm.columns = ["word", "prob_0", "prob_1"]

res_word2vec_svm

### Bert

In [None]:
# load the fine-tuned bert classifier from its local output directory (CPU only)
model_args = ClassificationArgs(
    num_train_epochs=2,
    learning_rate=5e-5,
)
model_bert = ClassificationModel(
    model_type='bert',
    model_name='models/bert/outputs_bert_base_cased/',
    args=model_args,
    use_cuda=False,
)

In [None]:
# score the probe sentences and turn the raw model outputs into row-wise probabilities
pred_bert, raw_output_bert = model_bert.predict(pd.Series(text))
probabilties_bert = softmax(raw_output_bert, axis=1)

# one row per probe sentence, labelled with the inserted token
res_bert = pd.DataFrame(probabilties_bert, columns=["prob_0", "prob_1"])
res_bert.insert(0, "word", trans_hashes)

res_bert

### Roberta

In [None]:
# load the fine-tuned roberta classifier from its local output directory (CPU only)
model_args = ClassificationArgs(
    num_train_epochs=2,
    learning_rate=5e-5,
)
model_roberta = ClassificationModel(
    model_type='roberta',
    model_name='models/bert/outputs_roberta_base/',
    args=model_args,
    use_cuda=False,
)

In [None]:
# score the probe sentences and turn the raw model outputs into row-wise probabilities
pred_roberta, raw_output_roberta = model_roberta.predict(pd.Series(text))
probabilties_roberta = softmax(raw_output_roberta, axis=1)

# one row per probe sentence, labelled with the inserted token
res_roberta = pd.DataFrame(probabilties_roberta, columns=["prob_0", "prob_1"])
res_roberta.insert(0, "word", trans_hashes)

res_roberta

## Check: Joint Digits and Words
Check whether the models predict differently when a number and its unit are written separately ("40 million") versus joined ("40million").

In [None]:
# joined vs. spaced spellings of the same amount
digits = [
    "3k", "3 k",
    "40million", "40 million",
    "40mln", "40 mln",
]

# fixed sentence context placed before / after each spelling
pre_text = "cryptopia pump and freeze scams they did it with kayicoin pumped it "
post_text = " and froze the exchange now they do it with paccoin pump"

# one probe sentence per spelling
text = [f"{pre_text}{x}{post_text}" for x in digits]

### Word2Vec

In [None]:
# embed the probe sentences
word_embeddings = get_embed_features(pd.Series(text))

# load logistic regression model
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
# The with-block closes the file handle once the model has been read.
with open('./models/word2vec/lr.sav', 'rb') as model_file:
    model_word2vec_lr = pickle.load(model_file)

# class probabilities per probe sentence, labelled with the inserted spelling
pred_word2vec_lr = model_word2vec_lr.predict_proba(word_embeddings)
res_word2vec_lr = pd.DataFrame(pred_word2vec_lr)
res_word2vec_lr.insert(0, "word", digits)
res_word2vec_lr.columns = ["word", "prob_0", "prob_1"]

res_word2vec_lr

### Bert

In [None]:
# score the probe sentences and turn the raw model outputs into row-wise probabilities
pred_bert, raw_output_bert = model_bert.predict(pd.Series(text))
probabilties_bert = softmax(raw_output_bert, axis=1)

# one row per probe sentence, labelled with the inserted spelling
res_bert = pd.DataFrame(probabilties_bert, columns=["prob_0", "prob_1"])
res_bert.insert(0, "word", digits)

res_bert

### Roberta

In [None]:
# score the probe sentences and turn the raw model outputs into row-wise probabilities
pred_roberta, raw_output_roberta = model_roberta.predict(pd.Series(text))
probabilties_roberta = softmax(raw_output_roberta, axis=1)

# one row per probe sentence, labelled with the inserted spelling
res_roberta = pd.DataFrame(probabilties_roberta, columns=["prob_0", "prob_1"])
res_roberta.insert(0, "word", digits)

res_roberta