In [1]:
import pandas as pd
import numpy as np

from numpy.random import seed
import tensorflow as tf
import random as rn
import os
from nltk import tokenize 
import nltk
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
nltk.download('punkt')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
CODES_PATH = 'drive/MyDrive/Colab Notebooks/IRTG/Smart_contracts_paper/USC/SC-classification'
CV_PATH    = 'drive/MyDrive/Colab Notebooks/IRTG/Smart_contracts_paper/USC/SC-classification/models/cv'
TEST_PATH  = 'drive/MyDrive/Colab Notebooks/IRTG/Smart_contracts_paper/USC/SC-classification/models/final'

In [3]:
max_features = 7000
maxlen       = 2000
dropout_rate = 0.25
rs           = 42
epochs       = 4
batch_size   = 256
embed_dim    = 50
rec_units    = 150


seed(rs)
rn.seed(rs)

os.environ['PYTHONHASHSEED']=str(rs)

In [4]:
import sys
sys.path.append(f'{CODES_PATH}')

In [5]:
if tf.test.is_gpu_available():
  import models
else:
  print('You need the GPU support for these calculations')

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def other(x):
    if ((x=='exchanges')|(x=='finance')|(x=='games')|(x=='gambling')|(x=='high-risk')): 
        return x 
    else: 
        return 'other'

In [7]:
data = pd.read_csv("drive/MyDrive/Colab Notebooks/IRTG/Smart_contracts_paper/USC/SC-Dapp-scraping/data/dapps_source_code_parsed_open_source_2021-01-08.csv")

data = data[data.SC_full!='\n']
data.drop('Unnamed: 0', axis=1, inplace=True)
data.category = data.category.apply(other)
data.comments.fillna('empty', inplace=True)

In [8]:
RS = 42
SPLITS = 3
LENGTH = 5000

In [9]:
X = data.drop('category', axis=1)
y = data.category

In [10]:
X, X_test, y, y_test  = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RS)

In [11]:
print(f'X_train shape {X.shape}')
print(f'X_test shape {X_test.shape}')

X_train shape (1142, 12)
X_test shape (286, 12)


## FULL

In [12]:
names  = [f'aucroc_{i}' for i in ['exchanges', 'finance', 'gambling', 'games', 'high-risk']]
names2 = [f'aucprc_{i}' for i in ['exchanges', 'finance', 'gambling', 'games', 'high-risk']]
names.extend(names2)

kf  = StratifiedKFold(n_splits=SPLITS, random_state=RS, shuffle=True)
TRAIN = True

In [13]:
n_words  = range(500, 1200, 200)
length   = range(500, 1200, 200)
emb_dim  = [50, 100]
n_hidden = [50, 100]
epochs   = [4]
lr       = [0.01, 0.001, 0.0001]
drop     = [0, 0.25]


n_words  = [1100]
length   = [500]
emb_dim  = [50]        
n_hidden = [100]      
drop     = [0]      
epochs   = [4]
lr       = [0.01]

### BGRU

In [14]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['SC_full'], X.iloc[val_index]['SC_full']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='BGRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/BGRU_full_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/BGRU_full_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [15]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.915,0.73,0.668,0.774,0.691,0.912,0.494,0.079,0.424,0.117
1,0.924,0.829,0.685,0.83,0.82,0.922,0.691,0.08,0.425,0.147
2,0.949,0.82,0.735,0.84,0.766,0.945,0.656,0.098,0.541,0.142


In [16]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['SC_full'], X_test['SC_full']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='BGRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/BGRU_full_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/BGRU_full_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [17]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.97,0.848,0.767,0.795,0.86,0.963,0.699,0.149,0.422,0.256


### GRU

In [18]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['SC_full'], X.iloc[val_index]['SC_full']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='GRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/GRU_full_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/GRU_full_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [19]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.917,0.708,0.776,0.794,0.8,0.914,0.47,0.185,0.47,0.17
1,0.932,0.769,0.695,0.725,0.737,0.934,0.594,0.082,0.37,0.091
2,0.955,0.744,0.649,0.79,0.644,0.946,0.531,0.075,0.491,0.104


In [20]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['SC_full'], X_test['SC_full']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='GRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/GRU_full_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/GRU_full_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [21]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.966,0.712,0.749,0.818,0.897,0.956,0.443,0.102,0.442,0.486


### CNN

In [22]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['SC_full'], X.iloc[val_index]['SC_full']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='CNN', max_features=WORD, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/CNN_full_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/CNN_full_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [23]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.963,0.895,0.744,0.882,0.783,0.956,0.778,0.11,0.699,0.188
1,0.968,0.897,0.747,0.872,0.801,0.961,0.811,0.139,0.625,0.173
2,0.96,0.927,0.773,0.874,0.861,0.956,0.83,0.142,0.673,0.169


In [24]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['SC_full'], X_test['SC_full']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='CNN', max_features=WORD, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/CNN_full_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/CNN_full_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [25]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.972,0.95,0.804,0.908,0.84,0.972,0.88,0.155,0.672,0.482


## Only The Source Code

In [26]:
names  = [f'aucroc_{i}' for i in ['exchanges', 'finance', 'gambling', 'games', 'high-risk']]
names2 = [f'aucprc_{i}' for i in ['exchanges', 'finance', 'gambling', 'games', 'high-risk']]
names.extend(names2)

kf  = StratifiedKFold(n_splits=SPLITS, random_state=RS, shuffle=True)
TRAIN = True

In [27]:
n_words  = range(500, 1200, 200)
length   = range(500, 1200, 200)
emb_dim  = [50, 100]
n_hidden = [50, 100]
epochs   = [4]
lr       = [0.01, 0.001, 0.0001]
drop     = [0, 0.25]


n_words  = [1100]
length   = [500]
emb_dim  = [50]        
n_hidden = [100]      
drop     = [0]      
epochs   = [4]
lr       = [0.01]

### BGRU

In [28]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['SC_no_comments'], X.iloc[val_index]['SC_no_comments']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='BGRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/BGRU_onlycode_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/BGRU_onlycode_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [29]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.92,0.71,0.734,0.806,0.771,0.915,0.351,0.1,0.429,0.34
1,0.96,0.743,0.656,0.759,0.8,0.953,0.441,0.096,0.364,0.157
2,0.959,0.688,0.628,0.842,0.781,0.954,0.353,0.066,0.521,0.218


In [30]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['SC_no_comments'], X_test['SC_no_comments']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='BGRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/BGRU_onlycode_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/BGRU_onlycode_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [31]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.946,0.282,0.461,0.744,0.763,0.952,0.157,0.049,0.448,0.508


### GRU

In [32]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['SC_no_comments'], X.iloc[val_index]['SC_no_comments']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='GRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/GRU_onlycode_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/GRU_onlycode_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [33]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.902,0.757,0.751,0.739,0.8,0.897,0.589,0.155,0.415,0.166
1,0.922,0.791,0.76,0.784,0.74,0.925,0.616,0.153,0.453,0.151
2,0.947,0.851,0.702,0.817,0.743,0.933,0.684,0.146,0.5,0.121


In [34]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['SC_no_comments'], X_test['SC_no_comments']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='GRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/GRU_onlycode_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/GRU_onlycode_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [35]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.949,0.802,0.779,0.692,0.888,0.952,0.615,0.121,0.299,0.35


### CNN

In [36]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['SC_no_comments'], X.iloc[val_index]['SC_no_comments']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='CNN', max_features=WORD, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/CNN_onlycode_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/CNN_onlycode_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [37]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.936,0.893,0.656,0.886,0.695,0.933,0.761,0.08,0.665,0.102
1,0.962,0.87,0.795,0.847,0.716,0.954,0.754,0.188,0.593,0.112
2,0.949,0.927,0.718,0.829,0.845,0.945,0.835,0.122,0.575,0.135


In [38]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['SC_no_comments'], X_test['SC_no_comments']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='CNN', max_features=WORD, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/CNN_onlycode_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/CNN_onlycode_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [39]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.952,0.895,0.707,0.868,0.781,0.953,0.794,0.1,0.663,0.168


## Only The Comments

In [40]:
names  = [f'aucroc_{i}' for i in ['exchanges', 'finance', 'gambling', 'games', 'high-risk']]
names2 = [f'aucprc_{i}' for i in ['exchanges', 'finance', 'gambling', 'games', 'high-risk']]
names.extend(names2)

kf  = StratifiedKFold(n_splits=SPLITS, random_state=RS, shuffle=True)
TRAIN = True

In [41]:
n_words  = range(500, 1200, 200)
length   = range(500, 1200, 200)
emb_dim  = [50, 100]
n_hidden = [50, 100]
epochs   = [4]
lr       = [0.01, 0.001, 0.0001]
drop     = [0, 0.25]


n_words  = [1100]
length   = [500]
emb_dim  = [50]        
n_hidden = [100]      
drop     = [0]      
epochs   = [4]
lr       = [0.01]

### BGRU

In [42]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['comments'], X.iloc[val_index]['comments']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='BGRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/BGRU_comments_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/BGRU_comments_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [43]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.91,0.76,0.704,0.716,0.739,0.906,0.53,0.157,0.332,0.191
1,0.937,0.719,0.728,0.713,0.773,0.94,0.352,0.107,0.315,0.101
2,0.953,0.752,0.623,0.764,0.643,0.951,0.483,0.066,0.408,0.085


In [44]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['comments'], X_test['comments']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='BGRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/BGRU_comments_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/BGRU_comments_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [45]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.971,0.639,0.656,0.853,0.839,0.968,0.276,0.086,0.52,0.361


### GRU

In [46]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['comments'], X.iloc[val_index]['comments']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='GRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/GRU_comments_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/GRU_comments_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [47]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.902,0.729,0.683,0.761,0.8,0.906,0.373,0.081,0.408,0.128
1,0.928,0.854,0.688,0.748,0.802,0.933,0.683,0.105,0.413,0.146
2,0.937,0.725,0.715,0.724,0.781,0.939,0.485,0.103,0.373,0.143


In [48]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['comments'], X_test['comments']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='GRU', max_features=WORD, embed_dim=DIM, rec_units=HID, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/GRU_comments_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/GRU_comments_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [49]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.96,0.804,0.638,0.84,0.855,0.963,0.648,0.107,0.511,0.313


### CNN

In [50]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((SPLITS, len(names))), columns=names)
                for c, (train_index, val_index) in enumerate(kf.split(X, y)):

                        X_train, X_val      = X.iloc[train_index]['comments'], X.iloc[val_index]['comments']
                        y_train, y_val      = y.iloc[train_index], y.iloc[val_index] 
                        
                        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                        tokenizer.fit_on_texts(X_train)
                        
                        list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                        list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                        X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                        X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                        le = LabelBinarizer()
                        le.fit(y_train)

                        y_train = le.transform(y_train)[:,0:5]
                        y_val = le.transform(y_val)[:,0:5]

                        classifier = models.dl_model(model_type='CNN', max_features=WORD, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                        if TRAIN:
                                classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                                classifier.save_weights(f'{CV_PATH}/CNN_comments_fold_{c}_v1.h5')
                        else: 
                                classifier.load_weights(f'{CV_PATH}/CNN_comments_fold_{c}_v1.h5')
                        probs = classifier.predict(X_val)

                        assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                        
                        for i, cl in enumerate(le.classes_[le.classes_!='other']):
                            auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                            auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                            metrics_cv.loc[c, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                            metrics_cv.loc[c, f'aucprc_{cl}'] = round(auc_pr_class, 3)   

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [51]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.924,0.876,0.701,0.877,0.723,0.928,0.741,0.085,0.709,0.091
1,0.953,0.894,0.787,0.867,0.767,0.942,0.792,0.137,0.572,0.103
2,0.944,0.881,0.739,0.819,0.749,0.933,0.738,0.1,0.574,0.1


In [52]:
for WORD in n_words:
  for LENGTH in length:
    for DIM in emb_dim:
      for HID in n_hidden:
        for DROP in drop:
          for EPOCH in epochs:
            for RATE in lr:

                metrics_cv = pd.DataFrame(np.zeros((1, len(names))), columns=names)

                X_train, X_val      = X['comments'], X_test['comments']
                y_train, y_val      = y, y_test
                
                tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=WORD)
                tokenizer.fit_on_texts(X_train)
                
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)

                X_train = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_train, maxlen=LENGTH)
                X_val   = tf.keras.preprocessing.sequence.pad_sequences(list_tokenized_val, maxlen=LENGTH)

                le = LabelBinarizer()
                le.fit(y_train)

                y_train = le.transform(y_train)[:,0:5]
                y_val = le.transform(y_val)[:,0:5]

                classifier = models.dl_model(model_type='CNN', max_features=WORD, dropout_rate=DROP, maxlen=LENGTH, classes=5, lr=RATE)
                if TRAIN:
                        classifier.fit(X_train, y_train, batch_size=128, epochs=4, shuffle=True, verbose=1)
                        classifier.save_weights(f'{TEST_PATH}/CNN_comments_v1.h5')
                else: 
                        classifier.load_weights(f'{TEST_PATH}/CNN_comments_v1.h5')
                probs = classifier.predict(X_val)

                assert (le.classes_ == ['exchanges', 'finance', 'gambling', 'games', 'high-risk', 'other']).all()
                
                for i, cl in enumerate(le.classes_[le.classes_!='other']):
                    auc_roc_class = roc_auc_score(y_val[:,i], probs[:,i])
                    auc_pr_class  = average_precision_score(y_val[:,i], probs[:,i])

                    metrics_cv.loc[0, f'aucroc_{cl}'] = round(auc_roc_class, 3)
                    metrics_cv.loc[0, f'aucprc_{cl}'] = round(auc_pr_class, 3) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [53]:
metrics_cv

Unnamed: 0,aucroc_exchanges,aucroc_finance,aucroc_gambling,aucroc_games,aucroc_high-risk,aucprc_exchanges,aucprc_finance,aucprc_gambling,aucprc_games,aucprc_high-risk
0,0.937,0.907,0.829,0.865,0.703,0.937,0.799,0.155,0.6,0.07
