In [1]:
try:
    import transformers
except ImportError:
    !pip install transformers
    import transformers
try:
    import emoji
except ImportError:
    !pip install emoji
    import emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 7.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [2]:
local = False
local_run = False

In [3]:
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,f1_score

import random
import string
import re
import ast

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import tensorflow_datasets as tfds

if local_run:
    nltk.data.path.append('/Users/algin/VOLD/nltk_data')
    nltk.download('stopwords',download_dir='/Users/algin/VOLD/nltk_data')
    nltk.download('punkt',download_dir='/Users/algin/VOLD/nltk_data')
else:
    nltk.download('stopwords')
    nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
if local:
    path = '/Users/algin/Greenwich/MSc Project/models/'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    path = 'drive/MyDrive/MSc Data Science/MSc Project/models/'

Mounted at /content/drive


In [5]:
def get_emoji_regexp():
    # Sort emoji by length to make sure multi-character emojis are
    # matched first
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
    return re.compile(pattern)

In [6]:
banned_list= string.punctuation
punctuation_reg_exp = "[" + banned_list + "]"
emoji_reg_exp = get_emoji_regexp()

def stemmer(text):
    tokenized = nltk.word_tokenize(text)
    ps = PorterStemmer()
    return ' '.join([ps.stem(words) for words in tokenized])

def clean_text(text):
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)

    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)

    text = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', text))
    
    text = re.sub(punctuation_reg_exp,"",text)

    text = re.sub("\s\s+" , " ", text)

    text = re.sub(emoji_reg_exp, r"", text)

    # text = stemmer(text)
    return text

**Load Dataset and transformations for XGBoost Model**

In [38]:
full_train_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/3-model-experiments/data/cleaned/cleaned_train_data.csv?raw=true")
test_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/3-model-experiments/data/cleaned/cleaned_test_data.csv?raw=true")

In [39]:
train_data, val_data = train_test_split(full_train_data,test_size=5000,random_state=21)

In [40]:
def split_label_and_feature(data):
    return data['cleaned'], data['is_cyberbullying']

In [41]:
X_train, y_train = split_label_and_feature(train_data)
X_val, y_val = split_label_and_feature(val_data)
X_test, y_test = split_label_and_feature(test_data)

In [42]:
clf = CountVectorizer()
X_train_cv =  clf.fit_transform(X_train)

tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_cv)
X_train_tf = tf_transformer.transform(X_train_cv)

**Prediction on Best XGB Model**

In [43]:
file_name = path + "best_xgb.pkl"
xgb_model = pickle.load(open(file_name, "rb"))

In [44]:
xgb_model.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 0.6,
 'gamma': 0.5,
 'learning_rate': 0.06,
 'max_delta_step': 0,
 'max_depth': 10,
 'min_child_weight': 1,
 'missing': nan,
 'n_estimators': 800,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 0.8,
 'verbosity': 1}

In [45]:
def xgb_prediction(text_list,preprocess=True):
    df = pd.Series(text_list)
    if preprocess:
        df = df.apply(clean_text)
    test_cv = clf.transform(df)
    test_tf = tf_transformer.transform(test_cv)
    predictions = xgb_model.predict(test_tf)
    return predictions

In [46]:
xgb_prediction(["I believe in Christianity"])

array([0])

In [47]:
xgb_prediction(["What a good day"])

array([0])

In [48]:
xgb_prediction(["Muslims are terrorists"])

array([0])

In [49]:
xgb_prediction(["You are an asshole"])

array([0])

In [50]:
xgb_test_predictions = xgb_prediction(X_test,preprocess=False)

In [51]:
confusion_matrix(y_test,xgb_test_predictions)

array([[3477,   22],
       [2596,  899]])

In [52]:
f1_score(y_test,xgb_test_predictions)

0.4071557971014493

In [54]:
xgb_check = pd.concat([y_test,pd.Series(xgb_test_predictions)],axis=1)
xgb_check.columns = ['actuals','predictions']
fn_xgb = xgb_check[(xgb_check['actuals'] == 1) & (xgb_check['predictions'] == 0)]
fn_xgb_test_data = test_data.loc[fn_xgb.index]
fn_xgb_test_data['actual_value'].value_counts()

['malignant']                                                   1142
['malignant', 'rude', 'abuse']                                   475
['malignant', 'rude']                                            268
['malignant', 'abuse']                                           202
['malignant', 'highly_malignant', 'rude', 'abuse']                92
['rude']                                                          67
['abuse']                                                         64
['malignant', 'rude', 'abuse', 'loathe']                          59
['rude', 'abuse']                                                 50
['malignant', 'threat']                                           24
['malignant', 'loathe']                                           23
['malignant', 'abuse', 'loathe']                                  21
['malignant', 'highly_malignant', 'rude']                         16
['malignant', 'highly_malignant', 'rude', 'abuse', 'loathe']      14
['malignant', 'rude', 'threat', 'a

**Prediction on Transformer Model**

In [None]:
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained(path + "distilbert--without-stem-94")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Some layers from the model checkpoint at drive/MyDrive/MSc Data Science/MSc Project/models/distilbert--without-stem-94 were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at drive/MyDrive/MSc Data Science/MSc Project/models/distilbert--without-stem-94 and are newly initialized: ['dropout_115']
You should probably TRAIN this model on a down-stream task to 

In [None]:
def transformer_prediction(text_list,preprocess=True,return_one=False):  
    #tokenize the text
    if preprocess:
        new_list = []
        for each_text in text_list:
            new_list.append(clean_text(each_text))
    else:
        new_list = text_list
    encodings = tokenizer(new_list, 
                          truncation=True, 
                          padding=True)
    #transform to tf.Dataset
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))
    #predict
    preds = distilbert_model.predict(dataset.batch(1)).logits  
    
    #transform to array with probabilities
    res = tf.nn.softmax(preds, axis=1).numpy()

    if return_one:
        return res.argmax(axis=1)  
    
    return res

In [None]:
transformer_prediction(["What a good day"])

array([[9.994949e-01, 5.051067e-04]], dtype=float32)

In [None]:
transformer_prediction(["Muslims are terrorists"])

array([[6.712463e-05, 9.999329e-01]], dtype=float32)

In [None]:
transformer_prediction(["You are an asshole"])

array([[0.21792698, 0.782073  ]], dtype=float32)

In [None]:
transformer_prediction(["I believe in Christianity"])

array([[0.7687712 , 0.23122883]], dtype=float32)

In [None]:
check_X = X_test#.sample(100)
check_y = y_test[check_X.index]
check_X = check_X.tolist()

In [None]:
distil_predictions = transformer_prediction(check_X,return_one=True,preprocess=False)



In [None]:
confusion_matrix(check_y,distil_predictions)

array([[3444,   55],
       [2459, 1036]])

In [None]:
f1_score(check_y,distil_predictions)

0.45180985608373314

In [None]:
distil_check = pd.concat([check_y,pd.Series(distil_predictions)], axis=1)
distil_check.columns = ['actuals','predictions']
fn_distil = distil_check[(distil_check['actuals'] == 1) & (distil_check['predictions'] == 0)]
fn_distil_test_data = test_data.loc[fn_distil.index]
fn_distil_test_data['actual_value'].value_counts()

['malignant']                                                             1101
['malignant', 'rude', 'abuse']                                             416
['malignant', 'rude']                                                      265
['malignant', 'abuse']                                                     186
['malignant', 'highly_malignant', 'rude', 'abuse']                          83
['rude']                                                                    62
['abuse']                                                                   60
['malignant', 'rude', 'abuse', 'loathe']                                    57
['rude', 'abuse']                                                           45
['malignant', 'threat']                                                     23
['malignant', 'loathe']                                                     22
['malignant', 'abuse', 'loathe']                                            19
['malignant', 'highly_malignant', 'rude']           

**Prediction on Roberta Model**

In [None]:
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained(path + "roberta")
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

You are using a model of type roberta to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some layers from the model checkpoint at drive/MyDrive/MSc Data Science/MSc Project/models/roberta were not used when initializing TFDistilBertForSequenceClassification: ['roberta', 'classifier/out_proj/bias:0', 'classifier/out_proj/kernel:0', 'classifier/dense/kernel:0', 'classifier/dense/bias:0']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSeq

In [None]:
batch_size = 64

In [None]:
max_length = 128
def convert_example_to_feature(text):
  return roberta_tokenizer.encode_plus(text,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       pad_to_max_length=True,
                                       return_attention_mask=True,
  )

def map_example_to_dict(input_ids, attention_masks, label):
    return {
      "input_ids": input_ids,
      "attention_mask": attention_masks,
           }, label

def encode_examples(ds, limit=-1):
     # prepare list, so that we can build up final TensorFlow dataset from slices.
  input_ids_list = []
  attention_mask_list = []
  label_list = []
  if (limit > 0):
    ds = ds.take(limit)
  for text, label in tfds.as_numpy(ds):
    bert_input = convert_example_to_feature(text.decode())
    input_ids_list.append(bert_input['input_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])
  return tf.data.Dataset.from_tensor_slices((input_ids_list,
                                             attention_mask_list,
                              label_list)).map(map_example_to_dict)

In [None]:
def predict_proba(text_list, model,return_one=True):
    df = pd.DataFrame(text_list, columns=['text'])
    df['label'] = 0
    sentences_modified = tf.data.Dataset.from_tensor_slices((df['text'],df['label']))
    ds_encoded = encode_examples(sentences_modified).batch(batch_size)

    # preds_raw = tf.nn.softmax(model.predict(ds_encoded).logits)
    # preds = tf.math.argmax(preds_raw, axis=1)

    preds = model.predict(ds_encoded).logits  
    
    #transform to array with probabilities
    res = tf.nn.softmax(preds, axis=1).numpy()

    if return_one:
        return res.argmax(axis=1)  
    
    return res

In [None]:
check_X = X_test#.sample(100)
check_y = y_test[check_X.index]
check_X = check_X.tolist()

In [None]:
roberta_predictions = predict_proba(check_X,distilbert_model,return_one=True)



In [None]:
confusion_matrix(check_y,roberta_predictions)

array([[ 356, 3143],
       [ 551, 2944]])

In [None]:
f1_score(check_y,roberta_predictions)

0.6144854936338969

In [None]:
roberta_check = pd.concat([check_y,pd.Series(roberta_predictions)], axis=1)
roberta_check.columns = ['actuals','predictions']
fn_roberta = roberta_check[(roberta_check['actuals'] == 1) & (roberta_check['predictions'] == 0)]
fn_roberta_test_data = test_data.loc[fn_roberta.index]
fn_roberta_test_data['actual_value'].value_counts()

['malignant']                                                             189
['malignant', 'rude', 'abuse']                                            135
['malignant', 'rude']                                                      53
['malignant', 'abuse']                                                     50
['malignant', 'highly_malignant', 'rude', 'abuse']                         32
['malignant', 'rude', 'abuse', 'loathe']                                   19
['malignant', 'highly_malignant', 'rude', 'abuse', 'loathe']                9
['malignant', 'threat']                                                     9
['abuse']                                                                   8
['rude']                                                                    7
['malignant', 'loathe']                                                     6
['malignant', 'abuse', 'loathe']                                            4
['rude', 'abuse']                                               