In [1]:
try:
    import transformers
except ImportError:
    !pip install transformers
    import transformers
try:
    import emoji
except ImportError:
    !pip install emoji
    import emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 8.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 5.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 38.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 28.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstall

In [2]:
local = False
local_run = False

In [8]:
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification, TFRobertaForSequenceClassification
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix,f1_score

import random
import string
import re
import ast

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import tensorflow_datasets as tfds

if local_run:
    nltk.data.path.append('/Users/algin/VOLD/nltk_data')
    nltk.download('stopwords',download_dir='/Users/algin/VOLD/nltk_data')
    nltk.download('punkt',download_dir='/Users/algin/VOLD/nltk_data')
else:
    nltk.download('stopwords')
    nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
if local:
    path = '/Users/algin/Greenwich/MSc Project/models/'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    path = 'drive/MyDrive/MSc Data Science/MSc Project/models/'

Mounted at /content/drive


In [5]:
def get_emoji_regexp():
    # Sort emoji by length to make sure multi-character emojis are
    # matched first
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
    return re.compile(pattern)

In [6]:
banned_list= string.punctuation
punctuation_reg_exp = "[" + banned_list + "]"
emoji_reg_exp = get_emoji_regexp()

def stemmer(text):
    tokenized = nltk.word_tokenize(text)
    ps = PorterStemmer()
    return ' '.join([ps.stem(words) for words in tokenized])

def clean_text(text,stem=True):
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)

    text = [word for word in text.split() if word not in stop_words]
    text = ' '.join(text)

    text = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', text))
    
    text = re.sub(punctuation_reg_exp,"",text)

    text = re.sub("\s\s+" , " ", text)

    text = re.sub(emoji_reg_exp, r"", text)

    if stem:
        text = stemmer(text)
    return text

**Load Dataset and transformations**

In [7]:
full_train_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/uniform/uniform_train_data.csv?raw=true")
test_data = pd.read_csv("https://github.com/Voldegin/hate_speech_detection/blob/develop/data/uniform/uniform_test_data.csv?raw=true")

In [None]:
# train_data, val_data = train_test_split(full_train_data,test_size=5000,random_state=21)

In [9]:
split = StratifiedShuffleSplit(n_splits=2,test_size=0.1, random_state=23)
for train_index, val_index in split.split(full_train_data[['tweet_text','cleaned']],full_train_data['is_cyberbullying']):
    train_data = full_train_data.loc[train_index]
    val_data = full_train_data.loc[val_index]

In [10]:
def split_label_and_feature(data):
    return data['tweet_text'], data['is_cyberbullying']

In [11]:
X_train, y_train = split_label_and_feature(train_data)
X_val, y_val = split_label_and_feature(val_data)
X_test, y_test = split_label_and_feature(test_data)

**Prediction on Best XGB Model**

In [12]:
file_name = path + "uniform_best_xgb.pkl"
xgb_model = pickle.load(open(file_name, "rb"))

In [13]:
xgb_model.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1.0,
 'gamma': 0.5,
 'learning_rate': 0.06,
 'max_delta_step': 0,
 'max_depth': 15,
 'min_child_weight': 1,
 'missing': nan,
 'n_estimators': 300,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 0.6,
 'verbosity': 1}

In [14]:
def split_label_and_feature_xgb(data):
    return data['cleaned'], data['is_cyberbullying']

In [15]:
X_train_xgb, y_train = split_label_and_feature_xgb(train_data)
X_val_xgb, y_val = split_label_and_feature_xgb(val_data)
X_test_xgb, y_test = split_label_and_feature_xgb(test_data)

In [16]:
clf = CountVectorizer()
X_train_cv =  clf.fit_transform(X_train_xgb)

tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_cv)
X_train_tf = tf_transformer.transform(X_train_cv)

In [17]:
def xgb_prediction(text_list,preprocess=True):
    df = pd.Series(text_list)
    if preprocess:
        df = df.apply(clean_text)
    test_cv = clf.transform(df)
    test_tf = tf_transformer.transform(test_cv)
    predictions = xgb_model.predict(test_tf)
    return predictions

In [18]:
xgb_prediction(["I believe in Christianity"])

array([0])

In [19]:
xgb_prediction(["What a good day"])

array([0])

In [20]:
xgb_prediction(["Muslims are terrorists"])

array([1])

In [21]:
xgb_prediction(["You are an asshole"])

array([0])

In [22]:
xgb_test_predictions = xgb_prediction(X_test_xgb,preprocess=False)

In [23]:
confusion_matrix(y_test,xgb_test_predictions)

array([[4106,  822],
       [1049, 3874]])

In [24]:
f1_score(y_test,xgb_test_predictions)

0.8054891360848321

**Prediction on Transformer Model**

In [25]:
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained(path + "uniform-distilbert")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at drive/MyDrive/MSc Data Science/MSc Project/models/uniform-distilbert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [26]:
def transformer_prediction(text_list,preprocess=True,return_one=False):  
    #tokenize the text
    if preprocess:
        new_list = []
        for each_text in text_list:
            new_list.append(clean_text(each_text,stem=False))
    else:
        new_list = text_list
    encodings = tokenizer(new_list, 
                          truncation=True, 
                          padding=True)
    #transform to tf.Dataset
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))
    #predict
    preds = distilbert_model.predict(dataset.batch(1)).logits  
    
    #transform to array with probabilities
    res = tf.nn.softmax(preds, axis=1).numpy()

    if return_one:
        return res.argmax(axis=1)  
    
    return res

In [27]:
transformer_prediction(["What a good day"])

array([[0.992743  , 0.00725702]], dtype=float32)

In [28]:
transformer_prediction(["Muslims are terrorists"])

array([[0.01869874, 0.9813013 ]], dtype=float32)

In [29]:
transformer_prediction(["You are an asshole"])

array([[0.06463311, 0.9353669 ]], dtype=float32)

In [30]:
transformer_prediction(["I believe in Christianity"])

array([[0.93613213, 0.0638679 ]], dtype=float32)

In [31]:
check_X = X_test#.sample(100)
check_y = y_test[check_X.index]
check_X = check_X.tolist()

In [32]:
distil_predictions = transformer_prediction(check_X,return_one=True,preprocess=False)

In [33]:
confusion_matrix(check_y,distil_predictions)

array([[4246,  682],
       [ 797, 4126]])

In [34]:
f1_score(check_y,distil_predictions)

0.8480115096084677

**Prediction on Roberta Model**

In [35]:
roberta_model = TFRobertaForSequenceClassification.from_pretrained(path + "uniform-roberta")
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at drive/MyDrive/MSc Data Science/MSc Project/models/uniform-roberta.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [36]:
batch_size = 64

In [37]:
max_length = 128
def convert_example_to_feature(text):
  return roberta_tokenizer.encode_plus(text,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       pad_to_max_length=True,
                                       return_attention_mask=True,
  )

def map_example_to_dict(input_ids, attention_masks, label):
    return {
      "input_ids": input_ids,
      "attention_mask": attention_masks,
           }, label

def encode_examples(ds, limit=-1):
     # prepare list, so that we can build up final TensorFlow dataset from slices.
  input_ids_list = []
  attention_mask_list = []
  label_list = []
  if (limit > 0):
    ds = ds.take(limit)
  for text, label in tfds.as_numpy(ds):
    bert_input = convert_example_to_feature(text.decode())
    input_ids_list.append(bert_input['input_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])
  return tf.data.Dataset.from_tensor_slices((input_ids_list,
                                             attention_mask_list,
                              label_list)).map(map_example_to_dict)

In [38]:
def predict_proba(text_list, model, preprocess=True, return_one=True):
    if preprocess:
        new_list = []
        for each_text in text_list:
            new_list.append(clean_text(each_text,stem=False))
    else:
        new_list = text_list
    print(new_list)
    df = pd.DataFrame(new_list, columns=['text'])
    df['label'] = 0
    sentences_modified = tf.data.Dataset.from_tensor_slices((df['text'],df['label']))
    ds_encoded = encode_examples(sentences_modified).batch(batch_size)

    # preds_raw = tf.nn.softmax(model.predict(ds_encoded).logits)
    # preds = tf.math.argmax(preds_raw, axis=1)

    preds = model.predict(ds_encoded).logits  
    
    #transform to array with probabilities
    res = tf.nn.softmax(preds, axis=1).numpy()

    if return_one:
        return res.argmax(axis=1)  
    
    return res

In [39]:
check_X = X_test#.sample(100)
check_y = y_test[check_X.index]
check_X = check_X.tolist()

In [40]:
roberta_predictions = predict_proba(check_X,roberta_model,return_one=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.






In [41]:
confusion_matrix(check_y,roberta_predictions)

array([[3504, 1424],
       [ 428, 4495]])

In [42]:
f1_score(check_y,roberta_predictions)

0.8291828076000738