# Testing baseline model with BERT Embedding

## Load data

In [1]:
import pandas as pd
df = pd.read_csv('./data/train.csv')
X = df['text']
Y = df['suicide']

## Pre-processing

### WordVectorization with Bert Embedding

In [2]:
X = [sen for sen in X]

labels = Y.values

### Tokenize

In [3]:
from transformers import BertTokenizer

# get pre-trained tokenizer model
tokenizer = BertTokenizer.from_pretrained('./pretrained/bert-base-uncased')

tokenizer

BertTokenizer(name_or_path='./pretrained/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [4]:
max_length = 512

tokenized = tokenizer(X, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

sen_ids = tokenized['input_ids']

print(sen_ids.size())

torch.Size([185659, 512])


### Word embedding

In [5]:
from transformers.models.bert.modeling_bert import BertEmbeddings
from transformers import AutoConfig

config = AutoConfig.from_pretrained('./pretrained/bert-base-uncased')
config.hidden_size = 8
bert_embedding = BertEmbeddings(config)

bert_embedding

BertEmbeddings(
  (word_embeddings): Embedding(30522, 8, padding_idx=0)
  (position_embeddings): Embedding(512, 8)
  (token_type_embeddings): Embedding(2, 8)
  (LayerNorm): LayerNorm((8,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [6]:
embedded = bert_embedding(sen_ids)

In [7]:
embedded = embedded.view(len(embedded), -1)

embedded.size()

torch.Size([185659, 4096])

In [8]:
del tokenized
del sen_ids
del X
del Y
del df

import gc
gc.collect()

33

## Building the baseline model

For testing purposes, we will use the Logistic Regression classifier `LogisticRegression()` with its default settings as our baseline model.

In [9]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=10000)

## Evaluating the Baseline Model

### 5-fold Stratified Cross-validation

In [10]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=60)
accuracy_score_list, recall_score_list, precision_score_list, f1_score_list = [], [], [], []

for time, (train_index, test_index) in enumerate(skfolds.split(embedded, labels)):
    X_train, X_test = embedded[train_index].detach().numpy(), embedded[test_index].detach().numpy()
    Y_train, Y_test = labels[train_index], labels[test_index]

    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    print(clf.n_iter_)

    # Evaluate model
    AccuracyScore = accuracy_score(Y_test, y_pred)
    RecallScore = recall_score(Y_test, y_pred)
    PrecisionScore = precision_score(Y_test, y_pred)
    F1Score = f1_score(Y_test, y_pred)

    # Add to lists
    accuracy_score_list.append(AccuracyScore)
    recall_score_list.append(RecallScore)
    precision_score_list.append(PrecisionScore)
    f1_score_list.append(F1Score)

    # Print the matrix
    print('Time: ', time + 1)
    print('\taccuracy_score: ', AccuracyScore)
    print('\trecall_score: ', RecallScore)
    print('\tprecision_score: ', PrecisionScore)
    print('\tf1_score: ', F1Score)

[557]
Time:  1
	accuracy_score:  0.7721372401163417
	recall_score:  0.7446831421956603
	precision_score:  0.7880911680911681
	f1_score:  0.7657724995155439
[896]
Time:  2
	accuracy_score:  0.7724334805558548
	recall_score:  0.7454232177471463
	precision_score:  0.7881134008880792
	f1_score:  0.7661741103547512
[1106]
Time:  3
	accuracy_score:  0.7718679306258752
	recall_score:  0.7424617704070644
	precision_score:  0.7889798020255192
	f1_score:  0.7650142861105718
[891]
Time:  4
	accuracy_score:  0.7715716901863622
	recall_score:  0.7404695240146457
	precision_score:  0.7897094291949006
	f1_score:  0.7642972266992719
[947]
Time:  5
	accuracy_score:  0.7726428052031995
	recall_score:  0.7420848589274176
	precision_score:  0.7905242629344958
	f1_score:  0.7655390768205299


In [11]:
import numpy as np

# Print the average value of each matrix
print("Accuracy: {:.2%}".format(np.average(accuracy_score_list)))
print("Recall: {:.2%}".format(np.average(recall_score_list)))
print("Precision: {:.2%}".format(np.average(precision_score_list)))
print("F1_score: {:.2%}".format(np.average(f1_score_list)))

Accuracy: 77.21%
Recall: 74.30%
Precision: 78.91%
F1_score: 76.54%


## Save the model

In [12]:
import pickle

clf.fit(embedded.detach().numpy(), labels)
print(clf.n_iter_)

with open('./model/LR_bert.pickle', 'wb') as f:
    pickle.dump(clf, f)

[649]


using code below to load the fitted model:

``` python
with open('./model/LR_bert.pickle', 'rb') as f:
    clf = pickle.load(f)
```