# Experiment setup
In the cell below we set which experiment to run:
- Which transformer model we want to use.
- Whether we want in-domain or cross-domain testing.
- Whether we want [CLS]-token or sentence embeddings.
- Whether we want to train the final version or are still tuning the parameters.

In [1]:
# Set here the folder that contains the data folder (which in turn contains all the required datasets).
# The path must end with '/', because later cells build file names as f'{dir}data/...'.
# NOTE(review): the name `dir` shadows Python's built-in dir(); it is kept because later cells read it.
dir = '/content/drive/MyDrive/subjectivity_mining/'

# setup which experiment to run
model_name = "diptanu/fBERT"  # select: "diptanu/fBERT", "bert-base-uncased", or path to fine-tuned model
domain = 'in' # select 'in' (OLID training data) or 'cross' (HASOC training data)
embedding_type = 'CLS' # select 'sentence' or 'CLS'
full_train = True # True: train on the "-train-all" file and evaluate on olid-test; False: train on the "-train-small" file and evaluate on the dev file

# Alternative Classification Head


In [2]:
# to supress the output here
%%capture 

# install prerequisites
!pip install simpletransformers
!pip install sentence_transformers

In [7]:
# Show the GPU attached to this runtime (name, driver/CUDA version, memory in use).
!nvidia-smi

Fri Oct 14 14:42:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    53W / 400W |   3340MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import pandas as pd, numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.notebook import tqdm
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the needed data

In [4]:
# setup data sets
# Fail fast on a misspelled setting: previously every value other than 'cross'
# silently selected the in-domain (OLID) data.
if domain not in ('in', 'cross'):
    raise ValueError(f"domain must be 'in' or 'cross', got {domain!r}")

# cross domain trains on HASOC data, in domain trains on OLID data
corpus = 'hasoc' if domain == 'cross' else 'olid'

if full_train:
    # final run: train on all training data and evaluate on the OLID test set
    # (the OLID test set is used for both domains)
    train_file, val_file = f'{corpus}-train-all.csv', 'olid-test.csv'
else:
    # tuning run: smaller training split, evaluated on the dev split of the same corpus
    train_file, val_file = f'{corpus}-train-small.csv', f'{corpus}-dev.csv'

train_df = pd.read_csv(f'{dir}data/{train_file}')
val_df = pd.read_csv(f'{dir}data/{val_file}')

# Define the functions to get both CLS and sentence embeddings

In [5]:
# Helper: run one list of texts through the model, batch by batch
def _encode_cls(texts, tokenizer, encoder, device, batch_size):
    """Return the [CLS] embedding (a list of floats) of every string in texts.

    Args:
        texts (list): the strings to embed
        tokenizer: tokenizer belonging to the encoder
        encoder: transformer model, already moved to device
        device (torch.device): device the encoder lives on
        batch_size (int): number of texts per forward pass

    Returns:
        list: one list of floats per input text, in input order
    """
    embeddings = []
    # The last batch simply holds the remaining texts; nothing is padded to a
    # full batch, so the output has exactly len(texts) entries.
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        # Tokenize data; return_tensors="pt" already yields PyTorch tensors,
        # so they only have to be moved to the device (no torch.tensor() copy).
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}
        # Compute the embeddings using the model, without tracking gradients
        with torch.no_grad():
            output = encoder(**tokens)
        # position 0 of the last hidden layer holds the [CLS] token
        cls_batch = output.last_hidden_state[:, 0, :]
        # Move the batch to the CPU right away so GPU memory is released per
        # batch, and store plain Python floats.
        embeddings.extend(cls_batch.cpu().tolist())
    return embeddings


# Getting the CLS token embedding
def get_cls_embeddings(model, train_data, val_data, batch_size=8):
    """Extract the [CLS] token embeddings for the input train and evaluation data
       from the requested model.

    Args:
        model (str): path to the transformer model (allows huggingface directions)
        train_data (df): DF containing the traindata, strings under header: 'text'
        val_data (df): DF containing the eval data, strings under header: 'text'
        batch_size (int): integer to set the batch size, default: 8

    Returns:
        cls_train, cls_val: lists of embeddings; every embedding is a list of
        floats, and the lists follow the row order of the input data

    Raises:
        ValueError: if batch_size is smaller than 1
    """
    # A negative batch size would otherwise silently produce no embeddings.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # models we use: "/content/drive/MyDrive/subjectivity_mining/outputs/final_bert-base-uncased_model/"
    #                "bert-base-uncased"
    #                "diptanu/fBERT"
    tokenizer = AutoTokenizer.from_pretrained(model)
    encoder = AutoModel.from_pretrained(model)

    # move model to the 'gpu' or 'cpu'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder.to(device)
    # inference only: make sure dropout is disabled
    encoder.eval()

    # get the texts as plain Python lists
    train_texts = train_data["text"].values.tolist()
    val_texts = val_data["text"].values.tolist()

    cls_train = _encode_cls(train_texts, tokenizer, encoder, device, batch_size)
    cls_val = _encode_cls(val_texts, tokenizer, encoder, device, batch_size)

    return cls_train, cls_val



# Alternative method using SBERT (Reimers et al.) (https://www.sbert.net/)
# It has a pooling function that averages all word embeddings in the model's output, using the attention masks.
# This supposedly results in a better sentence embedding.
# Not used in assignment 3, but perhaps useful for assignment 4
def get_embeddings(model, train_data, val_data):
    """Extract the sentence embedding for the input train and evaluation data
       from the requested model. Uses Sentence-transformers method of pooling
       the transformer model output

    Args:
        model (str): path to the transformer model (allows huggingface directions)
        train_data (df): DF containing the traindata, strings under header: 'text'
        val_data (df): DF containing the eval, strings under header: 'text'

    Returns:
        train_embeddings, val_embeddings: the embeddings as returned by
        SentenceTransformer.encode (numpy arrays with its default settings),
        one row per input text
    """

    # Load Transformer model
    sentence_model = SentenceTransformer(model)

    # encode() takes a list of strings. A pandas Series is indexed by label
    # rather than by position, so passing the column directly only works while
    # the DataFrame still has its default 0..n-1 index. Converting to a list
    # removes that dependency and matches get_cls_embeddings.
    train_texts = train_data["text"].values.tolist()
    val_texts = val_data["text"].values.tolist()

    # Get the pooled embeddings
    train_embeddings = sentence_model.encode(train_texts, show_progress_bar=True)
    val_embeddings = sentence_model.encode(val_texts, show_progress_bar=True)

    return train_embeddings, val_embeddings


# Get the embeddings & set the labels

In [6]:
# Pick the embedding extractor requested in the experiment setup.
# Fail fast on a misspelled setting: previously every value other than 'CLS'
# (e.g. 'cls') silently fell back to sentence embeddings.
if embedding_type == 'CLS':
    train_embeddings, val_embeddings = get_cls_embeddings(model_name, train_df, val_df)
elif embedding_type == 'sentence':
    train_embeddings, val_embeddings = get_embeddings(model_name, train_df, val_df)
else:
    raise ValueError(f"embedding_type must be 'CLS' or 'sentence', got {embedding_type!r}")

# gold labels, in the same row order as the embeddings
train_targets = train_df["labels"]
val_targets = val_df["labels"]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/604 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at diptanu/fBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at diptanu/fBERT and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should prob

  0%|          | 0/732 [00:00<?, ?it/s]



  0%|          | 0/108 [00:00<?, ?it/s]



In [None]:
# SVM classification head (RBF kernel), trained on the train embeddings;
# fit() returns the fitted estimator, so setup and training are chained
model = SVC(kernel='rbf').fit(train_embeddings, train_targets)

# predict the labels of the evaluation embeddings
svmpredictions = model.predict(val_embeddings)

# print the experiment settings, followed by the per-class scores
print('model:', model_name, '\nEmbedding type:', embedding_type, '\tdomain:', domain)
svm_report = classification_report(val_targets, svmpredictions)
print(svm_report)

model: diptanu/fBERT 
Embedding type: CLS 	domain: in
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       620
           1       0.76      0.69      0.72       240

    accuracy                           0.85       860
   macro avg       0.82      0.80      0.81       860
weighted avg       0.85      0.85      0.85       860



In [None]:
# Confusion matrix of the SVM predictions: rows follow the true labels
# (val_targets), columns follow the predicted labels (svmpredictions).
confusion_matrix(val_targets, svmpredictions)

array([[569,  51],
       [ 75, 165]])