# CSR notebook

Progressive Class Semantic Matching for Semi-Supervised Text Classification [arXiv link](https://arxiv.org/abs/2205.10189).

Baseline version that only uses the K-way classifier.

## Install (possibly)

In [None]:
#Kaggle & Colab: nothing to install
#%pip install torch datasets torchmetrics #Azure

## Imports & Setup

In [None]:
import torch
import numpy as np
from torch.utils.data import TensorDataset, random_split,DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

# Seed the global RNGs. random_split and RandomSampler draw from torch's
# default generator, so without a seed every run shuffles differently.
# 21 matches the seed used for the few-shot selection and the DataLoader
# generator elsewhere in this notebook.
SEED = 21
torch.manual_seed(SEED)
np.random.seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#Local setup:
# Move the working directory one level up; presumably this makes the `src`
# package imported by later cells resolvable -- TODO confirm the repo layout.
# NOTE: `%cd ..` is relative, so re-running this cell climbs another level.
%cd ..

## Loading data

In [None]:
from src.CustomDataLoader import CustomDataLoader

# CSV file backing each split of the Hugging Face dataset.
data_files = {
    "train": "DBPEDIA_train.csv",
    "test": "DBPEDIA_test.csv",
    "validation": "DBPEDIA_val.csv",
}
loader = CustomDataLoader(name="DeveloperOats/DBPedia_Classes", data_files=data_files)

# Load every split, then convert them into a mapping indexed by split name.
dataset = loader.load_huggingface_data()
subsets = [*dataset.keys()]
dfs = loader.to_dataframe(data_dict=dataset, subsets=subsets)

# Class weighting is currently disabled. The alternatives are kept for reference:
#class_weights = loader.compute_class_weight_normal(dataframe=dfs['train'], label="l1")
#class_weights = loader.compute_class_weight_sqrt(dataframe=dfs['train'], label="l1")
#class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
class_weights = None

# Short aliases for the three splits.
df_train, df_validation, df_test = dfs["train"], dfs["validation"], dfs["test"]

In [None]:
## RESTRICT TO FEW-SHOTS
# SHOTS is forwarded as `shots`; presumably it is the number of examples kept
# per class (TODO confirm in CustomDataLoader.selectEqualFewshots).
SHOTS = 100
df_train_shots = loader.selectEqualFewshots(
    df_train,
    label_name="l1",
    text_name="text",
    shots=SHOTS,
    seed=21,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the string-label -> integer-id mapping on the level-1 labels of the
# full training split. fit() returns the encoder itself.
le = LabelEncoder().fit(dfs["train"]["l1"])

# Display the class name behind every integer id. The ids are derived from
# the fitted encoder rather than hard-coded as 0..8, so the cell stays
# correct if the number of classes changes.
le.inverse_transform(np.arange(len(le.classes_)))

# Fine-tune BERT

## Data cleaning and setup

In [None]:
from src.BERTFineTuning import BERTFineTuning

# Fine-tuning helper built from the checkpoint name and the compute device.
fine_tuner = BERTFineTuning(
    "bert-base-uncased",
    device,
)

In [None]:
# Clean the 'text' column of the few-shot training set and of the test set.
# NOTE: the column is overwritten, so re-running this cell cleans text that
# has already been cleaned.
_clean_text = fine_tuner.clean_text
df_train_shots['text'] = df_train_shots['text'].apply(_clean_text)
#df_validation['text'] = df_validation['text'].apply(_clean_text)
df_test['text'] = df_test['text'].apply(_clean_text)

In [None]:
# Raw text and level-1 label arrays of the few-shot training set.
tweets_train = df_train_shots["text"].values
labels_train = df_train_shots["l1"].values

#tweets_validation = df_validation["text"].values
#labels_validation = df_validation["l1"].values

# The same two arrays for the test set.
tweets_test = df_test["text"].values
labels_test = df_test["l1"].values

In [None]:
# The model needs integer class ids instead of string labels.
#
# Both splits are encoded with the LabelEncoder fitted on the training data,
# so an id refers to the same class in train and test. Encoding each split
# with its own np.unique call only agrees when both splits contain exactly
# the same set of labels; a class missing from one split would silently
# shift the ids of that split.
# le.transform raises ValueError for a label that was not seen during fit.
labels_train = le.transform(labels_train)
#labels_validation = le.transform(labels_validation)
labels_test = le.transform(labels_test)

In [None]:
# Tokenize each split; the result unpacks into (input ids, attention masks, labels).
# NOTE: labels_train / labels_test are rebound to the returned labels.
train_encoded = fine_tuner.tokenize(tweets_train, labels_train)
input_ids_train, attention_masks_train, labels_train = train_encoded

#validation_encoded = fine_tuner.tokenize(tweets_validation, labels_validation)
#input_ids_validation, attention_masks_validation, labels_validation = validation_encoded

test_encoded = fine_tuner.tokenize(tweets_test, labels_test)
input_ids_test, attention_masks_test, labels_test = test_encoded

## Create a TensorDataset

In [None]:
# Bundle the tokenized training tensors into one dataset.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

# "Split" into the full dataset plus an empty remainder: the first part is a
# randomly ordered view of every sample.
train_dataset = random_split(dataset_train, [len(dataset_train), 0])[0]

#dataset_validation = TensorDataset(input_ids_validation, attention_masks_validation, labels_validation)
#val_dataset = random_split(dataset_validation, [len(dataset_validation), 0])[0]

# Same treatment for the test tensors.
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
test_dataset = random_split(dataset_test, [len(dataset_test), 0])[0]

In [None]:
# The BERT authors recommend a batch size of 16 or 32 for fine-tuning.
# With so few training samples we take the smaller value.
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

# Evaluation does not depend on sample order, so the data is read sequentially.
#validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=SequentialSampler(val_dataset))
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler)

## The actual fine-tuning of the model

In [None]:
#Login to push to hub
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Read variables from a local .env file into the process environment, then
# authenticate with the value stored under 'TOKEN'.
load_dotenv()
TOKEN = os.environ.get('TOKEN')
login(token=TOKEN)

In [None]:
# The BERT authors recommend 2 to 4 fine-tuning epochs; because only a few
# shots are available, we train for 6 epochs instead.
epochs = 6
HuggingFaceRepoName = "TheChickenAgent/TemporaryRepo"

# Linear learning-rate schedule without warmup, sized as
# (batches per epoch) x (epochs) -- assumes one scheduler step per batch.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    fine_tuner.optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# NOTE(review): the test dataloader is passed as the second argument. If
# `tune` uses it to select the best checkpoint, the test metrics reported
# later are optimistic -- confirm, and prefer the validation split.
fine_tuner.tune(train_dataloader, test_dataloader, scheduler, epochs, HuggingFaceRepoName)

# Loading the best model

In [None]:
# Load the model stored in the local file 'bert_model' (presumably written by
# fine_tuner.tune -- TODO confirm).
# map_location places the loaded tensors on the active device, so a checkpoint
# saved on a GPU also loads on a CPU-only machine and matches the inputs that
# the evaluation loop moves to `device`.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints you created yourself. Recent PyTorch releases
# default to weights_only=True, which rejects fully pickled model objects --
# pass weights_only=False there if this file holds a whole model.
model = torch.load('bert_model', map_location=device)

# Metrics

In [None]:
# Predicted and true class ids for the whole test set, in dataloader order.
predictions = []
targets = []

# Put the model in inference mode so train-only layers such as dropout are
# disabled; otherwise predictions can vary between runs.
model.eval()

# Each batch is (input ids, attention mask, labels), matching the TensorDataset.
for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    targets.extend(batch[2].to('cpu').numpy())

    # No gradients are needed for evaluation.
    with torch.no_grad():
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # The predicted class is the index of the largest logit in each row.
    logits = output.logits.detach().cpu().numpy()
    pred_flat = np.argmax(logits, axis=1).flatten()
    predictions.extend(list(pred_flat))

In [None]:
from collections import Counter

# Frequency table of the predicted ids first, then of the true ids.
for label_ids in (predictions, targets):
    print(Counter(label_ids))

In [None]:
# Metrics with averaging="macro" (presumably every class counts equally --
# TODO confirm in BERTFineTuning.calculate_metrics).
fine_tuner.calculate_metrics(
    y_pred=predictions,
    y_true=targets,
    averaging="macro",
    device='cpu',
)

In [None]:
# Metrics with averaging="weighted" (presumably classes are weighted by their
# support -- TODO confirm in BERTFineTuning.calculate_metrics).
fine_tuner.calculate_metrics(
    y_pred=predictions,
    y_true=targets,
    averaging="weighted",
    device='cpu',
)

# Keyword embeddings

In [None]:
#Login again, maybe the connection timed out

import os
from dotenv import load_dotenv
from huggingface_hub import login

# Reload the .env file and authenticate again with the value stored under 'TOKEN'.
load_dotenv()
TOKEN = os.environ.get('TOKEN')
login(token=TOKEN)

In [None]:
from src.KeywordEmbeddings import KeywordEmbeddings

# Checkpoint used to embed the keywords; the alternatives are kept for reference.
#model_id_keywords = "bert-base-uncased" # No fine-tuning
#model_id_keywords = "TheChickenAgent/DBPedia_Classes_BERT-base-cased-37-6-2" # Model that is fine-tuned on 37 shots
model_id_keywords = HuggingFaceRepoName #Take the model that was just fine-tuned through HuggingFace

keyword_embedder = KeywordEmbeddings(model_id_keywords=model_id_keywords, device=device)
split_labels = keyword_embedder.sublabel_keywords(
    dfs=dfs,
    keywords_same=True,
    main_label="l1",
    sub_label="l2",
)
print() #for nice spacing with the warning from the model loading process
print('Before adding labels', split_labels["train"])

# Append every main label to its own keyword list, in each of the three splits.
for key in split_labels['train']:
    for subset in ('train', 'test', 'validation'):
        split_labels[subset][key].append(key)

print() #for nice spacing
print('After adding labels', split_labels["train"])

l1_to_l2_encoded = keyword_embedder.encoded_mapping(split_labels)

In [None]:
# Stack the per-class encodings, in the mapping's iteration order, into one tensor.
keyword_embeddings = torch.stack(tuple(l1_to_l2_encoded.values()))
#keyword_embeddings.size() #this is [9, 768] == [num_classes, bert_dim]

# Sentence Embeddings

In [None]:
from src.SentenceEmbeddings import SentenceEmbeddings

# The sentence embedder uses the same checkpoint id as the keyword embedder.
sentence_embedder = SentenceEmbeddings(
    model_id_keywords=model_id_keywords,
    device=device,
)
# Usage example:
#encoding(s) = sentence_embedder.encode([sentence1, sentence2])

# The model

In [None]:
from torch.utils.data import DataLoader
from src.CustomTextDataset import CustomTextDataset

# Column names shared by all three splits.
column_names = dict(label_name="l1", text_name="text")

# NOTE(review): earlier in the notebook df_validation's text is not passed
# through clean_text (that line is commented out), unlike train and test.
#training_data = CustomTextDataset(file = df_train, label_name = "l1", text_name = "text")
training_data = CustomTextDataset(file=df_train_shots, **column_names)
testing_data = CustomTextDataset(file=df_test, **column_names)
validation_data = CustomTextDataset(file=df_validation, **column_names)

gen = torch.Generator().manual_seed(21)

# NOTE: train_dataloader / test_dataloader are rebound here and no longer
# refer to the tensor dataloaders built for BERT fine-tuning.
# The training loader shuffles so batches do not contain 10 of class A, then
# 10 of class B, and so on.
train_dataloader = DataLoader(training_data, batch_size=18, shuffle=True, generator=gen)

# Evaluation loaders keep the dataset order.
eval_options = dict(batch_size=64, shuffle=False, generator=gen)
test_dataloader = DataLoader(testing_data, **eval_options)
validation_dataloader = DataLoader(validation_data, **eval_options)

In [None]:
# Show the class name for every integer id, using the fitted encoder's own
# class count instead of a hard-coded 0..8 range.
le.inverse_transform(np.arange(len(le.classes_)))

In [None]:
# Number of distinct classes known to the fitted label encoder.
num_classes = len(le.classes_)

In [None]:
# Embedding widths reported by the two embedders, and their sum.
sentence_dim, keyword_dim = sentence_embedder.sentence_dim, keyword_embedder.keyword_dim
combined_dim = sentence_dim + keyword_dim

In [None]:
from src.CustomImplementationBaseline import CustomImplementation

In [None]:
from src.GenericModels import k_classifier

# Build the K-way classifier from the class count and the sentence-embedding
# width, then move it to the compute device.
model_k_way = k_classifier(
    num_classes=num_classes,
    sTrans_dim=sentence_dim,
)
model_k_way = model_k_way.to(device)

In [None]:
# Hyper-parameters for training the K-way classifier.
LEARNING_RATE = 5e-2
EPOCHS = 150

# Plain SGD over the classifier's parameters.
opti = torch.optim.SGD(params=model_k_way.parameters(), lr=LEARNING_RATE)

## Training

In [None]:
%%time
f = CustomImplementation(sTrans=sentence_embedder, device=device, label_encoder=le, class_weights=class_weights, optimizer = opti)

for t in range(EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    if(t == EPOCHS-1):
        f.train_loop(train_dataloader, model_k_way, plot_cm=True)
    else:
        f.train_loop(train_dataloader, model_k_way)
print("Done!")

## Validation

The results from the validation set are currently not used, because 37 samples are used for the fine-tuning process.

In [None]:
#%%time
#f.test_loop(validation_dataloader, model_k_way, plot_cm=True)

## Testing

In [None]:
%%time
f.test_loop(test_dataloader, model_k_way, plot_cm=True)