# CSR notebook

Progressive Class Semantic Matching for Semi-Supervised Text Classification [arXiv link](https://arxiv.org/abs/2205.10189).

Version 7 corresponds to **smallBERT+F** in the thesis.

Version 7 uses a bert-base-uncased model that was fine-tuned on only 37 samples of each class, together with focal loss.

## Install (possibly)

In [None]:
#Kaggle & Colab: nothing to install
#%pip install torch datasets torchmetrics #Azure

## Imports & Setup

In [None]:
# Local setup: move the notebook's working directory up one level.
# NOTE(review): presumably needed so the `src.*` imports in later cells resolve — confirm for your environment.
%cd ..

In [None]:
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Loading data

In [None]:
from src.CustomDataLoader import CustomDataLoader

# CSV file backing each split of the DBPedia classes dataset.
data_files = {
    "train": "DBPEDIA_train.csv",
    "test": "DBPEDIA_test.csv",
    "validation": "DBPEDIA_val.csv",
}
loader = CustomDataLoader(
    name="DeveloperOats/DBPedia_Classes",
    data_files=data_files,
)

# Load every split and convert each of them to a dataframe.
dataset = loader.load_huggingface_data()
subsets = [*dataset.keys()]
dfs = loader.to_dataframe(data_dict=dataset, subsets=subsets)

# Class weighting is disabled in this version. Alternatives that were tried:
#class_weights = loader.compute_class_weight_normal(dataframe=dfs['train'], label="l1")
#class_weights = loader.compute_class_weight_sqrt(dataframe=dfs['train'], label="l1")
#class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
class_weights = None

# Convenience handles for the individual splits.
df_train, df_validation, df_test = (
    dfs[split] for split in ("train", "validation", "test")
)

# Keyword embeddings

In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Pull variables from a local .env file into the environment, then read the
# Hugging Face access token stored under the name TOKEN.
load_dotenv()
TOKEN = os.environ.get('TOKEN')

# Authenticate against the Hugging Face Hub with that token.
login(token=TOKEN)

In [None]:
from src.KeywordEmbeddings import KeywordEmbeddings

# Checkpoint used for the embeddings.
# Alternative without fine-tuning:
#model_id_keywords = "bert-base-uncased"
# NOTE(review): the repo id below says "cased" while the notebook intro mentions
# bert-base-uncased — confirm which checkpoint is intended.
model_id_keywords = "TheChickenAgent/DBPedia_Classes_BERT-base-cased-37-6-2"  # fine-tuned on 37 shots

keyword_embedder = KeywordEmbeddings(
    model_id_keywords=model_id_keywords,
    device=device,
)
split_labels = keyword_embedder.sublabel_keywords(
    dfs=dfs,
    keywords_same=True,
    main_label="l1",
    sub_label="l2",
)
print()  # empty line so the model-loading warning and the output below are visually separated
print(split_labels["train"])

# Encode the label/keyword mapping produced above.
l1_to_l2_encoded = keyword_embedder.encoded_mapping(split_labels)

In [None]:
# Stack the per-class encodings (in the mapping's iteration order) into one tensor.
# Author's note: keyword_embeddings.size() is [9, 768] == [num_classes, bert_dim]
keyword_embeddings = torch.stack(tuple(l1_to_l2_encoded.values()))

# Sentence Embeddings

In [None]:
from src.SentenceEmbeddings import SentenceEmbeddings

# Sentence embedder built from the same checkpoint id as the keyword embedder.
sentence_embedder = SentenceEmbeddings(
    model_id_keywords=model_id_keywords,
    device=device,
)
# Usage example:
#encoding(s) = sentence_embedder.encode([sentence1, sentence2])

# The model

In [None]:
# Restrict the training frame to a few-shot subset: SHOTS is passed as `shots`,
# selection is keyed on the "l1" label column and uses a fixed seed.
SHOTS = 100
df_train_shots = loader.selectEqualFewshots(
    df_train,
    label_name="l1",
    text_name="text",
    shots=SHOTS,
    seed=21,
)

In [None]:
from torch.utils.data import DataLoader
from src.CustomTextDataset import CustomTextDataset

# Training uses the few-shot subset; to train on the full training frame instead:
#training_data = CustomTextDataset(file = df_train, label_name = "l1", text_name = "text")
training_data = CustomTextDataset(
    file=df_train_shots,
    label_name="l1",
    text_name="text",
)
# Test and validation use their complete frames.
testing_data = CustomTextDataset(
    file=df_test,
    label_name="l1",
    text_name="text",
)
validation_data = CustomTextDataset(
    file=df_validation,
    label_name="l1",
    text_name="text",
)

# One seeded generator, handed to all three loaders.
gen = torch.Generator().manual_seed(21)

# Only the training loader shuffles, so that batches are not made of
# 10 samples of class A, then 10 of class B, and so on.
train_dataloader = DataLoader(
    training_data,
    batch_size=18,
    shuffle=True,
    generator=gen,
)
# Evaluation loaders keep the dataset order.
test_dataloader = DataLoader(
    testing_data,
    batch_size=64,
    shuffle=False,
    generator=gen,
)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=64,
    shuffle=False,
    generator=gen,
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the label encoder on the "l1" column of the full training frame.
le = LabelEncoder().fit(dfs["train"]["l1"])

In [None]:
# Display the class name behind every encoded label id. The ids are derived from
# the fitted encoder instead of a hard-coded 0..8 list, so the cell keeps working
# if the number of classes changes.
le.inverse_transform(list(range(len(le.classes_))))

In [None]:
# Number of distinct classes known to the fitted label encoder.
num_classes = len(le.classes_)

In [None]:
# Dimensions reported by the two embedders.
sentence_dim, keyword_dim = (
    sentence_embedder.sentence_dim,
    keyword_embedder.keyword_dim,
)

# Sum of both dimensions; passed to the matching classifier as `combined_dim`.
combined_dim = sentence_dim + keyword_dim

In [None]:
from src.GenericModels import k_classifier, matching_classifier

# Two separate k_classifier instances with identical arguments (focal first, then k-way),
# followed by the matching classifier. All three are moved to `device`.
model_focal, model_k_way = (
    k_classifier(num_classes=num_classes, sTrans_dim=sentence_dim).to(device)
    for _ in range(2)
)
model_matching = matching_classifier(combined_dim=combined_dim).to(device)

In [None]:
# Training hyper-parameters.
LEARNING_RATE = 5e-2
EPOCHS = 150

# A single SGD optimizer over the parameters of all three models,
# collected in the order k-way, matching, focal.
opti = torch.optim.SGD(
    [
        param
        for model in (model_k_way, model_matching, model_focal)
        for param in model.parameters()
    ],
    lr=LEARNING_RATE,
)

## Training

In [None]:
from src.CustomImplementationV3 import CustomImplementation

#%%time
# Training/evaluation driver, given the sentence embedder, keyword embeddings,
# label encoder, class weights and optimizer.
f = CustomImplementation(
    sTrans=sentence_embedder,
    device=device,
    keyword_embeddings=keyword_embeddings,
    label_encoder=le,
    class_weights=class_weights,
    optimizer=opti,
)

final_epoch = EPOCHS - 1
for t in range(EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    # Only the final epoch explicitly asks for plot_cm=True; every other epoch
    # leaves the argument out and relies on train_loop's own default.
    plot_kwargs = {"plot_cm": True} if t == final_epoch else {}
    f.train_loop(train_dataloader, model_focal, model_k_way, model_matching, **plot_kwargs)
print("Done!")

## Validation

The results on the validation set are currently not used because 37 samples are used for the fine-tuning process.

In [None]:
#%%time
#f.test_loop(validation_dataloader, model_focal, model_k_way, model_matching, plot_cm=True)

## Testing

In [None]:
#%%time
# Evaluate the three models on the test loader, with plot_cm=True.
f.test_loop(
    test_dataloader,
    model_focal,
    model_k_way,
    model_matching,
    plot_cm=True,
)