In [1]:
import torch
import pandas as pd
import torch.nn as N
from torch.utils.data import DataLoader, Dataset
from torcheval.metrics import MultilabelAccuracy
from transformers import AutoTokenizer
import math
from torchinfo import summary
from IPython.display import clear_output
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batchSize = 8
dModel = 1024
vocabSize = len(tokenizer.get_vocab())
nLayers = 8
nHeads = 8
ffDim = 1024

lr = 7e-5

epochs = 10

## Approach 1: Transformer encoder + linear layers for binary classification

In [None]:
FILES_AND_LABELS_M1 = "./kaggle/input/fake-or-real-the-impostor-hunt/data/train_files_and_ground_truth-m1.csv"
TEST_FILES_M1 = "./kaggle/input/fake-or-real-the-impostor-hunt/data/test_files_m1.csv"
OUT_FILE_M1 = "./kaggle/working/fake-or-real-the-impostor-hunt/submission-method-1.csv"

maxSeqLen = tokenizer.model_max_length

### Dataset

In [None]:
class TrainDataset(Dataset):
	def __init__(self, path):
		super(TrainDataset, self).__init__()
		self.__filesAndLabels__: pd.DataFrame = pd.read_csv(path)
	
	def __len__(self):
		return self.__filesAndLabels__.shape[0]
	
	def __getitem__(self, index):
		inputPath, groundTruth = self.__filesAndLabels__.iloc[index]
		inputText = ""

		with open(inputPath, 'r') as f:
			for line in f.readlines():
				inputText += line + "\n"
		
		return inputText, groundTruth
		

In [None]:
class TestDataset(Dataset):
	def __init__(self, path):
		super(TestDataset, self).__init__()
		self.__articlesAndFiles__: pd.DataFrame = pd.read_csv(path)
	
	def __len__(self):
		return self.__articlesAndFiles__.shape[0]
	
	def __getitem__(self, index):
		articleId, fileId, inputPath = self.__articlesAndFiles__.iloc[index]
		inputText = ""

		with open(inputPath, 'r') as f:
			for line in f.readlines():
				inputText += line + "\n"
		
		return articleId, fileId, inputText

### Dataloader

In [None]:
trainData = TrainDataset(FILES_AND_LABELS_M1)
testData = TestDataset(TEST_FILES_M1)

trainDataloader = DataLoader(trainData, batch_size= batchSize,
							 shuffle= True, drop_last= True)
testDataLoader = DataLoader(testData, batch_size= batchSize,
							shuffle= True, drop_last= True)

### Model definition

In [None]:
class Encoder(N.Module):
	class PositionalEmbedding(N.Module):
		def __init__(self, dModel):
			super().__init__()
			self.dModel = dModel

		def forward(self, input):
			emb = math.log(10000) / (self.dModel // 2 - 1)
			emb = torch.exp(torch.arange(self.dModel // 2) * -emb)
			emb = input[:, None] * emb[None, :]
			emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
			return emb

	def __init__(self, dModel, maxSeqLen, nLayers, nHeads, ffDim, vocabSize, dropout= 0.1):
		super(Encoder, self).__init__()
		encoder = N.TransformerEncoderLayer(d_model= dModel, nhead= nHeads,
									  		dim_feedforward= ffDim, dropout= dropout,
											batch_first= True)
		
		self.embeddings = N.Embedding(num_embeddings= vocabSize, embedding_dim= dModel).to(device)
		self.posEmb = self.PositionalEmbedding(dModel= dModel)
		self.transformerEncoder = N.TransformerEncoder(encoder_layer= encoder, num_layers= nLayers)
		self.fc1 = N.Linear(in_features= dModel, out_features= dModel//2)
		self.fc2 = N.Linear(in_features= dModel//2, out_features= 2)
		self.fc3 = N.Linear(in_features= maxSeqLen * 2, out_features= 1)

	def forward(self, input, padding_mask):
		embs = self.embeddings(input)
		bs, l, h = embs.shape

		seqIdx = torch.arange(l)
		posEmb = self.posEmb(seqIdx).reshape(1, l, h).expand(bs, l, h).to(device)
		embs = embs + posEmb

		causalMask = torch.triu(torch.ones(l, l), 1).bool().to(device)

		output = self.transformerEncoder(src= embs, mask= causalMask, 
								   		 src_key_padding_mask= padding_mask)

		if(output.isnan().any()):
			raise ValueError
		
		output = self.fc1(output)
		output = self.fc2(output).flatten(1,2)
		output: torch.Tensor = self.fc3(output).squeeze(-1)

		return output

### Model instantiation

In [None]:
model = Encoder(dModel, maxSeqLen, nLayers, nHeads, ffDim, vocabSize)
model.to(device)

summary(
    model,
    input_data=[
        torch.zeros((batchSize, maxSeqLen), dtype= torch.long).to(device),  # input
        torch.zeros((batchSize, maxSeqLen), dtype= torch.bool).to(device)   # padding_mask
    ],
    col_names=['input_size', 'output_size', 'num_params', 'trainable'],
    row_settings=['var_names'],
    verbose=0
)

### Training

In [None]:
opt = torch.optim.Adam(model.parameters(), lr= lr)
loss_fn = N.BCEWithLogitsLoss()

epochSnapshot: list[dict] = []

for epoch in range(epochs):
	model.train()
	clear_output(wait= True)

	print(f"Starting epoch {epoch + 1} of {epochs}\n~~~~~~~~~~~~~~~~~~~~~~~~")
	epochLoss = 0
	epochAcc = 0

	torch.cuda.empty_cache()
	gc.collect()

	for idx, (inputText, groundTruth) in enumerate(trainDataloader):
		currBatchLoss = 0
		currBatchAccuracy = 0
		groundTruth = groundTruth.cuda()

		tokens = tokenizer(inputText, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")

		token_ids = tokens['input_ids'].to(device)
		padding_mask = (~(tokens['attention_mask'].bool())).to(device)
		bs = token_ids.shape[0]
        
        # Shift the input sequence to create the target sequence
		target_ids = torch.cat((token_ids[:, 1:], 
								torch.zeros(bs, 1, device=device).long()), 1)
		
		pred = model(target_ids, padding_mask)
		loss = loss_fn(pred, groundTruth.float())

		opt.zero_grad()
		loss.backward()
		opt.step()

		currBatchLoss = loss.detach().item()
		currBatchAccuracy += ((pred > 0.5).int() == groundTruth).sum()

		epochLoss += currBatchLoss
		epochAcc += (currBatchAccuracy / 16)

		if(idx % 5 == 0):
			print(f"Batch: {idx}; Curr batch loss: {currBatchLoss: 0.5f}; Curr batch acc.: {currBatchAccuracy/16*100:0.2f}")
		
	epochLoss /= len(trainDataloader)
	epochAcc /= len(trainDataloader)
	
	epochSnapshot.append({
		'epoch': f"{epoch}",
		'train_loss': f"{epochLoss:0.7f}",
		'train_acc': f"{epochAcc:0.5f}",
	})

	print(f"Avg train loss: {epochLoss:0.5f}; Avg train acc.: {epochAcc:0.5f}")

### Testing

In [None]:
model.eval()
testingPreds: list[dict] = []
with torch.inference_mode():
	for idx, (articleId, fileId, inputText) in enumerate(testDataLoader):
		tokens = tokenizer(inputText, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")

		token_ids = tokens['input_ids'].to(device)
		padding_mask = (~(tokens['attention_mask'].bool())).to(device)
		bs = token_ids.shape[0]
        
        # Shift the input sequence to create the target sequence
		target_ids = torch.cat((token_ids[:, 1:], 
								torch.zeros(bs, 1, device=device).long()), 1)
		
		preds = model(target_ids, padding_mask)
		
		for i in range(batchSize):
			testingPreds.append({
                "articleId": articleId[i].item(),
                "fileId": fileId[i].item(),
                "prediction": preds[i].item()
			})

### Prepping the submission data

In [None]:
df = pd.DataFrame(testingPreds)

submissions_df = df.loc[df.groupby('articleId')['prediction'].idxmax()].reset_index(drop=True)
submissions_df.drop(columns= ['prediction'], inplace= True)
submissions_df.rename(columns= {
	'articleId': 'id',
	'fileId': 'real_text_id'
}, inplace= True)

submissions_df.to_csv(OUT_FILE_M1, index= False)

## Approach 2: Combined embeddings to linear layers for classification

In [None]:
FILES_AND_LABELS_M2 = "./kaggle/input/fake-or-real-the-impostor-hunt/data/train_files_and_ground_truth-m2.json"
TEST_FILES_M2 = "./kaggle/input/fake-or-real-the-impostor-hunt/data/test_files_m2.json"
OUT_FILE_M2 = "./kaggle/working/fake-or-real-the-impostor-hunt/submission-method-2.csv"
SEP_TOKEN_ID = tokenizer.sep_token_id

maxSeqLen = tokenizer.model_max_length
# +1 because [SEP] token...
tokenizer.model_max_length = (tokenizer.model_max_length * 2) + 1

### Dataset

In [None]:
class ModifiedTrainDataset(Dataset):
	def __init__(self, path):
		super(ModifiedTrainDataset, self).__init__()
		self.__filesAndLabels__: pd.DataFrame = pd.read_json(path)
	
	def __len__(self):
		return self.__filesAndLabels__.shape[0]
	
	def __getfiletext__(self, filePath: str):
		inputText = ""
		with open(filePath, 'r') as f:
			for line in f.readlines():
				inputText += line + "\n"

		return inputText

	def __getitem__(self, index):
		_, files = self.__filesAndLabels__.iloc[index]
		files = sorted(files, key= lambda x: x['fileId'])

		file1_path = files[0]['filePath']
		file1_label = int(files[0]['label'])
		file1_text = self.__getfiletext__(file1_path)

		file2_path = files[1]['filePath']
		file2_label = int(files[1]['label'])
		file2_text = self.__getfiletext__(file2_path)

		label_tensor = torch.tensor([file1_label, file2_label], dtype= torch.float)

		return file1_text, file2_text, label_tensor

In [None]:
class ModifiedTestDataset(Dataset):
	def __init__(self, path):
		super(ModifiedTestDataset, self).__init__()
		self.__articleAndFiles__: pd.DataFrame = pd.read_json(path)
	
	def __len__(self):
		return self.__articleAndFiles__.shape[0]
	
	def __getfiletext__(self, filePath: str):
		inputText = ""
		with open(filePath, 'r') as f:
			for line in f.readlines():
				inputText += line + "\n"

		return inputText

	def __getitem__(self, index):
		articleId, files = self.__articleAndFiles__.iloc[index]
		files = sorted(files, key= lambda x: x['fileId'])

		file1_path = files[0]['filePath']
		file1_text = self.__getfiletext__(file1_path)

		file2_path = files[1]['filePath']
		file2_text = self.__getfiletext__(file2_path)

		return articleId.item(), file1_text, file2_text

### Dataloader

In [None]:
modifiedTrainData = ModifiedTrainDataset(FILES_AND_LABELS_M2)
modifiedTestData = ModifiedTestDataset(TEST_FILES_M2)

modTrainDataloader = DataLoader(modifiedTrainData, batch_size= batchSize,
							 shuffle= True, drop_last= True)
modTestDataLoader = DataLoader(modifiedTestData, batch_size= 4,
							shuffle= True, drop_last= True)

### Model definition

In [None]:
class Encoder(N.Module):
	class PositionalEmbedding(N.Module):
		def __init__(self, dModel):
			super().__init__()
			self.dModel = dModel

		def forward(self, input):
			emb = math.log(10000) / (self.dModel // 2 - 1)
			emb = torch.exp(torch.arange(self.dModel // 2) * -emb)
			emb = input[:, None] * emb[None, :]
			emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
			return emb

	def __init__(self, dModel, maxSeqLen, nLayers, nHeads, ffDim, vocabSize, dropout= 0.1):
		super(Encoder, self).__init__()
		encoder = N.TransformerEncoderLayer(d_model= dModel, nhead= nHeads,
									  		dim_feedforward= ffDim, dropout= dropout,
											batch_first= True)
		
		self.embeddings = N.Embedding(num_embeddings= vocabSize, embedding_dim= dModel).to(device)
		self.posEmb = self.PositionalEmbedding(dModel= dModel)
		self.transformerEncoder = N.TransformerEncoder(encoder_layer= encoder, num_layers= nLayers)
		self.fc1 = N.Linear(in_features= dModel, out_features= dModel//2)
		self.fc2 = N.Linear(in_features= dModel//2, out_features= 2)
		self.fc3 = N.Linear(in_features= maxSeqLen * 2, out_features= 2)

	def forward(self, input, padding_mask):
		embs = self.embeddings(input)
		bs, l, h = embs.shape

		seqIdx = torch.arange(l)
		posEmb = self.posEmb(seqIdx).reshape(1, l, h).expand(bs, l, h).to(device)
		embs = embs + posEmb

		causalMask = torch.triu(torch.ones(l, l), 1).bool().to(device)

		output = self.transformerEncoder(src= embs, mask= causalMask, 
								   		 src_key_padding_mask= padding_mask)

		if(output.isnan().any()):
			raise ValueError
		
		output = self.fc1(output)
		output = self.fc2(output).flatten(1,2)
		output = self.fc3(output)

		return output

### Model instantiation

In [None]:
model = Encoder(dModel, tokenizer.model_max_length, nLayers, nHeads, ffDim, vocabSize)
model.to(device)

summary(
    model,
    input_data=[
        torch.zeros((batchSize, tokenizer.model_max_length), dtype= torch.long).to(device),  # input
        torch.zeros((batchSize, tokenizer.model_max_length), dtype= torch.bool).to(device)   # padding_mask
    ],
    col_names=['input_size', 'output_size', 'num_params', 'trainable'],
    row_settings=['var_names'],
    verbose=0
)

### Training

In [None]:
opt = torch.optim.Adam(model.parameters(), lr= lr)
loss_fn = N.BCEWithLogitsLoss()
mla = MultilabelAccuracy()

epochSnapshot: list[dict] = []
sep_token = torch.fill(torch.ones(batchSize, 1), SEP_TOKEN_ID).to(device)
padding_mask_sep_token = torch.fill(torch.ones(batchSize, 1), False).to(device)

for epoch in range(epochs):
	model.train()
	clear_output(wait= True)

	print(f"Starting epoch {epoch + 1} of {epochs}\n~~~~~~~~~~~~~~~~~~~~~~~~")
	epochLoss = 0
	epochAcc = 0

	for idx, (file1_text, file2_text, label_tensor) in enumerate(modTrainDataloader):
		currBatchLoss = 0
		currBatchAccuracy = 0
		
		label_tensor = label_tensor.cuda()
		
		tokens1 = tokenizer(file1_text, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")
		
		tokens2 = tokenizer(file2_text, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")
		
		token_ids1 = tokens1['input_ids'].to(device)
		token_ids2 = tokens2['input_ids'].to(device)
		
		target_ids1 = torch.cat((token_ids1[:, 1:], 
								torch.zeros(batchSize, 1, device=device).long()), 1)
		target_ids2 = torch.cat((token_ids2[:, 1:], 
								torch.zeros(batchSize, 1, device=device).long()), 1)
		
		target_ids = torch.cat([target_ids1, sep_token, target_ids2], 1).long()

		padding_mask1 = (~(tokens1['attention_mask'].bool())).to(device)
		padding_mask2 = (~(tokens2['attention_mask'].bool())).to(device)

		padding_mask = torch.cat([padding_mask1, padding_mask_sep_token, 
							padding_mask2], 1)
		
		torch.cuda.empty_cache()
		gc.collect()

		pred = model(target_ids, padding_mask)
		loss = loss_fn(pred, label_tensor)
		
		opt.zero_grad()
		loss.backward()
		opt.step()

		pred_binary = torch.zeros_like(pred)
		pred_binary[torch.arange(pred.size(0)), pred.argmax(dim=1)] = 1
		
		mla.update(pred_binary, label_tensor)
		currBatchLoss = loss.detach().item()
		currBatchAccuracy += (mla.compute() * 100)

		epochLoss += currBatchLoss
		epochAcc += currBatchAccuracy

		if(idx % 5 == 0):
			print(f"Batch: {idx}; Curr batch loss: {currBatchLoss: 0.5f}; Curr batch acc.: {currBatchAccuracy:0.2f}%")
		
	epochLoss /= len(modTrainDataloader)
	epochAcc /= len(modTrainDataloader)
	
	epochSnapshot.append({
		'epoch': f"{epoch}",
		'train_loss': f"{epochLoss:0.7f}",
		'train_acc': f"{epochAcc:0.5f}",
	})

	print(f"Avg train loss: {epochLoss:0.5f}; Avg train acc.: {epochAcc:0.5f}")

### Testing

In [None]:
model.eval()
testingPreds: list[dict] = []
batchSize = 4 
# setting batch size to 4 to ensure no samples are skipped
# and 1068 (number of test files) is divisible by 4
sep_token = torch.fill(torch.ones(batchSize, 1), SEP_TOKEN_ID).to(device)
padding_mask_sep_token = torch.fill(torch.ones(batchSize, 1), False).to(device)

with torch.inference_mode():
	for idx, (articleId, file1_text, file2_text) in enumerate(modTestDataLoader):

		tokens1 = tokenizer(file1_text, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")
		
		tokens2 = tokenizer(file2_text, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")
		
		token_ids1 = tokens1['input_ids'].to(device)
		token_ids2 = tokens2['input_ids'].to(device)
		
		target_ids1 = torch.cat((token_ids1[:, 1:], 
								torch.zeros(batchSize, 1, device=device).long()), 1)
		target_ids2 = torch.cat((token_ids2[:, 1:], 
								torch.zeros(batchSize, 1, device=device).long()), 1)
		
		target_ids = torch.cat([target_ids1, sep_token, target_ids2], 1).long()

		padding_mask1 = (~(tokens1['attention_mask'].bool())).to(device)
		padding_mask2 = (~(tokens2['attention_mask'].bool())).to(device)

		padding_mask = torch.cat([padding_mask1, padding_mask_sep_token, 
							padding_mask2], 1)
		
		torch.cuda.empty_cache()
		gc.collect()

		pred = model(target_ids, padding_mask)
		predFileIds = pred.argmax(dim= 1) + 1

		for i in range(batchSize):
			testingPreds.append({
				'id': articleId[i].item(),
				'real_text_id': predFileIds[i].item()
			})

In [None]:
df = pd.DataFrame(testingPreds)
df.to_csv(OUT_FILE_M2, index= False)

## Approach 3: Dual encoder approach

In [3]:
FILES_AND_LABELS_M3 = "./kaggle/input/fake-or-real-the-impostor-hunt/data/train_files_and_ground_truth-m2.json"
TEST_FILES_M3 = "./kaggle/input/fake-or-real-the-impostor-hunt/data/test_files_m2.json"
OUT_FILE_M3 = "./kaggle/working/fake-or-real-the-impostor-hunt/submission-method-3.csv"

maxSeqLen = tokenizer.model_max_length

### Dataset

In [4]:
class ModifiedTrainDataset(Dataset):
	def __init__(self, path):
		super(ModifiedTrainDataset, self).__init__()
		self.__filesAndLabels__: pd.DataFrame = pd.read_json(path)
	
	def __len__(self):
		return self.__filesAndLabels__.shape[0]
	
	def __getfiletext__(self, filePath: str):
		inputText = ""
		with open(filePath, 'r') as f:
			for line in f.readlines():
				inputText += line + "\n"

		return inputText

	def __getitem__(self, index):
		_, files = self.__filesAndLabels__.iloc[index]
		files = sorted(files, key= lambda x: x['fileId'])

		file1_path = files[0]['filePath']
		file1_label = int(files[0]['label'])
		file1_text = self.__getfiletext__(file1_path)

		file2_path = files[1]['filePath']
		file2_label = int(files[1]['label'])
		file2_text = self.__getfiletext__(file2_path)

		label_tensor = torch.tensor([file1_label, file2_label], dtype= torch.float)

		return file1_text, file2_text, label_tensor

In [5]:
class ModifiedTestDataset(Dataset):
	def __init__(self, path):
		super(ModifiedTestDataset, self).__init__()
		self.__articleAndFiles__: pd.DataFrame = pd.read_json(path)
	
	def __len__(self):
		return self.__articleAndFiles__.shape[0]
	
	def __getfiletext__(self, filePath: str):
		inputText = ""
		with open(filePath, 'r') as f:
			for line in f.readlines():
				inputText += line + "\n"

		return inputText

	def __getitem__(self, index):
		articleId, files = self.__articleAndFiles__.iloc[index]
		files = sorted(files, key= lambda x: x['fileId'])

		file1_path = files[0]['filePath']
		file1_text = self.__getfiletext__(file1_path)

		file2_path = files[1]['filePath']
		file2_text = self.__getfiletext__(file2_path)

		return articleId.item(), file1_text, file2_text

### Dataloader

In [6]:
modifiedTrainData = ModifiedTrainDataset(FILES_AND_LABELS_M3)
modifiedTestData = ModifiedTestDataset(TEST_FILES_M3)

modTrainDataloader = DataLoader(modifiedTrainData, batch_size= batchSize,
							 shuffle= True, drop_last= True)
modTestDataLoader = DataLoader(modifiedTestData, batch_size= 4,
							shuffle= True, drop_last= True)

### Model definition

In [7]:
class Encoder(N.Module):
	class PositionalEmbedding(N.Module):
		def __init__(self, dModel):
			super().__init__()
			self.dModel = dModel

		def forward(self, input):
			emb = math.log(10000) / (self.dModel // 2 - 1)
			emb = torch.exp(torch.arange(self.dModel // 2) * -emb)
			emb = input[:, None] * emb[None, :]
			emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
			return emb

	def __init__(self, dModel, maxSeqLen, nLayers, nHeads, ffDim, vocabSize, dropout= 0.1):
		super(Encoder, self).__init__()
		encoder = N.TransformerEncoderLayer(d_model= dModel, nhead= nHeads,
									  		dim_feedforward= ffDim, dropout= dropout,
											batch_first= True)
		
		self.embeddings = N.Embedding(num_embeddings= vocabSize, embedding_dim= dModel).to(device)
		self.posEmb = self.PositionalEmbedding(dModel= dModel)
		self.transformerEncoder = N.TransformerEncoder(encoder_layer= encoder, num_layers= nLayers)
		self.fc1 = N.Linear(in_features= dModel, out_features= dModel//2)
		self.fc2 = N.Linear(in_features= dModel//2, out_features= dModel//4)
		self.fc3 = N.Linear(in_features= (maxSeqLen * 2) * dModel//4, out_features= 2)

	def forward(self, input1, padding_mask1, input2, padding_mask2):
		embs1 = self.embeddings(input1)
		bs1, l1, h1 = embs1.shape

		seqIdx1 = torch.arange(l1)
		posEmb1 = self.posEmb(seqIdx1).reshape(1, l1, h1).expand(bs1, l1, h1).to(device)
		embs1 = embs1 + posEmb1

		causalMask1 = torch.triu(torch.ones(l1, l1), 1).bool().to(device)

		output1 = self.transformerEncoder(src= embs1, mask= causalMask1, 
								   		 src_key_padding_mask= padding_mask1)
		
		embs2 = self.embeddings(input2)
		bs2, l2, h2 = embs2.shape

		seqIdx2 = torch.arange(l2)
		posEmb2 = self.posEmb(seqIdx2).reshape(1, l2, h2).expand(bs2, l2, h2).to(device)
		embs2 = embs2 + posEmb2

		causalMask2 = torch.triu(torch.ones(l2, l2), 1).bool().to(device)

		output2 = self.transformerEncoder(src= embs2, mask= causalMask2, 
								   		 src_key_padding_mask= padding_mask2)
		
		output = torch.cat([output1, output2], dim= 1)
		output = self.fc1(output)
		output = self.fc2(output).flatten(1,2)
		output = self.fc3(output)

		return output

### Model instantiation

In [8]:
model = Encoder(dModel, tokenizer.model_max_length, nLayers, nHeads, ffDim, vocabSize)
model.to(device)

summary(
    model,
    input_data=[
        torch.zeros((batchSize, tokenizer.model_max_length), dtype= torch.long).to(device),  # input
        torch.zeros((batchSize, tokenizer.model_max_length), dtype= torch.bool).to(device),   # padding_mask
		torch.zeros((batchSize, tokenizer.model_max_length), dtype= torch.long).to(device),  # input
        torch.zeros((batchSize, tokenizer.model_max_length), dtype= torch.bool).to(device)   # padding_mask
    ],
    col_names=['input_size', 'output_size', 'num_params', 'trainable'],
    row_settings=['var_names'],
    verbose=0
)

Layer (type (var_name))                       Input Shape               Output Shape              Param #                   Trainable
Encoder (Encoder)                             [8, 512]                  [8, 2]                    --                        True
├─Embedding (embeddings)                      [8, 512]                  [8, 512, 1024]            31,254,528                True
├─PositionalEmbedding (posEmb)                [512]                     [512, 1024]               --                        --
├─TransformerEncoder (transformerEncoder)     --                        [8, 512, 1024]            --                        True
│    └─ModuleList (layers)                    --                        --                        (recursive)               True
│    │    └─TransformerEncoderLayer (0)       [8, 512, 1024]            [8, 512, 1024]            6,301,696                 True
│    │    └─TransformerEncoderLayer (1)       [8, 512, 1024]            [8, 512, 1024]        

### Training

In [9]:
opt = torch.optim.Adam(model.parameters(), lr= lr)
loss_fn = N.BCEWithLogitsLoss()
mla = MultilabelAccuracy()

epochSnapshot: list[dict] = []

for epoch in range(epochs):
	model.train()
	clear_output(wait= True)

	print(f"Starting epoch {epoch + 1} of {epochs}\n~~~~~~~~~~~~~~~~~~~~~~~~")
	epochLoss = 0
	epochAcc = 0

	for idx, (file1_text, file2_text, label_tensor) in enumerate(modTrainDataloader):
		currBatchLoss = 0
		currBatchAccuracy = 0
		
		label_tensor = label_tensor.cuda()
		
		tokens1 = tokenizer(file1_text, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")
		
		tokens2 = tokenizer(file2_text, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")
		
		token_ids1 = tokens1['input_ids'].to(device)
		token_ids2 = tokens2['input_ids'].to(device)
		
		target_ids1 = torch.cat((token_ids1[:, 1:], 
								torch.zeros(batchSize, 1, device=device).long()), 1)
		target_ids2 = torch.cat((token_ids2[:, 1:], 
								torch.zeros(batchSize, 1, device=device).long()), 1)

		padding_mask1 = (~(tokens1['attention_mask'].bool())).to(device)
		padding_mask2 = (~(tokens2['attention_mask'].bool())).to(device)
		
		torch.cuda.empty_cache()
		gc.collect()

		pred = model(target_ids1, padding_mask1, target_ids2, padding_mask2)
		loss = loss_fn(pred, label_tensor)
		
		opt.zero_grad()
		loss.backward()
		opt.step()

		pred_binary = torch.zeros_like(pred)
		pred_binary[torch.arange(pred.size(0)), pred.argmax(dim=1)] = 1
		
		mla.update(pred_binary, label_tensor)
		currBatchLoss = loss.detach().item()
		currBatchAccuracy += (mla.compute() * 100)

		epochLoss += currBatchLoss
		epochAcc += currBatchAccuracy

		if(idx % 5 == 0):
			print(f"Batch: {idx}; Curr batch loss: {currBatchLoss: 0.5f}; Curr batch acc.: {currBatchAccuracy:0.2f}%")
		
	epochLoss /= len(modTrainDataloader)
	epochAcc /= len(modTrainDataloader)
	
	epochSnapshot.append({
		'epoch': f"{epoch}",
		'train_loss': f"{epochLoss:0.7f}",
		'train_acc': f"{epochAcc:0.5f}",
	})

	print(f"Avg train loss: {epochLoss:0.5f}; Avg train acc.: {epochAcc:0.5f}")

Starting epoch 10 of 10
~~~~~~~~~~~~~~~~~~~~~~~~
Batch: 0; Curr batch loss:  0.00047; Curr batch acc.: 89.38%
Batch: 5; Curr batch loss:  0.00085; Curr batch acc.: 89.88%
Batch: 10; Curr batch loss:  0.00000; Curr batch acc.: 90.34%
Avg train loss: 0.00060; Avg train acc.: 89.87177


### Testing

In [10]:
model.eval()
testingPreds: list[dict] = []
batchSize = 4 
# setting batch size to 4 to ensure no samples are skipped
# and 1068 (number of test files) is divisible by 4

with torch.inference_mode():
	for idx, (articleId, file1_text, file2_text) in enumerate(modTestDataLoader):

		tokens1 = tokenizer(file1_text, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")
		
		tokens2 = tokenizer(file2_text, padding= "max_length", 
					 	   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")
		
		token_ids1 = tokens1['input_ids'].to(device)
		token_ids2 = tokens2['input_ids'].to(device)
		
		target_ids1 = torch.cat((token_ids1[:, 1:], 
								torch.zeros(batchSize, 1, device=device).long()), 1)
		target_ids2 = torch.cat((token_ids2[:, 1:], 
								torch.zeros(batchSize, 1, device=device).long()), 1)

		padding_mask1 = (~(tokens1['attention_mask'].bool())).to(device)
		padding_mask2 = (~(tokens2['attention_mask'].bool())).to(device)
		
		torch.cuda.empty_cache()
		gc.collect()

		pred = model(target_ids1, padding_mask1, target_ids2, padding_mask2)
		predFileIds = pred.argmax(dim= 1) + 1

		for i in range(batchSize):
			testingPreds.append({
				'id': articleId[i].item(),
				'real_text_id': predFileIds[i].item()
			})

In [11]:
df = pd.DataFrame(testingPreds)
df.to_csv(OUT_FILE_M3, index= False)