In [1]:
import torch
import pandas as pd
import torch.nn as N
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
import math
from torchinfo import summary
from IPython.display import clear_output
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Paths -----------------------------------------------------------------
# CSV indexes consumed by the dataset classes, and the destination of the
# final submission file. All paths are relative to the working directory.
FILES_AND_LABELS = "./kaggle/input/fake-or-real-the-impostor-hunt/data/train_files_and_ground_truth.csv"
TEST_FILES = "./kaggle/input/fake-or-real-the-impostor-hunt/data/test_files.csv"
OUT_FILE = "./kaggle/working/fake-or-real-the-impostor-hunt/submission.csv"

# Only the tokenizer of distilbert-base-uncased is loaded here; it provides
# the vocabulary and the maximum sequence length used below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Model / batching hyper-parameters --------------------------------------
batchSize = 16
# Width of the token embeddings and of every transformer layer.
dModel = 1024
# Every input is padded/truncated to this many tokens.
maxSeqLen = tokenizer.model_max_length
# Number of rows in the embedding table (one per tokenizer vocabulary entry).
vocabSize = len(tokenizer.get_vocab())
nLayers = 8
nHeads = 8
# Hidden width of the feed-forward sub-layer inside each transformer layer.
ffDim = 2048

# Learning rate passed to the optimizer.
lr = 1e-4

# Number of full passes over the training data.
epochs = 10

### Dataset

In [3]:
class TrainDataset(Dataset):
	"""Map-style dataset yielding ``(text, label)`` pairs for training.

	The CSV at ``path`` must have exactly two columns, in this order: the path
	of a text file and its ground-truth label. Text files are read lazily, one
	per ``__getitem__`` call.
	"""

	def __init__(self, path):
		"""Load the index CSV.

		Args:
			path: Location of the CSV listing text-file paths and labels.
		"""
		super(TrainDataset, self).__init__()
		self.__filesAndLabels__: pd.DataFrame = pd.read_csv(path)

	def __len__(self):
		"""Return the number of rows in the index CSV."""
		return self.__filesAndLabels__.shape[0]

	def __getitem__(self, index):
		"""Return ``(inputText, groundTruth)`` for the row at position ``index``.

		Raises:
			ValueError: If the row does not have exactly two columns.
			OSError: If the referenced text file cannot be opened.
		"""
		inputPath, groundTruth = self.__filesAndLabels__.iloc[index]

		# Decode explicitly as UTF-8 so the text does not depend on the
		# platform's default locale encoding.
		with open(inputPath, 'r', encoding='utf-8') as f:
			# Each line keeps its own terminator and gets one more "\n"
			# appended; this matches the text the model has always been fed.
			# A single join replaces repeated string concatenation.
			inputText = "".join(line + "\n" for line in f)

		return inputText, groundTruth
		

In [4]:
class TestDataset(Dataset):
	"""Map-style dataset yielding ``(articleId, fileId, text)`` for inference.

	The CSV at ``path`` must have exactly three columns, in this order: the
	article id, the file id and the path of the text file. Text files are read
	lazily, one per ``__getitem__`` call.
	"""

	def __init__(self, path):
		"""Load the index CSV.

		Args:
			path: Location of the CSV listing article ids, file ids and paths.
		"""
		super(TestDataset, self).__init__()
		self.__filesAndLabels__: pd.DataFrame = pd.read_csv(path)

	def __len__(self):
		"""Return the number of rows in the index CSV."""
		return self.__filesAndLabels__.shape[0]

	def __getitem__(self, index):
		"""Return ``(articleId, fileId, inputText)`` for the row at ``index``.

		Raises:
			ValueError: If the row does not have exactly three columns.
			OSError: If the referenced text file cannot be opened.
		"""
		articleId, fileId, inputPath = self.__filesAndLabels__.iloc[index]

		# Decode explicitly as UTF-8 so the text does not depend on the
		# platform's default locale encoding.
		with open(inputPath, 'r', encoding='utf-8') as f:
			# Each line keeps its own terminator and gets one more "\n"
			# appended, exactly as the training texts are built.
			inputText = "".join(line + "\n" for line in f)

		return articleId, fileId, inputText

### Dataloader

In [5]:
trainData = TrainDataset(FILES_AND_LABELS)
testData = TestDataset(TEST_FILES)

# Training batches are drawn in a fresh random order each epoch; the trailing
# partial batch is dropped so every optimisation step sees batchSize samples.
trainDataloader = DataLoader(trainData, batch_size= batchSize,
							 shuffle= True, drop_last= True)

# Shuffling adds nothing at evaluation time and makes the order of the
# predictions non-reproducible, so the test loader walks the CSV in order.
# NOTE(review): drop_last=True discards the final partial batch, so the last
# len(testData) % batchSize rows are never yielded by this loader. A consumer
# that needs one prediction per test row must not rely on it as-is.
testDataLoader = DataLoader(testData, batch_size= batchSize,
							shuffle= False, drop_last= True)

### Model definition

In [6]:
class Encoder(N.Module):
	"""Transformer-encoder classifier producing one logit per input sequence.

	Token ids are embedded, summed with sinusoidal position codes, passed
	through a causally masked ``N.TransformerEncoder`` and reduced to a single
	value per sequence by three linear layers. Inputs must be padded to exactly
	``maxSeqLen`` tokens because the last layer is sized for that length, and
	``dModel`` must be even so the sine/cosine halves add up to ``dModel``.

	The module follows the device of its inputs and parameters; move it with
	``.to(...)`` like any other ``N.Module``.
	"""

	class PositionalEmbedding(N.Module):
		"""Parameter-free sinusoidal encoding of integer positions."""

		def __init__(self, dModel):
			super().__init__()
			self.dModel = dModel

		def forward(self, input):
			"""Encode a 1-D tensor of positions.

			Returns a tensor of shape ``(len(input), 2 * (dModel // 2))`` whose
			first half holds sines and second half cosines, created on the
			same device as ``input``.
			"""
			halfDim = self.dModel // 2
			# Geometric frequency ladder from 1 down to 1/10000.
			step = math.log(10000) / (halfDim - 1)
			freqs = torch.exp(torch.arange(halfDim, device= input.device) * -step)
			angles = input[:, None] * freqs[None, :]
			return torch.cat((angles.sin(), angles.cos()), dim=-1)

	def __init__(self, dModel, maxSeqLen, nLayers, nHeads, ffDim, vocabSize, dropout= 0.1):
		"""Build the embedding table, the encoder stack and the head.

		Args:
			dModel: Embedding / transformer width.
			maxSeqLen: Fixed sequence length every input is padded to.
			nLayers: Number of stacked transformer encoder layers.
			nHeads: Attention heads per layer.
			ffDim: Hidden width of each layer's feed-forward sub-layer.
			vocabSize: Number of rows in the embedding table.
			dropout: Dropout probability inside the encoder layers.
		"""
		super(Encoder, self).__init__()
		encoder = N.TransformerEncoderLayer(d_model= dModel, nhead= nHeads,
											dim_feedforward= ffDim, dropout= dropout,
											batch_first= True)

		# No explicit device placement here: the caller moves the whole module.
		self.embeddings = N.Embedding(num_embeddings= vocabSize, embedding_dim= dModel)
		self.posEmb = self.PositionalEmbedding(dModel= dModel)
		self.transformerEncoder = N.TransformerEncoder(encoder_layer= encoder, num_layers= nLayers)
		self.fc1 = N.Linear(in_features= dModel, out_features= dModel//2)
		self.fc2 = N.Linear(in_features= dModel//2, out_features= 2)
		self.fc3 = N.Linear(in_features= maxSeqLen * 2, out_features= 1)

	def forward(self, input, padding_mask):
		"""Return one raw logit per sequence.

		Args:
			input: Token ids of shape ``(batch, maxSeqLen)``.
			padding_mask: Tensor of shape ``(batch, maxSeqLen)`` forwarded as
				``src_key_padding_mask``; the notebook's callers pass a bool
				tensor that is True at padded positions.

		Returns:
			Tensor of shape ``(batch,)`` holding unnormalised scores.

		Raises:
			ValueError: If the encoder stack outputs any NaN.
		"""
		embs = self.embeddings(input)
		_, l, h = embs.shape
		dev = embs.device

		# Position codes are built directly on the activations' device and
		# broadcast over the batch dimension when added.
		seqIdx = torch.arange(l, device= dev)
		embs = embs + self.posEmb(seqIdx).reshape(1, l, h)

		# Strictly upper-triangular mask: position i may not attend to j > i.
		causalMask = torch.triu(torch.ones(l, l, device= dev), 1).bool()

		output = self.transformerEncoder(src= embs, mask= causalMask,
										 src_key_padding_mask= padding_mask)

		if output.isnan().any():
			raise ValueError("Transformer encoder produced NaN activations")

		# (batch, l, dModel) -> (batch, l, 2) -> (batch, 2 * l) -> (batch,)
		output = self.fc1(output)
		output = self.fc2(output).flatten(1, 2)
		output: torch.Tensor = self.fc3(output).squeeze(-1)

		return output

### Model instantiation

In [7]:
# Build the classifier from the notebook-level hyper-parameters and place it
# on the selected device (Module.to returns the module itself).
model = Encoder(
	dModel= dModel,
	maxSeqLen= maxSeqLen,
	nLayers= nLayers,
	nHeads= nHeads,
	ffDim= ffDim,
	vocabSize= vocabSize,
).to(device)

# Trace one dummy batch through the model to list layer shapes and parameter
# counts. The two tensors stand in for `input` and `padding_mask`.
summary(
	model,
	input_data=[
		torch.zeros(batchSize, maxSeqLen, dtype= torch.long, device= device),
		torch.zeros(batchSize, maxSeqLen, dtype= torch.bool, device= device),
	],
	col_names=['input_size', 'output_size', 'num_params', 'trainable'],
	row_settings=['var_names'],
	verbose=0,
)

Layer (type (var_name))                       Input Shape               Output Shape              Param #                   Trainable
Encoder (Encoder)                             [16, 512]                 [16]                      --                        True
├─Embedding (embeddings)                      [16, 512]                 [16, 512, 1024]           31,254,528                True
├─PositionalEmbedding (posEmb)                [512]                     [512, 1024]               --                        --
├─TransformerEncoder (transformerEncoder)     --                        [16, 512, 1024]           --                        True
│    └─ModuleList (layers)                    --                        --                        --                        True
│    │    └─TransformerEncoderLayer (0)       [16, 512, 1024]           [16, 512, 1024]           8,399,872                 True
│    │    └─TransformerEncoderLayer (1)       [16, 512, 1024]           [16, 512, 1024]       

### Training

In [8]:
opt = torch.optim.Adam(model.parameters(), lr= lr)
# The model outputs raw logits; this loss applies the sigmoid internally.
loss_fn = N.BCEWithLogitsLoss()

# One record per finished epoch: epoch index, mean batch loss and mean batch
# accuracy, all stored as formatted strings.
epochSnapshot: list[dict] = []

for epoch in range(epochs):
	model.train()
	clear_output(wait= True)

	print(f"Starting epoch {epoch + 1} of {epochs}\n~~~~~~~~~~~~~~~~~~~~~~~~")
	epochLoss = 0
	epochAcc = 0

	torch.cuda.empty_cache()
	gc.collect()

	for idx, (inputText, groundTruth) in enumerate(trainDataloader):
		# Follow the configured device instead of assuming CUDA is present.
		groundTruth = groundTruth.to(device)

		tokens = tokenizer(inputText, padding= "max_length",
						   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")

		token_ids = tokens['input_ids'].to(device)
		# True where the tokenizer added padding.
		padding_mask = (~(tokens['attention_mask'].bool())).to(device)
		bs = token_ids.shape[0]

		# Shift the ids left by one position and fill the last slot with 0.
		# NOTE(review): padding_mask is not shifted, so the first token is
		# dropped and the mask is off by one. This looks like a leftover from
		# a next-token setup; it is kept because inference applies the same
		# shift and changing only one side would create a mismatch.
		target_ids = torch.cat((token_ids[:, 1:],
								torch.zeros(bs, 1, device=device).long()), 1)

		pred = model(target_ids, padding_mask)
		loss = loss_fn(pred, groundTruth.float())

		opt.zero_grad()
		loss.backward()
		opt.step()

		currBatchLoss = loss.item()
		# `pred` holds logits, so probability 0.5 corresponds to logit 0.
		# .item() keeps the running statistics as plain Python numbers.
		currBatchAccuracy = ((pred > 0).int() == groundTruth).sum().item()

		epochLoss += currBatchLoss
		# Normalise by the actual batch length rather than a literal 16.
		epochAcc += (currBatchAccuracy / bs)

		if(idx % 5 == 0):
			print(f"Batch: {idx}; Curr batch loss: {currBatchLoss: 0.5f}; Curr batch acc.: {currBatchAccuracy/bs*100:0.2f}")

	epochLoss /= len(trainDataloader)
	epochAcc /= len(trainDataloader)

	epochSnapshot.append({
		'epoch': f"{epoch}",
		'train_loss': f"{epochLoss:0.7f}",
		'train_acc': f"{epochAcc:0.5f}",
	})

	print(f"Avg train loss: {epochLoss:0.5f}; Avg train acc.: {epochAcc:0.5f}")

Starting epoch 10 of 10
~~~~~~~~~~~~~~~~~~~~~~~~
Batch: 0; Curr batch loss:  0.01250; Curr batch acc.: 100.00
Batch: 5; Curr batch loss:  0.00233; Curr batch acc.: 100.00
Batch: 10; Curr batch loss:  0.00176; Curr batch acc.: 100.00
Avg train loss: 0.01775; Avg train acc.: 1.00000


### Testing

In [9]:
model.eval()
# One record per scored test file: article id, file id and raw model logit.
testingPreds: list[dict] = []

# Score through a dedicated loader that keeps the final partial batch and the
# CSV order, so every row of the test index receives exactly one prediction.
# The model head only fixes the sequence length, not the batch size, so a
# shorter last batch is fine.
evalDataLoader = DataLoader(testData, batch_size= batchSize,
							shuffle= False, drop_last= False)

with torch.inference_mode():
	for articleId, fileId, inputText in evalDataLoader:
		tokens = tokenizer(inputText, padding= "max_length",
						   truncation= True, max_length= maxSeqLen,
						   return_tensors= "pt")

		token_ids = tokens['input_ids'].to(device)
		# True where the tokenizer added padding.
		padding_mask = (~(tokens['attention_mask'].bool())).to(device)
		bs = token_ids.shape[0]

		# Same left shift (last slot filled with 0) that the training loop
		# applies, so inference sees inputs laid out like the training ones.
		target_ids = torch.cat((token_ids[:, 1:],
								torch.zeros(bs, 1, device=device).long()), 1)

		preds = model(target_ids, padding_mask)

		# Convert each batch tensor once and iterate over the real batch
		# length instead of assuming every batch holds batchSize rows.
		for aId, fId, score in zip(articleId.tolist(), fileId.tolist(), preds.tolist()):
			testingPreds.append({
				"articleId": aId,
				"fileId": fId,
				"prediction": score
			})

### Prepping the submission data

In [12]:
# Flatten the per-file prediction records into a frame.
df = pd.DataFrame(testingPreds)

# For each article keep the single row with the highest prediction, then
# reduce it to the two columns written to the submission file.
submissions_df = (
	df.loc[df.groupby('articleId')['prediction'].idxmax()]
	.reset_index(drop=True)
	.drop(columns= ['prediction'])
	.rename(columns= {'articleId': 'id', 'fileId': 'real_text_id'})
)

submissions_df.to_csv(OUT_FILE, index= False)