In [None]:
import torch;
import torch.nn as nn;
import torch.nn.functional as F;
from sentence_transformers import SentenceTransformer;

In [None]:
# sent_model = SentenceTransformer('t5-base', device='cuda')
sent_model = SentenceTransformer('sentence-transformers/sentence-t5-base', device='cuda')

In [4]:
def embed(x, input_dim=1682):
        encoding = sent_model.encode(x)
        t = torch.from_numpy(encoding)
        pad_size = input_dim - t.size(0)
        padded_tensor = F.pad(t, (0, 0, 0, pad_size))
        return padded_tensor

embedding = embed(["This is a warning", "And this is one too", "this is some more"])

In [175]:
class sponsoredBye(nn.Module):
    def __init__(self, model, embedding_dim=768, inp=1682, hidden_dim=128, output_dim=2):
        super().__init__()
        self.device = 'cuda'
        self.input_dim = inp
        self.sentence_transformer = model
        # Add padding
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=2, 
                           bidirectional=True, 
                           dropout=0.2,
                           batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.softmax = nn.Softmax(dim=-1)
        
        # Set the embedding model to not be trainable
        for param in self.sentence_transformer.parameters():
            param.requires_grad = False

    def embed(self, x):
        encoding = self.sentence_transformer.encode(x)
        t = torch.from_numpy(encoding).to(self.device)
        pad_size = self.input_dim - t.size(0)
        padded_tensor = F.pad(t, (0, 0, 0, pad_size)).to(self.device)
        return padded_tensor

    def forward(self, x):
#         embedded = self.embed(x)
#         a = embedded.unsqueeze(0)
        outputs, (hidden, cell) = self.lstm(x)
        predictions = self.fc(self.dropout(outputs))
        x = self.softmax(predictions)
#         y = torch.argmax(x, dim=2)
        return x

model = sponsoredBye(sent_model)
model.to('cuda');


In [26]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,315,330 trainable parameters


### Data Collection

In [22]:
import numpy as np

In [7]:
# Create a test dataset
from pymongo import MongoClient
from tqdm import tqdm

cnt = 0
dataset_size = 100
sentences = []
labels = []

client = MongoClient("mongodb://49.13.173.177:27020/")
cursor = client.sponsoredbye.clean.find({}, {'new_clean': 1, 'new_labels': 1})
for elem in tqdm(cursor):
    if cnt >= dataset_size:
        break
    if type(elem['new_clean']) is not str:
        continue
    if type(elem['new_labels']) is not list:
        continue
    sentences.append(elem['new_clean'].replace(". ", "<k>").replace("? ", "<k>").split("<k>"))
    labels.append(elem['new_labels'])
    cnt += 1

105it [00:20,  5.03it/s]


In [36]:
end = [(x,y) for x,y in zip(sentences, labels) if len(x) == len(y)]

In [119]:
sentences = [x[0] for x in end]
labels = [x[1] for x in end]

In [120]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        return sentence, torch.tensor(label, dtype=torch.long)

In [139]:
import torch.nn.functional as F
from torch.utils.data import DataLoader

def collate_fn(batch):
    sentences, labels = zip(*batch)
    
    # Encode sentences using the SentenceTransformer model
    encoded_sentences = [model.sentence_transformer.encode(sentence) for sentence in sentences]

    # Find the longest encoded sentence in the batch
    max_len = max([encoding.shape[0] for encoding in encoded_sentences])

    # Pad sentences to the same length
    padded_sentences = []
    padded_labels = []
    for encoding, label in list(zip(encoded_sentences, labels)):
        tensor_encoding = torch.tensor(encoding, dtype=torch.float)
        pad_size = max_len - tensor_encoding.size(0)
        padded_tensor = F.pad(tensor_encoding, (0, 0, 0, pad_size))
        padded_sentences.append(padded_tensor)
        
        padded_tensor_l = F.pad(label, (0, pad_size))
        padded_labels.append(padded_tensor_l)
        
    
    # Stack sentences into a tensor
    sentences_tensor = torch.stack(padded_sentences)
    # Convert labels to tensor
#     print(labels)
    labels_tensor = torch.stack(padded_labels)
    
    return sentences_tensor, labels_tensor

In [171]:
dataset = TextDataset(sentences, labels)

# Create the DataLoader with the custom collate function
dataloader = DataLoader(dataset, batch_size=50, shuffle=True, collate_fn=collate_fn)

In [176]:
import torch.optim as optim

# Initialize the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 10  # Set the number of epochs

for epoch in range(num_epochs):
    for batch in dataloader:
        s_f, l_f = batch

        optimizer.zero_grad()
        s_f, l_f = s_f.to("cuda"), l_f.to("cuda")

        outputs = model(s_f)
#         print(f"Model output dimension: {outputs.shape}, Label dimension: {l_f.shape}")

        loss = criterion(outputs.view(-1, outputs.size(-1)), l_f.view(-1)) 
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 0.6847
Epoch [1/10], Loss: 0.6550
Epoch [2/10], Loss: 0.6320
Epoch [2/10], Loss: 0.6078
Epoch [3/10], Loss: 0.5798
Epoch [3/10], Loss: 0.5362
Epoch [4/10], Loss: 0.5055
Epoch [4/10], Loss: 0.4955
Epoch [5/10], Loss: 0.4619
Epoch [5/10], Loss: 0.3857
Epoch [6/10], Loss: 0.3992
Epoch [6/10], Loss: 0.3961
Epoch [7/10], Loss: 0.3739
Epoch [7/10], Loss: 0.4150
Epoch [8/10], Loss: 0.3926
Epoch [8/10], Loss: 0.4075
Epoch [9/10], Loss: 0.4251
Epoch [9/10], Loss: 0.3792
Epoch [10/10], Loss: 0.4115
Epoch [10/10], Loss: 0.3707


In [177]:
example = ["Welcome to my new youtrube video", "this video is sponsored by", "raid shadow legends","lets get back into the video", "pokemon is a wild phenomenon"]

In [188]:
torch.argmax(model.forward(model.embed(example))[:5], dim=-1)

tensor([0, 0, 0, 0, 0], device='cuda:0')

'poll'

['I polli',
 ' sono animali domestici',
 ' A loro piace cantare',
 ' ma sono carini',
 '']