In [None]:
# Mount Google Drive at /content/drive; the checkpoint and CSV paths used later
# in this notebook all live under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Importing important libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt

In [None]:
# Choose the compute device once: 'cuda' when a GPU is visible, otherwise 'cpu'.
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Length that every encoded comment is padded/truncated to; handed to the dataset as `max_len`.
MAX_LEN = 200
# NOTE(review): the settings below look training-oriented, while the visible cells only run
# inference — confirm which of them are still needed here.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 1e-05
# Tokenizer for the same 'bert-base-uncased' checkpoint name that the model cell loads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Creating Dataset and DataLoader for neural net
class DetoxDataset(Dataset):
    """Map-style dataset yielding encoded comments together with their label vectors.

    Args:
        dataframe: frame exposing a ``comment_text`` column and a ``labels`` column.
        tokenizer: object providing ``encode_plus`` (for example a ``BertTokenizer``).
        max_len: length passed to the tokenizer as ``max_length``; sequences are
            padded/truncated to it.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded comment and float target tensor for the row at position ``index``."""
        # Samplers hand out positions 0..len-1, so rows are read by position (.iloc).
        # Plain [] would look up by index *label* and break (KeyError / wrong row) on any
        # frame whose index is not 0..n-1, e.g. after a split or a filter.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse runs of whitespace (including newlines) into single spaces.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# `AdamW` is no longer exported by current transformers releases, so importing it from
# there makes this cell fail; torch's implementation keeps the name available.
from transformers import BertForSequenceClassification
from torch.optim import AdamW

# Model initialization: BERT encoder plus a freshly initialised 6-output classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)  # We have 6 labels

# Move the weights to the selected device; as the last expression this also echoes the model.
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Path of the saved checkpoint on the mounted Drive (the file name suggests it was written
# after epoch 1 — confirm).
checkpoint_path = "/content/drive/MyDrive/checkpoint_epoch_1.pth"
# Deserialize onto the CPU regardless of the device the tensors were saved from.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path,map_location=torch.device('cpu'))
# Only the 'model_state_dict' entry of the checkpoint is used here.
model.load_state_dict(checkpoint['model_state_dict'])

# Switch to evaluation mode before inference; as the last expression this also echoes the model.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Read the pre-processed comments from Drive; dtype=str keeps every column as text.
test = pd.read_csv(
    '/content/drive/MyDrive/pewdiepie_processed.csv',
    engine='python',
    encoding='utf-8',
    dtype=str,
)
test

Unnamed: 0,0
0,zee music company 106m subscribe
1,i just came to watch this to know how pewdiepi...
2,the funny thing is that s what mr beast said h...
3,mrbeast really passed you know lol
4,now
...,...
21540,nice
21541,nice
21542,finally the true first comment
21543,bigolcat l


In [None]:
class UnseenDataset(Dataset):
    """Map-style dataset yielding encoded comments for inference (no targets).

    Args:
        dataframe: frame exposing a ``comment_text`` column.
        tokenizer: object providing ``encode_plus`` (for example a ``BertTokenizer``).
        max_len: length passed to the tokenizer as ``max_length``; sequences are
            padded/truncated to it.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded tensors for the comment at position ``index``."""
        # Samplers hand out positions 0..len-1, so rows are read by position (.iloc).
        # Plain [] would look up by index *label* and break (KeyError / wrong row) on any
        # frame whose index is not 0..n-1, e.g. after dropping or filtering rows.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse runs of whitespace (including newlines) into single spaces.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long)
        }

# Rename the CSV's '0' column to the attribute name UnseenDataset reads.
new_names = {'0': 'comment_text'}
test = test.rename(columns=new_names)
unseen_set = UnseenDataset(test, tokenizer, MAX_LEN)
# Use the configured evaluation batch size rather than repeating the literal 16.
# shuffle=False keeps batches in row order so predictions line up with `test`.
unseen_loader = DataLoader(unseen_set, batch_size=VALID_BATCH_SIZE, shuffle=False)


In [None]:
def predict(model, data_loader):
    """Run ``model`` over every batch and collect sigmoid probabilities.

    Args:
        model: a ``torch.nn.Module`` called as ``model(ids, mask, token_type_ids)`` whose
            result exposes a ``.logits`` tensor.
        data_loader: iterable of dict batches with ``'ids'``, ``'mask'`` and
            ``'token_type_ids'`` tensors.

    Returns:
        A list of NumPy arrays, one per first-axis row of each batch's logits, holding the
        element-wise sigmoid of those logits, in the order the loader produced them.
    """
    model.eval()
    # Send batches to wherever the model's weights actually are instead of relying on a
    # notebook-level `device` global that may be stale or undefined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    with torch.no_grad():
        for data in data_loader:
            ids = data['ids'].to(target_device, dtype=torch.long)
            mask = data['mask'].to(target_device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(target_device, dtype=torch.long)

            outputs = model(ids, mask, token_type_ids)
            logits = outputs.logits
            # Independent sigmoid per output (not softmax): each label is scored on its own.
            probabilities = torch.sigmoid(logits).cpu().numpy()
            # Extending with a 2-D array appends one 1-D row per sample.
            predictions.extend(probabilities)
    return predictions

# Run inference over every batch of `unseen_loader`; the loader was built with
# shuffle=False, so the results come back in the same order as the rows of `test`.
unseen_predictions = predict(model, unseen_loader)

In [None]:
# Build a new frame (leaving `test` untouched) with each row's probability array
# stored in a 'labels' column, then display it.
test_df = test.assign(labels=unseen_predictions)
test_df

Unnamed: 0,comment_text,labels
0,zee music company 106m subscribe,"[0.0020367468, 0.00046299907, 0.0006722487, 0...."
1,i just came to watch this to know how pewdiepi...,"[0.002775316, 0.00037435818, 0.0006954999, 0.0..."
2,the funny thing is that s what mr beast said h...,"[0.44289106, 0.0056199166, 0.02697604, 0.03176..."
3,mrbeast really passed you know lol,"[0.027319422, 0.00037395136, 0.0019268054, 0.0..."
4,now,"[0.0023457299, 0.00040349388, 0.00065790786, 0..."
...,...,...
21540,nice,"[0.002306302, 0.00045429685, 0.0007195816, 0.0..."
21541,nice,"[0.002306302, 0.00045429685, 0.0007195816, 0.0..."
21542,finally the true first comment,"[0.00227003, 0.00042950665, 0.0006620119, 0.00..."
21543,bigolcat l,"[0.019050244, 0.000479967, 0.002404789, 0.0005..."


In [None]:
# Binarise each probability array at 0.5: entries >= 0.5 become 1, all others 0.
final_predictions = list(
    map(lambda row_probs: (row_probs >= 0.5).astype(int), unseen_predictions)
)
len(final_predictions)

21545

In [None]:
# Work on a separate copy so that adding a predictions column leaves `test` unchanged.
fin_df = test.copy()

In [None]:
# Attach the thresholded 0/1 arrays, one per comment row, and display the frame.
fin_df['predictions'] = final_predictions
fin_df

Unnamed: 0,comment_text,predictions
0,zee music company 106m subscribe,"[0, 0, 0, 0, 0, 0]"
1,i just came to watch this to know how pewdiepi...,"[0, 0, 0, 0, 0, 0]"
2,the funny thing is that s what mr beast said h...,"[0, 0, 0, 0, 0, 0]"
3,mrbeast really passed you know lol,"[0, 0, 0, 0, 0, 0]"
4,now,"[0, 0, 0, 0, 0, 0]"
...,...,...
21540,nice,"[0, 0, 0, 0, 0, 0]"
21541,nice,"[0, 0, 0, 0, 0, 0]"
21542,finally the true first comment,"[0, 0, 0, 0, 0, 0]"
21543,bigolcat l,"[0, 0, 0, 0, 0, 0]"


In [None]:
# Save the comments with their binary predictions back to Drive.
# NOTE(review): with default arguments the DataFrame index is written as an extra column, and
# the array-valued 'predictions' column is stored in its string form — confirm that downstream
# readers expect this layout.
fin_df.to_csv('/content/drive/MyDrive/pewdiepie_results.csv')