<a href="https://colab.research.google.com/github/VellummyilumVinoth/Toxic_Comment_Classification/blob/main/Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! pip install transformers 

from transformers import DistilBertTokenizer, DistilBertModel


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [4]:
from torch import cuda
device = torch.device('cuda' if cuda.is_available() else 'cpu')

print(f"Current device: {device}")

Current device: cpu


In [5]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
EPOCHS = 1
LEARNING_RATE = 2e-05
NUM_WORKERS = 2

In [6]:
from torch.utils.data import Dataset, DataLoader 

class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len: int, eval_mode: bool = False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.text = dataframe.comment_text
        self.eval_mode = eval_mode 
        if self.eval_mode is False:
            self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        output = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }
                
        if self.eval_mode is False:
            output['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.float)
                
        return output

In [8]:
model = DistilBertModel.from_pretrained("/content/drive/MyDrive/finetuned_model")
tokenizer = DistilBertTokenizer.from_pretrained('/content/drive/MyDrive/finetuned_distilbert')

In [9]:
import pandas as pd
predict_data = pd.read_csv("/content/drive/MyDrive/preprocessed_Reddit_Data_1.csv")


In [10]:
columns_to_keep = ['ID', 'Title']
predict_data = predict_data[columns_to_keep]
predict_data.rename(columns={"Title": "comment_text"}, inplace=True)

In [11]:
predict_data

Unnamed: 0,ID,comment_text
0,y7gz80,UkrainianConflict Discussion Megathread
1,10e17wq,Zelenskyy survives over 12 assassination attem...
2,10digs3,In the first round of presidential elections i...
3,10dv085,"A further 20,000 Ukrainian recruits will be tr..."
4,10duei9,"Zelensky: ""Tanks, APCs and artillery are exact..."
...,...,...
971,109fbyj,Hundreds of US military vehicles arrive in Dut...
972,1096adh,BREAKING: Poland will deliver a company of Leo...
973,109envz,Russian airline aircraft suffer massive breakd...
974,1095iye,"The Russian Federation declared that it ""has t..."


In [12]:
predict_set = MultiLabelDataset(predict_data, tokenizer, MAX_LEN, eval_mode = True)
predicting_params = {'batch_size': TRAIN_BATCH_SIZE,
                     'shuffle': True,
                     'num_workers': 2
                    }
predict_loader = DataLoader(predict_set, **predicting_params)

In [13]:
all_predict_pred = []

def predict(epoch):
    model.eval()
    
    with torch.inference_mode():
    
        for _, data in tqdm(enumerate(predict_loader, 0)):

            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            probas = torch.sigmoid(outputs)

            rounded_probas = torch.round(probas)  # Round probabilities to 0 or 1

            all_predict_pred.append(probas)

    return probas

In [15]:
from tqdm import tqdm

probas = predict(model)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
0it [00:00, ?it/s]


RuntimeError: ignored

In [None]:
all_predict_pred = torch.cat(all_predict_pred)

In [None]:
final_df = predict_data.copy()


In [None]:
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

for i,name in enumerate(label_columns):

    final_df[name] = all_predict_pred[:, i].cpu()
    final_df.head()

In [None]:
final_df.to_csv('final.csv', index=False)

In [None]:
final_df