In [1]:
# Importing libraries
import torch, os
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset

2024-11-06 12:26:17.647246: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 12:26:17.705815: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 12:26:17.721586: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 12:26:17.829085: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Switching to CUDA for faster training and inferencing
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
# Using Google's BERT model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [4]:
# Using the labels obtained from preprocessing notebook file
id2label = {0: 'RapeGang Rape RGRSexually Abusive Content',
 1: 'Hacking  Damage to computercomputer system etc',
 2: 'Report Unlawful Content',
 3: 'Online Financial Fraud',
 4: 'Cyber Terrorism',
 5: 'Online Gambling  Betting',
 6: 'Crime Against Women & Children',
 7: 'Ransomware',
 8: 'Online Cyber Trafficking',
 9: 'Sexually Explicit Act',
 10: 'Child Pornography CPChild Sexual Abuse Material CSAM',
 11: 'Sexually Obscene material',
 12: 'Any Other Cyber Crime',
 13: 'Cyber Attack/ Dependent Crimes',
 14: 'Cryptocurrency Crime',
 15: 'Online and Social Media Related Crime'}

In [5]:
# Using the labels obtained from preprocessing notebook file
label2id = {'RapeGang Rape RGRSexually Abusive Content': 0,
 'Hacking  Damage to computercomputer system etc': 1,
 'Report Unlawful Content': 2,
 'Online Financial Fraud': 3,
 'Cyber Terrorism': 4,
 'Online Gambling  Betting': 5,
 'Crime Against Women & Children': 6,
 'Ransomware': 7,
 'Online Cyber Trafficking': 8,
 'Sexually Explicit Act': 9,
 'Child Pornography CPChild Sexual Abuse Material CSAM': 10,
 'Sexually Obscene material': 11,
 'Any Other Cyber Crime': 12,
 'Cyber Attack/ Dependent Crimes': 13,
 'Cryptocurrency Crime': 14,
 'Online and Social Media Related Crime': 15}

In [6]:
# Creating the model and sending it to CUDA
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=16, id2label=id2label, label2id=label2id)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
# Reading the processed file
import pandas as pd
train_df = pd.read_csv('train_processed.csv')
test_df = pd.read_csv('test_processed.csv')

In [8]:
# Splitting into texts and labels for training, validating and testing

SIZE= test_df.shape[0]

train_texts= list(train_df.crimeaditionalinfo)
train_labels= list(train_df.labels)

val_texts=   list(test_df.crimeaditionalinfo[:SIZE//2])
val_labels=   list(test_df.labels[:SIZE//2])

test_texts=  list(test_df.crimeaditionalinfo[SIZE//2:])
test_labels=  list(test_df.labels[SIZE//2:])

In [9]:
len(train_texts), len(val_texts), len(test_texts)

(92441, 15418, 15418)

In [10]:
# Tokenizing into encoddings
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings  = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [11]:
class DataLoader(Dataset):
    """
    Custom Dataset class for handling tokenized text data and corresponding labels.
    Inherits from torch.utils.data.Dataset.
    """
    def __init__(self, encodings, labels):
        """
        Initializes the DataLoader class with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized input text data
                              (e.g., 'input_ids', 'token_type_ids', 'attention_mask').
            labels (list): A list of integer labels for the input text data.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns a dictionary containing tokenized data and the corresponding label for a given index.

        Args:
            idx (int): The index of the data item to retrieve.

        Returns:
            item (dict): A dictionary containing the tokenized data and the corresponding label.
        """
        # Retrieve tokenized data for the given index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add the label for the given index to the item dictionary
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """
        Returns the number of data items in the dataset.

        Returns:
            (int): The number of data items in the dataset.
        """
        return len(self.labels)

In [12]:
train_dataloader = DataLoader(train_encodings, train_labels)
val_dataloader = DataLoader(val_encodings, val_labels)
test_dataset = DataLoader(test_encodings, test_labels)

In [13]:
from transformers import TrainingArguments, Trainer

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.
    
    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array where each row represents
              an observation, and each column represents the probability of 
              that observation belonging to a certain class.
              
    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score, which is the harmonic mean of precision
              and recall. Macro averaging calculates the metric independently for
              each class and then takes the average.
            - Precision (float): The macro precision, which is the number of true
              positives divided by the sum of true positives and false positives.
            - Recall (float): The macro recall, which is the number of true positives
              divided by the sum of true positives and false negatives.
    """
    # Extract true labels from the input object
    labels = pred.label_ids
    
    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions.argmax(-1)
    
    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    
    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)
    
    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [15]:
training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='./model', 
    do_train=True,
    do_eval=True,
    #  The number of epochs, defaults to 3.0 
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=32,
    # Number of steps used for a linear warmup
    warmup_steps=100,                
    weight_decay=0.01,
    logging_strategy='steps',
   # TensorBoard log directory                 
    logging_dir='./multi-class-logs',            
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps", 
    fp16=True,
    load_best_model_at_end=True
)



In [16]:
trainer = Trainer(
    # The pre-trained model that will be fine-tuned 
    model=model,
    # Training arguments that we defined above                        
    args=training_args,                 
    train_dataset=train_dataloader,         
    eval_dataset=val_dataloader,            
    compute_metrics= compute_metrics
)

In [17]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,1.93,1.379217,0.605072,0.050263,0.040338,0.066667
100,1.1009,1.074231,0.702037,0.205609,0.216677,0.209014
150,0.9252,0.915932,0.714295,0.223381,0.226589,0.237982
200,0.8217,0.86902,0.733234,0.223048,0.228755,0.235125
250,0.8235,0.851976,0.732715,0.242072,0.240151,0.246691
300,0.953,0.825096,0.739785,0.230583,0.238023,0.233863
350,0.8651,0.804614,0.73985,0.23319,0.241614,0.24187
400,0.8368,0.806812,0.735309,0.226479,0.242022,0.238622
450,0.7439,0.791155,0.741666,0.228492,0.251183,0.240253
500,0.7972,0.791787,0.745363,0.230398,0.245408,0.237671


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

TrainOutput(global_step=17334, training_loss=0.6185594653125946, metrics={'train_runtime': 7913.2861, 'train_samples_per_second': 35.045, 'train_steps_per_second': 2.19, 'total_flos': 7.297591915133338e+16, 'train_loss': 0.6185594653125946, 'epoch': 3.0})

In [22]:
# Evaluating on train, test and validation dataset

test = [trainer.evaluate(eval_dataset=df_org) for df_org in [train_dataloader, val_dataloader, test_dataset]]
pd.DataFrame(test, index=["train","val","test"]).iloc[:,:5]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,0.534825,0.816997,0.40568,0.583428,0.373613
val,0.647605,0.772149,0.363562,0.460873,0.335566
test,0.656852,0.769555,0.363122,0.449258,0.341732


In [23]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

In [24]:
def predict(text):
    """
    Predicts the class label for a given input text

    Args:
        text (str): The input text for which the class label needs to be predicted.

    Returns:
        probs (torch.Tensor): Class probabilities for the input text.
        pred_label_idx (torch.Tensor): The index of the predicted class label.
        pred_label (str): The predicted class label.
    """
    # Tokenize the input text and move tensors to the GPU if available
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to("cuda")

    # Get model output (logits)
    outputs = model(**inputs)

    probs = outputs[0].softmax(1)
    """ Explanation outputs: The BERT model returns a tuple containing the output logits (and possibly other elements depending on the model configuration). In this case, the output logits are the first element in the tuple, which is why we access it using outputs[0].

    outputs[0]: This is a tensor containing the raw output logits for each class. The shape of the tensor is (batch_size, num_classes) where batch_size is the number of input samples (in this case, 1, as we are predicting for a single input text) and num_classes is the number of target classes.

    softmax(1): The softmax function is applied along dimension 1 (the class dimension) to convert the raw logits into class probabilities. Softmax normalizes the logits so that they sum to 1, making them interpretable as probabilities. """

    # Get the index of the class with the highest probability
    # argmax() finds the index of the maximum value in the tensor along a specified dimension.
    # By default, if no dimension is specified, it returns the index of the maximum value in the flattened tensor.
    pred_label_idx = probs.argmax()

    # Now map the predicted class index to the actual class label 
    # Since pred_label_idx is a tensor containing a single value (the predicted class index), 
    # the .item() method is used to extract the value as a scalar
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label


In [25]:
# Saving the model
model_path = "indiaai-text-classification-model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('indiaai-text-classification-model/tokenizer_config.json',
 'indiaai-text-classification-model/special_tokens_map.json',
 'indiaai-text-classification-model/vocab.txt',
 'indiaai-text-classification-model/added_tokens.json',
 'indiaai-text-classification-model/tokenizer.json')

In [29]:
# Loading the saved model
model_path = "indiaai-text-classification-model"

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer= BertTokenizerFast.from_pretrained(model_path)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device="cuda")

In [30]:
# Testing
nlp("spam message i recieve msg from unwanted number they say you take loan and today is repayment date but i did not take loani recieve text and whatsapp message any time for make repayment of loan but i did not take any loan")

[{'label': 'Online Financial Fraud', 'score': 0.5689788460731506}]

In [31]:
nlp("The issue actually started when I got this email, which at first glance seemed like spam. I usually ignore such things, but for some reason, I opened it this time. Afterward, my phone started freezing, and it wasnâ€™t normal. I thought it was a technical glitch, but now Iâ€™m not so sure. Honestly, itâ€™s such a mess, and I donâ€™t even know where to start. Iâ€™ve contacted support, but they keep giving me the runaround. The more I tried to fix it, the more problems came up. My email was stange, and then I got locked out of everything. This has really left me anxious and frustrated, I canâ€™t sleep knowing my information is out there. Iâ€™ve even considered deleting all my accounts, but thatâ€™s not a solution. Even my tablet isnâ€™t working right anymore. Itâ€™s so stange how everything just fell apart after that one email.")

[{'label': 'Cyber Attack/ Dependent Crimes', 'score': 0.9991111159324646}]

In [33]:
nlp("Asking money by my name Someone created my facebook clone account and pretending as me and asking money on messenger My friends are calling and telling this I have posted on all my original account as someone is doing this behalf of me")

[{'label': 'Online and Social Media Related Crime',
  'score': 0.836469829082489}]

In [34]:
nlp("""My son was applying for pan card he searched on Google the st website which appeared there was thepancardcom he filled in details did the transaction then he got a form printed it signed it pasted his photograph attached his aadhar card and posted it on the mail address ThePanCardcom
 fourth floorTower 
Shakhti towers
Anna Salai
Chennai
The  digit customer code number for that website is A
""")

[{'label': 'Online Financial Fraud', 'score': 0.8545343279838562}]