In [2]:
import os
import gc
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(1)

import regex as re
from sklearn import metrics
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys
from pathlib import Path
# from time import datetime
from sklearn.model_selection import train_test_split
from datetime import datetime, date
TODAY_DATETIME = str(datetime.now())[:16].replace('-','').replace(' ','_').replace(':','')

# from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from tqdm import tqdm

# Define constants and paths
USER_FOLDER = "/home/202462003"
TODAY_DATETIME = str(datetime.now())[:16].replace('-', '').replace(' ', '_').replace(':', '')
CKPT_PATH = f"{USER_FOLDER}/checkpoints/distilbert-base/AllLayers/{TODAY_DATETIME}"
BEST_MODEL_PATH = f"{CKPT_PATH}/best_model.pt"

# Dynamic user directory path setting
user = os.path.dirname(os.path.realpath(sys.argv[0])).split('/')[2]
processed_dir_path = f"/home/{user}"
training_dataset_pathname = f"{processed_dir_path}/train.csv"
validation_dataset_pathname = f"{processed_dir_path}/test.csv"

# Load the datasets
train_df = pd.read_csv(training_dataset_pathname)
test_df = pd.read_csv(validation_dataset_pathname)

# Drop rows with NaN in the 'category' column
train_df = train_df.dropna(subset=['category'])
train_df = train_df.dropna(subset=['sub_category'])
train_df = train_df.dropna(subset=['crimeaditionalinfo'])

# Drop rows with NaN in the 'category' column
test_df = train_df.dropna(subset=['category'])
test_df = train_df.dropna(subset=['sub_category'])
test_df = train_df.dropna(subset=['crimeaditionalinfo'])


# Drop rows with NaN in the 'category' column
train_df = train_df.dropna(subset=['category'])

# Filter out classes with fewer than 2 samples to allow stratification
train_df = train_df.groupby('category').filter(lambda x: len(x) > 1)

# First split: 60% for training, 40% remaining
train_data, temp_data = train_test_split(
    train_df, test_size=0.4, random_state=42, stratify=train_df['category']
)

# Second split: split remaining 40% into 20% validation and 20% test
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42, stratify=temp_data['category']
)

# Print the sizes of each dataset
print(f"Training data shape: {train_data.shape} (60%)")
print(f"Validation data shape: {val_data.shape} (20%)")
print(f"Test data shape: {test_data.shape} (20%)")
print(f"Original test dataset shape (from file): {test_df.shape}")

# Get sorted list of unique target categories
target_list = sorted(train_df['category'].unique())
print("Target categories:", target_list)

# Cleanup
del train_df, temp_data
gc.collect()


Training data shape: (52243, 3) (60%)
Validation data shape: (17415, 3) (20%)
Test data shape: (17415, 3) (20%)
Original test dataset shape (from file): (87074, 3)
Target categories: ['Any Other Cyber Crime', 'Cryptocurrency Crime', 'Cyber Attack/ Dependent Crimes', 'Cyber Terrorism', 'Hacking  Damage to computercomputer system etc', 'Online Cyber Trafficking', 'Online Financial Fraud', 'Online Gambling  Betting', 'Online and Social Media Related Crime', 'Ransomware']


58

In [3]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['crimeaditionalinfo']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

def maxindex(data_list):
    index=0
    for i in range(len(data_list)):
        if (data_list[i]>data_list[index]):
            index=i
    return index

def accuracy(predictions, targets):
    batch_size=len(predictions)
    count=0.0
    for i in range(batch_size):
        if maxindex(predictions[i]) == maxindex(targets[i]):
            count=count+1
            # print(count)
    # print(f"predictions: {predictions[0]}")
    # print(f"targets: {targets[0]}")
    
    return count*(100.0/batch_size)
    



def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path, epoch):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    Path(checkpoint_path).mkdir(parents=True, exist_ok=True)
    f_path = checkpoint_path + '/ckpt_epoch' + str(epoch) + '.pt'    
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)


class SequenceClassifier(torch.nn.Module):
    def __init__(self,num_labels):
        super(SequenceClassifier, self).__init__()
        self.classifier_model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels = num_labels)
    
    def forward(self, input_ids, attn_mask):
        output = self.classifier_model(
            input_ids,
            attention_mask=attn_mask
        )
        # print('output Tensor:')
        # print(f"{output}")
        return output



def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path): 
   
    # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf


    for epoch in range(1, n_epochs+1):
        train_loss = 0
        valid_loss = 0
        total_accuracy = 0.0
        total_validation_accuracy = 0.0

        model.train()
        pbar = tqdm(total=num_train_batches,colour='green')
        lossbar = tqdm(total=1.0)
        bw_train_loss = []
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            #print('yyy epoch', batch_idx)
            pbar.update(1)
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            
            # print(f"targets:\n{targets}")

            outputs = model(ids, mask).logits
            
            # print(f"outputs:\n{outputs}")

            # _, predicted = torch.max(outputs.data, 1)
            
            batch_accuracy = accuracy(outputs, targets)
            total_accuracy += batch_accuracy

            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            # print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')
            # if batch_idx%5000==0:
             #   print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # print('before loss data in training', loss.item(), train_loss)
            bw_train_loss.append(loss)
            # print('after loss data in training', loss.item(), train_loss)

            lossbar.n= loss.item()
            lossbar.refresh()

        print('############# Epoch {}: Training End     #############'.format(epoch))
        training_accuracy = total_accuracy / len(training_loader)
        print(f"training accuracy is {training_accuracy}")
        pbar.close()
        lossbar.close()

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        ######################  
        # validate the model #
        #####################

        model.eval()

        with torch.no_grad():
            pbar = tqdm(total=num_val_batches, colour='green')
            lossbar = tqdm(total=1.0)
            bw_val_loss = []
            for batch_idx, data in enumerate(validation_loader):
                pbar.update(1)
                ids = data['input_ids'].to(device, dtype = torch.long)
                mask = data['attention_mask'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask).logits
                
                #_, predicted = torch.max(outputs.data, 1)
                batch_accuracy = accuracy(outputs, targets)
                total_validation_accuracy += batch_accuracy

                loss = loss_fn(outputs, targets)
                bw_val_loss.append(loss)
                
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

                lossbar.n= loss.item()
                lossbar.refresh()

            print('############# Epoch {}: Validation End     #############'.format(epoch))
            pbar.close()
            lossbar.close()
            # calculate average losses
            #print('before cal avg train loss', train_loss)
            train_loss = sum(bw_train_loss)/len(training_loader)
            valid_loss = sum(bw_val_loss)/len(validation_loader)
            # print training/validation statistics 
            print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
                    epoch, 
                    train_loss,
                    valid_loss
                    ))
            
            validation_accuracy = total_validation_accuracy / len(validation_loader)
            print(f"validation accuracy is {validation_accuracy}")
            
            # create checkpoint variable and add important data
            checkpoint = {
                    'epoch': epoch + 1,
                    'valid_loss_min': valid_loss,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
            }
            
            # save checkpoint
            save_ckp(checkpoint, False, checkpoint_path, best_model_path, epoch)
        
            ## TODO: save the model if validation loss has decreased
            if valid_loss <= valid_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
                # save checkpoint as best model
                save_ckp(checkpoint, True, checkpoint_path, best_model_path, epoch)
                valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))

    return model

if __name__ == '__main__':
    
    df_train = pd.read_csv(f"{training_dataset_pathname}", index_col=False)
    df_val = pd.read_csv(f"{validation_dataset_pathname}", index_col=False)
    
    df_train_v1 = df_train.loc[:, ['crimeaditionalinfo', 'category']]
    df_train_v1 = df_train_v1.reset_index(drop=True)

    df_val_v1 = df_val.loc[:, ['crimeaditionalinfo', 'category']]
    df_val_v1 = df_val_v1.reset_index(drop=True)

    case_tags_train = pd.get_dummies(df_train_v1['category'], dtype=int)
    case_tags_train_columns = sorted(case_tags_train.columns.tolist())
    case_tags_train=case_tags_train[case_tags_train_columns]

    df_train_v2 = pd.concat([df_train_v1, case_tags_train], axis=1)


    case_tags_val = pd.get_dummies(df_val_v1['category'], dtype=int)
    case_tags_val_columns = case_tags_val.columns.tolist()

    # to add missing columns in validation dataframe 
    for column in case_tags_train_columns:
        if column not in case_tags_val_columns:
            case_tags_val[column]=pd.Series(0,index=range(case_tags_val.shape[0]))
    case_tags_val_columns = sorted(case_tags_val.columns.tolist())
    case_tags_val=case_tags_val[case_tags_val_columns]

    df_val_v2 = pd.concat([df_val_v1, case_tags_val], axis=1)

    

In [None]:
# hyperparameters
    MAX_LEN = 256
    # TRAIN_BATCH_SIZE = 8
    # VALID_BATCH_SIZE = 8
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 64
    EPOCHS = 15
    LEARNING_RATE = 1e-05

    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    
    train_dataset = CustomDataset(df_train_v2, tokenizer, MAX_LEN)
    valid_dataset = CustomDataset(df_val_v2, tokenizer, MAX_LEN)

    train_data_loader = torch.utils.data.DataLoader(train_dataset, 
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=0
    )

    num_train_batches = len(train_data_loader)

    val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
        batch_size=VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=0
    )

    num_val_batches = len(val_data_loader)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # device = torch.device('cpu')
    
    print(f"target list is {target_list}")

    model = SequenceClassifier(len(target_list))
    model.to(device)
    
    optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)


    val_targets=[]
    val_outputs=[]
    
    Path(CKPT_PATH).mkdir(parents=True, exist_ok=True)
    trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, CKPT_PATH, BEST_MODEL_PATH)
    print(f"saved model to {BEST_MODEL_PATH}")
    
