# Imports

In [2]:
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import torch
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW, RMSprop, Adam
from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

In [2]:
os.chdir('..') #change directory from \notebooks to \Cluster_Obfuscation
os.getcwd()

'c:\\Users\\sadai\\Desktop\\Cluster_Obfuscation'

# Load Processed Data

In [None]:
train_input_ids = torch.load("./data/author_data/processed_data/X_train_ids.pt")
valid_input_ids = torch.load("./data/author_data/processed_data/X_test_ids.pt")
train_att_masks = torch.load("./data/author_data/processed_data/X_train_masks.pt")
valid_att_masks = torch.load( "./data/author_data/processed_data/X_test_masks.pt")
train_labels = np.load("./data/author_data/processed_data/y_train.npy")
valid_labels = np.load("./data/author_data/processed_data/y_test.npy")

  train_input_ids = torch.load("./data/processed_data/X_train_ids.pt")
  valid_input_ids = torch.load("./data/processed_data/X_test_ids.pt")
  train_att_masks = torch.load("./data/processed_data/X_train_masks.pt")
  valid_att_masks = torch.load( "./data/processed_data/X_test_masks.pt")


# DataLoader

In [20]:
BATCH_SIZE = 25

In [42]:
train_dataset = TensorDataset(train_input_ids, train_att_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_dataset = TensorDataset(valid_input_ids, valid_att_masks, valid_labels)
valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=BATCH_SIZE)

# Define Model

In [22]:
PRETRAINED_LM = "distilbert-base-uncased"#"albert-base-v2"#"bert-base-uncased"
DROPOUT_RATE = 0.1

In [31]:
from torch import nn
N_labels = len(train_labels.unique())#len(train_data['Author'].unique())#len(df_balanced['id'].unique())#len(new_df['id'].unique())
model = DistilBertForSequenceClassification.from_pretrained(PRETRAINED_LM,
                                                      num_labels=N_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False) 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
model.dropout = nn.Dropout(DROPOUT_RATE)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


# Finetune Model

## Optimizer and Scheduler

In [43]:
EPOCHS = 30
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
optimizer = Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = get_linear_schedule_with_warmup(optimizer, 
             num_warmup_steps=0,
             num_training_steps=len(train_dataloader)*EPOCHS )

## Training

In [45]:
train_loss_per_epoch = []
val_loss_per_epoch = []
best_val_loss = float('inf')

train_accuracy_list = []
valid_accuracy_list = []

# # Load saved model state dict
# checkpoint = torch.load(r'distilbert_finetuned_c50_tfidf\distilbert_finetuned_tfidf_cluster_35.pt')
# #1e-6 -> 1e-5
# model.load_state_dict(checkpoint['model_state_dict'])

# # Load saved optimizer state dict if available
# if 'optimizer_state_dict' in checkpoint:
#     optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


for epoch_num in range(EPOCHS):
    print('Epoch: ', epoch_num + 1)
    '''
    Training
    '''
    model.train()
    train_loss = 0
    train_pred = []
    train_label = []
    for step_num, batch_data in enumerate(tqdm(train_dataloader,desc='Training')):
        train_input_ids, train_att_masks, train_labels = [data.to(device) for data in batch_data]
        # input_embeds=model.get_input_embeddings().weight[input_ids].clone()

        output = model(input_ids = input_ids, attention_mask=att_mask, labels=labels)
        # inputs_embeds = new_model.embeddings(input_ids=input_ids, attention_mask=att_mask)
        # output = model(attention_mask=att_mask, inputs_embeds=input_embeds, labels= labels)
        loss = output.loss
        train_loss += loss.item()
        
        train_pred.append(np.argmax(output.logits.detach().cpu().numpy(),axis=-1))
        train_label.append(labels.detach().cpu().numpy())
        model.zero_grad()
        loss.backward()
        del loss

        clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    train_loss_per_epoch.append(train_loss / (step_num + 1))              
    train_pred = np.concatenate(train_pred)
    train_label = np.concatenate(train_label)
    train_accuracy_per_epoch = (train_pred == train_label).mean()
    train_accuracy_list.append(train_accuracy_per_epoch)
    
    '''
    Validation
    '''
    model.eval()
    valid_loss = 0
    valid_pred = []
    valid_label = []
    with torch.no_grad():
        for step_num_e, batch_data in enumerate(tqdm(valid_dataloader,desc='Validation')):
            input_ids, att_mask, labels = [data.to(device) for data in batch_data]
            # input_embeds=model.get_input_embeddings().weight[input_ids].clone()
            output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)
            # output = model(attention_mask=att_mask, inputs_embeds=input_embeds, labels= labels)
            loss = output.loss
            valid_loss += loss.item()
   
            valid_pred.append(np.argmax(output.logits.detach().cpu().numpy(),axis=-1))
            valid_label.append(labels.detach().cpu().numpy())
        
    val_loss_per_epoch.append(valid_loss / (step_num_e + 1))
    valid_pred = np.concatenate(valid_pred)
    valid_label = np.concatenate(valid_label)
    valid_accuracy_per_epoch = (valid_pred == valid_label).mean()
    valid_accuracy_list.append(valid_accuracy_per_epoch)

    #Early Stopping
    if (valid_loss / (step_num_e + 1)) < best_val_loss:
        # torch.save(model.state_dict(), f'best_model_epoch{epoch_num+1}.pt')
        best_val_loss = valid_loss
    else:
        break

    # save model checkpoint every 5th epoch
    if (epoch_num + 1) % 5 == 0:
        checkpoint = {
            'epoch': epoch_num + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': valid_loss,
            'val_accuracy': valid_pred,
        }
        torch.save(checkpoint, fr'model\finetuned_distilbert\distilbert_finetuned{epoch_num+1}.pt')

    '''
    Loss message
    '''
    
    print("{0}/{1} train loss: {2} ".format(step_num+1, math.ceil(len(train_labels) / BATCH_SIZE), train_loss / (step_num + 1)))
    print("{0}/{1} val loss: {2} ".format(step_num_e+1, math.ceil(len(valid_labels) / BATCH_SIZE), valid_loss / (step_num_e + 1)))
    # print(f"{step_num+1}/{math.ceil(len(train_dataloader))} train loss: {train_loss}".format(step_num+1, math.ceil(len(train_dataloader)) / BATCH_SIZE), train_loss / (step_num + 1))
    # print(f"{step_num_e+1}/{math.ceil(len(valid_dataloader))} val loss: {valid_loss} ")#.format(step_num_e+1, math.ceil(len(valid_dataloader)) / BATCH_SIZE), valid_loss / (step_num_e + 1))


Epoch:  1


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.017621688230428846 
40/40 val loss: 5.271534303692169 
Epoch:  2


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.016660808300366627 
40/40 val loss: 5.309915825701319 
Epoch:  3


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.015538801770890131 
40/40 val loss: 5.33518597939983 
Epoch:  4


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.01563669549068436 
40/40 val loss: 5.5215535294963045 
Epoch:  5


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.013820185838267208 
40/40 val loss: 5.4765576787292956 
Epoch:  6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.013300611864542589 
40/40 val loss: 5.642829957581125 
Epoch:  7


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.012832185288425535 
40/40 val loss: 5.582112095667981 
Epoch:  8


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.012649036513175815 
40/40 val loss: 5.568297352362424 
Epoch:  9


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.012257605395279824 
40/40 val loss: 5.7710916404379535 
Epoch:  10


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.012220962526043877 
40/40 val loss: 5.414376278640702 
Epoch:  11


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.012091565987793729 
40/40 val loss: 5.6413539995905015 
Epoch:  12


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011780765419825912 
40/40 val loss: 5.653073392133228 
Epoch:  13


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011856292741140351 
40/40 val loss: 5.5025217404589055 
Epoch:  14


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011243576504057273 
40/40 val loss: 5.816465759510175 
Epoch:  15


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.01130569419474341 
40/40 val loss: 5.806815108028241 
Epoch:  16


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011227602389408276 
40/40 val loss: 6.043868085276335 
Epoch:  17


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011379652563482523 
40/40 val loss: 5.898421565326862 
Epoch:  18


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011266583256656304 
40/40 val loss: 5.914569980325178 
Epoch:  19


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011075919674476608 
40/40 val loss: 5.9246435492066665 
Epoch:  20


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011302106326911598 
40/40 val loss: 5.932538953749463 
Epoch:  21


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.010772544279461727 
40/40 val loss: 5.981716122664511 
Epoch:  22


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.01082147886045277 
40/40 val loss: 5.847567402850837 
Epoch:  23


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.010808451258344575 
40/40 val loss: 5.892991319182329 
Epoch:  24


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.010514052578946576 
40/40 val loss: 5.812806095834821 
Epoch:  25


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.01082374369725585 
40/40 val loss: 5.7964139093644915 
Epoch:  26


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011054858402349054 
40/40 val loss: 5.7964139093644915 
Epoch:  27


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.01117986609460786 
40/40 val loss: 5.7964139093644915 
Epoch:  28


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.01071586002362892 
40/40 val loss: 5.7964139093644915 
Epoch:  29


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.01065378631465137 
40/40 val loss: 5.7964139093644915 
Epoch:  30


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Validation:   0%|          | 0/40 [00:00<?, ?it/s]

160/1 train loss: 0.011364021565532312 
40/40 val loss: 5.7964139093644915 


# Performance 