<a href="https://colab.research.google.com/github/Priyanka-Sachan/Complaint-Identification-using-FL/blob/master/Without_FL/XLnet%2BCNN_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# CNN with XLNet embedding

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU it sees and stop the notebook
# right away when that name is not the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

In [None]:
# Install the third-party packages used by the import cell below
# (`transformers` and `datasets` are imported directly; `sentencepiece` is
# presumably needed by XLNetTokenizer -- TODO confirm).
# NOTE(review): no versions are pinned. This notebook imports `AdamW` from
# transformers and `load_metric` from datasets; confirm both still exist in
# the releases pip resolves, or pin known-good versions here.
!pip install transformers
!pip install sentencepiece
!pip install datasets

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import transformers
transformers.logging.set_verbosity_error()
from transformers import AdamW
from transformers import XLNetModel, XLNetTokenizer
from transformers import get_scheduler
from datasets import load_metric

from tqdm.auto import tqdm
import pandas as pd
import io
import numpy as np
import random

In [None]:
# For Reproducibility
SEED=9
# Seed the Python, NumPy and PyTorch random number generators with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
# cuDNN settings: request deterministic behaviour and switch benchmarking off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance once per epoch with the current validation loss and the
    model. Every improvement saves the model's ``state_dict`` to ``path`` and
    resets the counter; every non-improvement increments the counter. When the
    counter reaches ``patience`` the ``early_stop`` flag is set to True. The
    caller is responsible for checking ``early_stop`` and breaking its loop.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0          # consecutive calls without improvement
        self.best_score = None    # best (negated) validation loss seen so far
        self.early_stop = False
        # `np.inf` is the canonical spelling; the `np.Inf` alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Record one epoch's validation loss and update the stopping state.

        Args:
            val_loss (float): Validation loss of the epoch that just finished.
            model: Object exposing ``state_dict()``; checkpointed on improvement.
        """
        # Work with the negated loss so that "larger is better".
        score = -val_loss

        if self.best_score is None:
            # First call: whatever we got is the best so far.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by at least `delta`.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
# Pick the compute device, falling back to the CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show the GPU name as the cell output. Only query device 0 when CUDA is
# actually available: the unguarded call raises on a CPU-only machine, which
# contradicts the CPU fallback above.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device available; using CPU"

## Reading and Pre-processing dataset

In [None]:
# Read the headerless complaints CSV and assign the four column names explicitly.
data = pd.read_csv(
    "complaints-data.csv",
    names=['id', 'tweet', 'y', 'industry'],
    header=None,
)

In [None]:
# Tokenize every tweet into a fixed-length encoding.
sentences=data.tweet.values
MAX_LEN=49
# NOTE(review): `do_lower_case=True` is requested for a *cased* checkpoint -- confirm this is intended.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
# Options shared by every tokenizer call: no special tokens, pad/truncate to MAX_LEN.
tokenizer_kwargs = dict(add_special_tokens=False,
                        padding='max_length',
                        truncation="only_first",
                        max_length=MAX_LEN)
tokens = []
for sentence in sentences:
    tokens.append(tokenizer(sentence, **tokenizer_kwargs))

In [None]:
# Collect the per-sentence encodings into arrays, one row per sentence,
# and take the labels from the `y` column.
encoded = {
    field: np.asarray([np.asarray(encoding[field]) for encoding in tokens])
    for field in ('input_ids', 'attention_mask')
}
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = data.y.values

## Model

In [None]:
# Load the pretrained XLNet encoder that supplies the word embeddings.
xlnet = XLNetModel.from_pretrained('xlnet-base-cased')
# Freeze it: turn off gradient tracking for every one of its parameters.
xlnet.requires_grad_(False)

In [None]:
class CNN(nn.Module):
    """A 1D Convolutional Neural Network for sentence classification.

    Token embeddings produced by ``xlnet`` are passed through several parallel
    1D convolutions (one per filter size), max-pooled over the sequence
    dimension, concatenated and fed to a linear classifier.
    """
    def __init__(self,
                 xlnet,
                 embed_dim=768,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.1):
        """
        The constructor for the CNN class.

        Args:
            xlnet: Embedding module. It is called as
                ``xlnet(input_ids=..., attention_mask=...)`` and the first
                element of its output is used as the token embeddings, which
                must have ``embed_dim`` as their last dimension.
            embed_dim (int): Dimension of the token embeddings. Default: 768
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            num_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate applied before the classifier.
                Default: 0.1

        Raises:
            ValueError: If `filter_sizes` and `num_filters` differ in length.
        """

        super(CNN, self).__init__()
        # Fail loudly on mismatched lists. Otherwise a short `num_filters`
        # raises an opaque IndexError and a long one silently builds a
        # classifier whose input width disagrees with the conv outputs.
        if len(filter_sizes) != len(num_filters):
            raise ValueError(
                "filter_sizes and num_filters must have the same length, "
                "got {} and {}".format(len(filter_sizes), len(num_filters)))

        self.embed_dim = embed_dim
        # Embedding layer
        self.xlnet = xlnet
        # Conv network: one Conv1d per (filter size, filter count) pair
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=filter_count,
                      kernel_size=filter_size)
            for filter_size, filter_count in zip(filter_sizes, num_filters)
        ])
        # Fully-connected layer and Dropout
        self.dropout = nn.Dropout(p=dropout)
        # Plain Python int so nn.Linear never receives a NumPy scalar.
        self.fc = nn.Linear(int(sum(num_filters)), num_classes)

    def forward(self, ids,masks):
        """Perform a forward pass through the network.

        Args:
            ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)
            masks (torch.Tensor): The attention mask passed to the embedding
                module alongside `ids`; presumably the same shape as `ids`.

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                num_classes)
        """

        # Get embeddings from `ids`. Output shape: (b, max_len, embed_dim)
        x_embed = self.xlnet(input_ids=ids,attention_mask=masks)[0].float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling over the whole sequence. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],dim=1)

        # Compute logits. Output shape: (b, num_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

## Training and Testing functions

In [None]:
# Function to create dataloader
def create_dataloader(input_ids,masks,labels,batch_size=32,shuffle=False):
  """Wrap token ids, attention masks and labels in a batched DataLoader.

  Args:
      input_ids: Array-like of token ids, one row per example.
      masks: Array-like of attention masks aligned with `input_ids`.
      labels: Array-like of labels, one per example.
      batch_size (int): Number of examples per batch. Default: 32
      shuffle (bool): If True, draw examples with a RandomSampler; if False
          (the default) keep the original order with a SequentialSampler.

  Returns:
      DataLoader: Yields `(input_ids, masks, labels)` tensor batches.
  """
  dataset = TensorDataset(torch.tensor(input_ids),
                          torch.tensor(masks),
                          torch.tensor(labels))
  sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
  return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

In [None]:
#Function to train and validate a model
def train_and_validate_model(learning_rate,train_dataloader,validation_dataloader,num_epochs=4,patience=3):
    """Train a fresh CNN and evaluate it on a validation set after every epoch.

    Args:
        learning_rate (float): Learning rate handed to the AdamW optimizer.
        train_dataloader: DataLoader yielding `(input_ids, masks, labels)`
            batches used for training.
        validation_dataloader: DataLoader yielding batches in the same layout,
            evaluated after each training epoch.
        num_epochs (int): Maximum number of epochs. Default: 4
        patience (int): Number of non-improving epochs tolerated by
            EarlyStopping before training is aborted. Default: 3

    Returns:
        tuple: `(train_accuracy, train_loss, valid_accuracy, valid_loss)` of
        the last epoch that did not trigger early stopping.
    """
    model = CNN(xlnet)
    # Use the notebook-wide `device` rather than an unconditional `.cuda()`,
    # so the model lives where the batches are moved to below.
    model.to(device)

    # Only hand parameters that can receive gradients to the optimizer.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=learning_rate)
    # optimizer = optim.Adadelta(model.parameters(),lr=learning_rate,rho=0.95)
    criterion=nn.CrossEntropyLoss()

    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )
    early_stopping = EarlyStopping(patience=patience, verbose=False)

    train_accuracy_metric=load_metric("accuracy")
    valid_accuracy_metric=load_metric("accuracy")

    # Defined up front so the return statement is valid on every path.
    train_accuracy,valid_accuracy=0.0,0.0
    train_loss,valid_loss=0,0
    pr_train_loss,pr_valid_loss=0,0

    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(num_epochs):

        train_losses = []
        valid_losses = []

        # NOTE(review): model.train() also puts the frozen encoder in training
        # mode; if it contains dropout, its embeddings are stochastic while
        # training -- confirm this is intended.
        model.train()
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            logits = model(b_input_ids,b_input_mask)
            loss=criterion(logits,b_labels)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            predictions = torch.argmax(logits, dim=-1)
            train_accuracy_metric.add_batch(predictions=predictions,references=b_labels)

            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        model.eval()
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                logits = model(b_input_ids,b_input_mask)
            loss=criterion(logits,b_labels)

            valid_losses.append(loss.item())
            predictions = torch.argmax(logits, dim=-1)
            valid_accuracy_metric.add_batch(predictions=predictions,references=b_labels)

        # Keep the previous epoch's losses so they can be restored on early stop.
        pr_train_loss=train_loss
        train_loss = np.average(train_losses)
        pr_valid_loss=valid_loss
        valid_loss = np.average(valid_losses)

        early_stopping(valid_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            # Report the previous epoch's numbers, not the epoch that tripped the stop.
            valid_loss=pr_valid_loss
            train_loss=pr_train_loss
            break

        train_accuracy=train_accuracy_metric.compute()['accuracy']
        valid_accuracy=valid_accuracy_metric.compute()['accuracy']

        print("EPOCH: {}".format(epoch+1),
              "| Train accuracy: {:7.5f}".format(train_accuracy),
              "| Train loss: {:7.5f}".format(train_loss),
              "| Validation accuracy: {:7.5f}".format(valid_accuracy),
              "| Validation loss: {:7.5f}".format(valid_loss))

    return train_accuracy,train_loss,valid_accuracy,valid_loss

In [None]:
#Function to train and test a model
def train_and_test_model(learning_rate,train_dataloader,test_dataloader,num_epochs=4):
    """Train a fresh CNN for a fixed number of epochs, then score it on a test set.

    Args:
        learning_rate (float): Learning rate handed to the AdamW optimizer.
        train_dataloader: DataLoader yielding `(input_ids, masks, labels)`
            batches used for training.
        test_dataloader: DataLoader yielding batches in the same layout,
            evaluated once after training has finished.
        num_epochs (int): Number of training epochs. Default: 4

    Returns:
        tuple: `(train_accuracy, train_loss, test_accuracy, test_loss,
        test_precision, test_recall, test_f1)`; the training figures are those
        of the final epoch.
    """
    model = CNN(xlnet)
    # Use the notebook-wide `device` rather than an unconditional `.cuda()`,
    # so the model lives where the batches are moved to below.
    model.to(device)

    # Only hand parameters that can receive gradients to the optimizer.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=learning_rate)
    # optimizer = optim.Adadelta(model.parameters(),lr=learning_rate,rho=0.95)
    criterion=nn.CrossEntropyLoss()

    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    train_accuracy_metric=load_metric("accuracy")
    test_accuracy_metric=load_metric("accuracy")
    test_precision_metric=load_metric("precision")
    test_recall_metric=load_metric("recall")
    test_f1_metric=load_metric("f1")

    # Defined up front so the return statement is valid even with zero epochs.
    train_accuracy=0.0
    train_loss,test_loss=0,0

    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(num_epochs):

        train_losses = []
        model.train()
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            logits = model(b_input_ids,b_input_mask)
            loss=criterion(logits,b_labels)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())
            predictions = torch.argmax(logits, dim=-1)
            train_accuracy_metric.add_batch(predictions=predictions,references=b_labels)

            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        train_accuracy=train_accuracy_metric.compute()['accuracy']
        train_loss = np.average(train_losses)
        print("EPOCH: {}".format(epoch+1),
              "| Train accuracy: {:7.5f}".format(train_accuracy),
              "| Train loss: {:7.5f}".format(train_loss))

    # Single evaluation pass over the held-out test set.
    test_losses = []
    model.eval()
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids,b_input_mask)
        loss=criterion(logits,b_labels)

        test_losses.append(loss.item())
        predictions = torch.argmax(logits, dim=-1)
        # Feed the same predictions to every test metric.
        for metric in (test_accuracy_metric, test_precision_metric,
                       test_recall_metric, test_f1_metric):
            metric.add_batch(predictions=predictions,references=b_labels)

    test_loss = np.average(test_losses)
    test_accuracy=test_accuracy_metric.compute()['accuracy']
    test_precision=test_precision_metric.compute()['precision']
    test_recall=test_recall_metric.compute()['recall']
    test_f1=test_f1_metric.compute()['f1']

    print("Learning Rate: {}".format(learning_rate)+
        " | Test accuracy: {:7.5f}".format(test_accuracy)+
        " | Test loss: {:7.5f}".format(test_loss)+
        " | Test precision: {:7.5f}".format(test_precision)+
        " | Test recall: {:7.5f}".format(test_recall)+
        " | Test f1 score: {:7.5f}".format(test_f1))

    return train_accuracy,train_loss,test_accuracy,test_loss,test_precision,test_recall,test_f1

## Evaluation

In [None]:
# Outer cross-validation splitter: 10 shuffled folds with a fixed random state.
outer_cv = KFold(
    n_splits=10,
    random_state=1,
    shuffle=True,
)

In [None]:
# Materialise the outer split once. For every one of the 10 folds keep the
# row indices of the training+validation part (9 folds) and of the test part (1 fold).
outer_splits = list(outer_cv.split(input_ids))
train_valid_ids = [fold_train_valid for fold_train_valid, _ in outer_splits]
test_ids = [fold_test for _, fold_test in outer_splits]

In [None]:
# #To save train_valid_set and test_set iterations of outer cross validation
# torch.save(train_valid_ids, 'train_valid_ids.pt')
# torch.save(test_ids, 'test_ids.pt')
# buffer = io.BytesIO()
# torch.save(train_valid_ids, buffer)
# torch.save(test_ids, buffer)

In [None]:
# #Loading train_valid_set and test_set
# with open('train_valid_ids.pt', 'rb') as f:
#   buffer = io.BytesIO(f.read())
# train_valid_ids=torch.load(buffer)

# with open('test_ids.pt', 'rb') as f:
#   buffer = io.BytesIO(f.read())
# test_ids=torch.load(buffer)

### K-fold Cross Validation

In [None]:
# lr=0.001
# # FOR CROSS VALIDATION
# train_accuracy,train_loss,test_loss,test_accuracy,test_recall,test_precision,test_f1=[],[],[],[],[],[],[]

# #Split in 10 folds - 9 folds for training+validation set and 1 fold for test set
# for outer_cv_count in range(10):

#   train_dataloader=create_dataloader(input_ids[train_valid_ids[outer_cv_count]],
#                                          attention_masks[train_valid_ids[outer_cv_count]],
#                                          labels[train_valid_ids[outer_cv_count]])

#   test_dataloader=create_dataloader(input_ids[test_ids[outer_cv_count]],
#                                     attention_masks[test_ids[outer_cv_count]],
#                                     labels[test_ids[outer_cv_count]])

#   tr_accuracy,tr_loss,ts_accuracy,ts_loss,ts_precision,ts_recall,ts_f1=train_and_test_model(lr,train_dataloader,test_dataloader)    

#   train_accuracy.append(tr_accuracy)    
#   train_loss.append(tr_loss)
#   test_accuracy.append(ts_accuracy)
#   test_loss.append(ts_loss)
#   test_precision.append(ts_precision)
#   test_recall.append(ts_recall)
#   test_f1.append(ts_f1)
#   print('----------------------------------------------------------------------------------------------------')

#   torch.cuda.empty_cache() 

In [None]:
# print("Train accuracy:",torch.std_mean(torch.Tensor(train_accuracy)))
# print("Train loss:",torch.std_mean(torch.Tensor(train_loss)))
# print("Test accuracy:",torch.std_mean(torch.Tensor(test_accuracy)))
# print("Test loss:",torch.std_mean(torch.Tensor(test_loss)))
# print("Test precision:",torch.std_mean(torch.Tensor(test_precision)))
# print("Test recall:",torch.std_mean(torch.Tensor(test_recall)))
# print("Test f1:",torch.std_mean(torch.Tensor(test_f1)))

### Nested K-fold Cross Validation

In [None]:
# FOR NESTED CROSS VALIDATION
# Outer loop: estimate test performance on each held-out fold.
# Inner loop: pick the learning rate using only that fold's training+validation rows.
inner_cv = KFold(n_splits=3, shuffle=True, random_state=1)
train_val_accuracy,train_val_loss,test_loss,test_accuracy,test_recall,test_precision,test_f1=[],[],[],[],[],[],[]
best_learning_rate=[]

#Check for hyperparameters
learning_rate=[ 3e-4,5e-4,1e-3,3e-3 ]

#One iteration per outer fold - 9 folds for training+validation set and 1 fold for test set
for outer_cv_count in range(len(train_valid_ids)):

  # Row indices (into the full arrays) of this outer fold's two parts.
  outer_train_ids=np.asarray(train_valid_ids[outer_cv_count])
  outer_test_ids=np.asarray(test_ids[outer_cv_count])

  train_val_dataloader=create_dataloader(input_ids[outer_train_ids],
                                         attention_masks[outer_train_ids],
                                         labels[outer_train_ids])

  test_dataloader=create_dataloader(input_ids[outer_test_ids],
                                    attention_masks[outer_test_ids],
                                    labels[outer_test_ids])

  mean_train_loss,mean_train_accuracy=[],[]
  mean_validation_loss,mean_validation_accuracy=[],[]

  #Enumerating over all hyperparameter combinations
  for i in range(len(learning_rate)):

    inner_cv_count=0
    train_loss,train_accuracy=[],[]
    validation_loss,validation_accuracy=[],[]

    #Split in 3 folds - 2 folds for train set and 1 fold for validation
    for train_pos, valid_pos in inner_cv.split(outer_train_ids):

      # `inner_cv.split` yields POSITIONS within `outer_train_ids`, not rows of
      # the full dataset. Map them back through `outer_train_ids`; indexing
      # the full arrays with the raw positions would select the wrong rows and
      # could pull outer test rows into hyperparameter selection.
      train_id=outer_train_ids[train_pos]
      valid_id=outer_train_ids[valid_pos]

      train_dataloader=create_dataloader(input_ids[train_id],
                                        attention_masks[train_id],
                                        labels[train_id])
      validation_dataloader=create_dataloader(input_ids[valid_id],
                                        attention_masks[valid_id],
                                        labels[valid_id])

      tr_accuracy,tr_loss,vd_accuracy,vd_loss=train_and_validate_model(learning_rate[i],train_dataloader,validation_dataloader)
      print("MODEL: {}".format(inner_cv_count+1),
            "| Train accuracy: {:7.5f}".format(tr_accuracy),
            "| Train loss: {:7.5f}".format(tr_loss),
            "| Validation accuracy: {:7.5f}".format(vd_accuracy),
            "| Validation loss: {:7.5f}".format(vd_loss))

      train_accuracy.append(tr_accuracy)
      train_loss.append(tr_loss)
      validation_accuracy.append(vd_accuracy)
      validation_loss.append(vd_loss)
      inner_cv_count+=1

      torch.cuda.empty_cache()

    # Average over however many inner folds actually ran (not a hard-coded 3).
    mean_train_accuracy.append(sum(train_accuracy)/len(train_accuracy))
    mean_train_loss.append(sum(train_loss)/len(train_loss))
    mean_validation_accuracy.append(sum(validation_accuracy)/len(validation_accuracy))
    mean_validation_loss.append(sum(validation_loss)/len(validation_loss))

    print("P: "+str(learning_rate[i])+
              " | Mean train accuracy: {:7.5f}".format(mean_train_accuracy[i])+
              " | Mean train loss: {:7.5f}".format(mean_train_loss[i]) +
              " | Mean validation accuracy: {:7.5f}".format(mean_validation_accuracy[i])+
              " | Mean validation loss: {:7.5f}".format(mean_validation_loss[i]))
    print("--------------------------------------------------------------------------------------------------")

  # Keep the learning rate with the highest mean validation accuracy, then
  # retrain on the whole training+validation part and score on the test fold.
  best_learning_rate.append(learning_rate[mean_validation_accuracy.index(max(mean_validation_accuracy))])
  tr_accuracy,tr_loss,ts_accuracy,ts_loss,ts_precision,ts_recall,ts_f1=train_and_test_model(best_learning_rate[outer_cv_count],train_val_dataloader,test_dataloader)

  train_val_accuracy.append(tr_accuracy)
  train_val_loss.append(tr_loss)
  test_accuracy.append(ts_accuracy)
  test_loss.append(ts_loss)
  test_precision.append(ts_precision)
  test_recall.append(ts_recall)
  test_f1.append(ts_f1)

  print("Learning Rate: {}".format(best_learning_rate[outer_cv_count])+
         " | Train accuracy: {:7.5f}".format(train_val_accuracy[outer_cv_count])+
         " | Train loss: {:7.5f}".format(train_val_loss[outer_cv_count]) +
         " | Test accuracy: {:7.5f}".format(test_accuracy[outer_cv_count])+
         " | Test loss: {:7.5f}".format(test_loss[outer_cv_count])+
         " | Test precision: {:7.5f}".format(test_precision[outer_cv_count])+
         " | Test recall: {:7.5f}".format(test_recall[outer_cv_count])+
         " | Test f1 score: {:7.5f}".format(test_f1[outer_cv_count]))
  print('----------------------------------------------------------------------------------------------------')

  torch.cuda.empty_cache()

In [None]:
# Summarise every metric over the outer folds with torch.std_mean, which is
# applied to the per-fold values collected by the nested cross-validation loop.
fold_metrics = (
    ("Train accuracy:", train_val_accuracy),
    ("Train loss:", train_val_loss),
    ("Test accuracy:", test_accuracy),
    ("Test loss:", test_loss),
    ("Test precision:", test_precision),
    ("Test recall:", test_recall),
    ("Test f1:", test_f1),
)
for metric_label, fold_values in fold_metrics:
    print(metric_label, torch.std_mean(torch.Tensor(fold_values)))

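## Result

Output of the nested cross-validation run: one line per outer fold, followed by summary statistics. Each summary tuple is `(standard deviation, mean)`, the order returned by `torch.std_mean`.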
```
Learning Rate: 0.001  | Train accuracy: 0.82925 | Train loss: 0.38794 | Test accuracy: 0.81739 | Test loss: 0.36239 | Test precision: 0.82979 | Test recall: 0.62400 | Test f1 score: 0.71233
Learning Rate: 0.001  | Train accuracy: 0.82281 | Train loss: 0.39299 | Test accuracy: 0.85217 | Test loss: 0.35149 | Test precision: 0.82474 | Test recall: 0.70175 | Test f1 score: 0.75829
Learning Rate: 0.0005 | Train accuracy: 0.80735 | Train loss: 0.44210 | Test accuracy: 0.85217 | Test loss: 0.36510 | Test precision: 0.84615 | Test recall: 0.67544 | Test f1 score: 0.75122
Learning Rate: 0.001  | Train accuracy: 0.82700 | Train loss: 0.38920 | Test accuracy: 0.80290 | Test loss: 0.40414 | Test precision: 0.88043 | Test recall: 0.58696 | Test f1 score: 0.70435
Learning Rate: 0.0005 | Train accuracy: 0.82152 | Train loss: 0.42620 | Test accuracy: 0.83188 | Test loss: 0.38299 | Test precision: 0.79817 | Test recall: 0.70732 | Test f1 score: 0.75000
Learning Rate: 0.001  | Train accuracy: 0.82893 | Train loss: 0.39580 | Test accuracy: 0.82609 | Test loss: 0.38001 | Test precision: 0.85227 | Test recall: 0.61475 | Test f1 score: 0.71429
Learning Rate: 0.001  | Train accuracy: 0.82023 | Train loss: 0.39942 | Test accuracy: 0.84638 | Test loss: 0.37946 | Test precision: 0.85149 | Test recall: 0.69355 | Test f1 score: 0.76444
Learning Rate: 0.001  | Train accuracy: 0.83537 | Train loss: 0.39628 | Test accuracy: 0.79420 | Test loss: 0.40989 | Test precision: 0.82653 | Test recall: 0.60000 | Test f1 score: 0.69528
Learning Rate: 0.0005 | Train accuracy: 0.81830 | Train loss: 0.42008 | Test accuracy: 0.81739 | Test loss: 0.39898 | Test precision: 0.82292 | Test recall: 0.63200 | Test f1 score: 0.71493
Learning Rate: 0.001  | Train accuracy: 0.81707 | Train loss: 0.39818 | Test accuracy: 0.84884 | Test loss: 0.34167 | Test precision: 0.81250 | Test recall: 0.69643 | Test f1 score: 0.75000
```
Train accuracy: (tensor(0.0079), tensor(0.8228))

Train loss: (tensor(0.0182), tensor(0.4048))

Test accuracy: (tensor(0.0209), tensor(0.8289))

Test loss: (tensor(0.0226), tensor(0.3776))

Test precision: (tensor(0.0235), tensor(0.8345))

Test recall: (tensor(0.0463), tensor(0.6532))

Test f1: (tensor(0.0255), tensor(0.7315))