In [2]:
import os
# Gather every file found under the Kaggle input directory, then print one path per line.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)


/kaggle/input/dbpedia-classes/DBPEDIA_train.csv
/kaggle/input/dbpedia-classes/DBPEDIA_test.csv
/kaggle/input/dbpedia-classes/DBPEDIA_val.csv
/kaggle/input/dbpedia-classes/DBP_wiki_data.csv


In [3]:
import os
import math

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,WeightedRandomSampler
from transformers import AdamW, XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetConfig,XLNetForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

Using TensorFlow backend.


In [4]:
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score,accuracy_score

In [5]:
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [6]:
# Load the complete DBpedia dataset from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/dbpedia-classes/DBP_wiki_data.csv")
# Disabled alternative: read the ready-made train/validation/test CSVs from Google Drive (Colab paths).
#train = pd.read_csv("/content/drive/My Drive/Colab Notebooks/dbpedia-classes/DBPEDIA_train.csv")
#validation = pd.read_csv("/content/drive/My Drive/Colab Notebooks/dbpedia-classes/DBPEDIA_val.csv")
#test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/dbpedia-classes/DBPEDIA_test.csv")

In [7]:
# Fit the encoder on the "l1" column and store the resulting integer ids in "target".
le = LabelEncoder()
data["target"] = le.fit_transform(data["l1"])

In [8]:
device

'cuda'

In [9]:
# First hold out 25% of the rows as the test set, stratified on the target.
train_and_val, test = train_test_split(
    data,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=data["target"],
)
# Then hold out 10% of the remaining rows as the validation set, again stratified.
train, val = train_test_split(
    train_and_val,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=train_and_val["target"],
)

In [10]:
# Show the size of each split, then the per-class label counts of each split.
print(len(train), len(test), len(val))
for split_label, split_frame in (("Train ", train), ("Test ", test), ("Validation ", val)):
    print(split_label, Counter(split_frame['target']))

231376 85696 25709
Train  Counter({0: 119704, 3: 43961, 4: 21026, 8: 20137, 2: 18265, 5: 5607, 7: 1686, 6: 752, 1: 238})
Test  Counter({0: 44336, 3: 16282, 4: 7787, 8: 7458, 2: 6765, 5: 2077, 7: 624, 6: 279, 1: 88})
Validation  Counter({0: 13301, 3: 4885, 4: 2336, 8: 2237, 2: 2029, 5: 623, 7: 187, 6: 84, 1: 27})


In [11]:
#config = XLNetConfig()
#config.from_pretrained = 'xlnet-large-cased'
#config.output_hidden_states = True
#config.output_attentions = False
#config.summary_type = 'mean'

In [12]:
#tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
#model = XLNetForSequenceClassification(config)

In [13]:
#model.config

In [14]:
#Input_ids = tokenizer.encode_plus("Hello, my dog is cute", add_special_tokens=True) # Batch size 1

In [15]:
#input_ids , atten_mask , token_type_ids, labels  = torch.tensor(Input_ids["input_ids"]).unsqueeze(0), torch.tensor(Input_ids["attention_mask"]).unsqueeze(0),torch.tensor(Input_ids["token_type_ids"]).unsqueeze(0),torch.tensor([1]).unsqueeze(0) 

In [16]:
#outputs = model(input_ids=input_ids,attention_mask=atten_mask,token_type_ids=token_type_ids,labels=labels)

In [17]:
#len(outputs)

In [18]:
#outputs[2][0].shape

# Model Building Starts Here

In [19]:
def CreateData(tokenizer, data, max_length=100):
    """Tokenise the rows of ``data`` and pack them into a ``TensorDataset``.

    Args:
        tokenizer: Object whose ``encode_plus(text, max_length=..., pad_to_max_length=True)``
            returns a mapping with ``"input_ids"``, ``"token_type_ids"`` and
            ``"attention_mask"`` entries.
        data: DataFrame with a ``"text"`` column and a ``"target"`` column.
        max_length: Length handed to the tokenizer; defaults to 100, the value
            that was previously hard-coded.

    Returns:
        ``TensorDataset(input_ids, attention_mask, token_type_ids, labels)`` of
        ``torch.long`` tensors, where each label is wrapped in its own
        single-element row.
    """
    # Pull each column out once. Indexing with ``data.iloc[i][...]`` builds a
    # whole row Series for every lookup, which is very slow on large frames.
    texts = data["text"].tolist()
    targets = data["target"].tolist()

    inp_ids = []
    tok_type_ids = []
    atten_mask = []
    for text in texts:
        # NOTE(review): stacking into tensors below assumes the tokenizer pads
        # and truncates every sequence to the same length - confirm for the
        # installed transformers version.
        encoded = tokenizer.encode_plus(text, max_length=max_length, pad_to_max_length=True)
        inp_ids.append(encoded["input_ids"])
        tok_type_ids.append(encoded["token_type_ids"])
        atten_mask.append(encoded["attention_mask"])

    # Each label is kept as a one-element list, matching the ``squeeze(1)``
    # applied to labels by the evaluation code in this notebook.
    labels = [[target] for target in targets]

    input_ids = torch.tensor(inp_ids, dtype=torch.long)
    attention_mask = torch.tensor(atten_mask, dtype=torch.long)
    token_type_ids = torch.tensor(tok_type_ids, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)

    return TensorDataset(input_ids, attention_mask, token_type_ids, labels)

In [20]:
def make_weight(data):
    """Return a copy of ``data`` with a per-row sampling weight column.

    The weight of a row is ``1.0 / n``, where ``n`` is the number of rows that
    share the row's ``"target"`` value. The class counts are printed.

    Args:
        data: DataFrame with a ``"target"`` column.

    Returns:
        A new DataFrame with the same rows plus a ``"Weight"`` column. The
        DataFrame passed in is left unmodified.
    """
    counter = Counter(data["target"].values)
    print(counter)
    # Work on a copy so the caller's frame does not silently gain a column.
    weighted = data.copy()
    weighted["Weight"] = 1.0 / weighted["target"].apply(lambda label: counter[label])
    return weighted

In [0]:
#data_v = make_weight(train)

## Model Building with Dev Set

In [51]:
def train_engine(train_data,val_data,model,batch_sz,lr,epochs,device):
    """Train ``model`` on ``train_data`` and evaluate it on ``val_data`` after each epoch.

    Training batches are drawn with a ``WeightedRandomSampler`` driven by the
    ``"Weight"`` column that ``make_weight`` adds. After every epoch the
    validation loss and accuracy are computed and a classification report is
    printed; whenever the validation loss improves, a checkpoint is written to
    ``/kaggle/saved_modelv1.pth``.

    Args:
        train_data: DataFrame with ``"text"`` and ``"target"`` columns.
        val_data: DataFrame with the same columns, used for validation.
        model: Module called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` keyword arguments and returning
            ``(loss, logits)``. It is not moved to ``device`` here.
        batch_sz: Training batch size (validation always uses 32).
        lr: Learning rate passed to ``AdamW``.
        epochs: Number of passes over the training loader.
        device: Device every batch tensor is moved to.

    Returns:
        The model as it is after the last epoch. The best checkpoint, judged
        by validation loss, is the one saved to disk.

    Note:
        Class names for the report come from the notebook-level ``le`` encoder.
    """
    def to_inputs(batch):
        # Tensor order follows the TensorDataset built by CreateData.
        return {
            "input_ids": batch[0].to(device),
            "attention_mask": batch[1].to(device),
            "token_type_ids": batch[2].to(device),
            "labels": batch[3].to(device),
        }

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    train_data = make_weight(train_data)
    train_dataset = CreateData(tokenizer,train_data)
    val_dataset = CreateData(tokenizer,val_data)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=1e-1, correct_bias=False)

    sample_weights = train_data["Weight"].values
    weight_sampler = WeightedRandomSampler(weights=sample_weights,num_samples=len(sample_weights),replacement=True)
    train_loader = DataLoader(train_dataset,batch_size=batch_sz,sampler=weight_sampler)
    # The validation loader and class names do not change between epochs, so build them once.
    val_loader = DataLoader(val_dataset,batch_size=32)
    target_name = list(le.classes_)
    # Explicit label ids keep the report working even if a class is absent
    # from the validation predictions and ground truth.
    class_ids = list(range(len(target_name)))

    train_loss = []
    val_loss = []
    best_loss = math.inf
    for epoch in range(epochs):
        epoch_train_loss = 0
        model.train()
        for batch_id,batch in enumerate(train_loader):
            if batch_id % 7000 == 0:
                print(epoch,batch_id)
            optimizer.zero_grad()
            # Only the loss is needed while training; the logits are ignored.
            loss , _ = model(**to_inputs(batch))
            epoch_train_loss = epoch_train_loss + loss.item()
            loss.backward()
            optimizer.step()

        epoch_train_loss = epoch_train_loss / (1.0 * len(train_loader))
        train_loss.append(epoch_train_loss)

        model.eval()
        y_true_val = []
        y_pred_val = []
        epoch_val_loss = 0.0
        epoch_val_acc = 0.0
        with torch.no_grad():
            for batch in val_loader:
                inputs = to_inputs(batch)
                loss , logits = model(**inputs)
                epoch_val_loss = epoch_val_loss + loss.item()
                _ , out_preds = torch.max(logits,axis=1)
                # Labels are stored one per row in a single column; drop that column for comparison.
                true_labels = inputs['labels'].squeeze(1)
                epoch_val_acc = epoch_val_acc + torch.eq(out_preds,true_labels).sum().item()
                y_pred_val.extend(out_preds.detach().cpu().numpy().tolist())
                y_true_val.extend(true_labels.detach().cpu().numpy().tolist())

        epoch_val_loss = epoch_val_loss / (len(val_loader)*1.0)
        epoch_val_acc = epoch_val_acc / len(val_data)
        val_loss.append(epoch_val_loss)
        # Checkpoint only when the validation loss improves on the best seen so far.
        if best_loss > epoch_val_loss :
            best_loss = epoch_val_loss
            torch.save({
                    'model_state_dict':model.state_dict(),
                    'optimizer_state_dict':optimizer.state_dict(),
                    'loss':best_loss,},'/kaggle/saved_modelv1.pth')

        print("*****************************************************************")
        print("Validation Report")
        print("*****************************************************************")
        print(classification_report(y_true_val,y_pred_val,labels=class_ids,target_names=target_name))
        print("*****************************************************************")

        print(epoch,train_loss[-1],val_loss[-1],epoch_val_acc,f1_score(y_true_val,y_pred_val,average='weighted'),accuracy_score(y_true_val,y_pred_val))

    return model

## Tester Code

In [52]:
def test_engine(model,test_data):
    """Evaluate ``model`` on ``test_data`` and collect its predictions.

    Batches of 32 are moved to the notebook-level ``device`` and passed through
    the model with gradients disabled.

    Args:
        model: Module called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` keyword arguments and returning
            ``(loss, logits)``.
        test_data: DataFrame with ``"text"`` and ``"target"`` columns.

    Returns:
        Tuple ``(preds_all, true_all, test_loss, test_acc)``: predicted class
        ids, true class ids, the mean loss per batch, and the fraction of rows
        predicted correctly.
    """
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    test_dataset = CreateData(tokenizer,test_data)
    test_loader = DataLoader(test_dataset,batch_size=32)

    preds_all = []
    true_all = []
    test_loss = 0.0
    test_acc = 0.0
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            inputs = {"input_ids": batch[0].to(device), "attention_mask": batch[1].to(device),  "token_type_ids": batch[2].to(device),"labels": batch[3].to(device)}
            loss , logits = model(input_ids=inputs['input_ids'],attention_mask=inputs['attention_mask'],token_type_ids=inputs['token_type_ids'],labels=inputs['labels'])
            test_loss = test_loss + loss.item()
            _ , out_preds = torch.max(logits,axis=1)
            # Labels are stored one per row in a single column; drop that column for comparison.
            true_labels = inputs['labels'].squeeze(1)
            test_acc =  test_acc + torch.eq(out_preds,true_labels).sum().item()
            preds_all.extend(out_preds.detach().cpu().numpy().tolist())
            true_all.extend(true_labels.detach().cpu().numpy().tolist())

    test_acc = test_acc / (len(test_data)*1.0)
    # Average over batches. The previous code stored this in a misspelled
    # variable, so the summed loss was returned instead of the mean.
    test_loss = test_loss / (1.0*len(test_loader))
    return preds_all,true_all,test_loss,test_acc

### Check Sequence Lengths to Choose the Maximum Sequence Length

In [0]:
'''
seq_len = []
for i in range(len(train)):
  text = train.iloc[i]["text"]
  temp = tokenizer.encode_plus(text)
  seq_len.append(len(temp["input_ids"]))

print(np.mean(seq_len))
'''

## Define the Model

In [30]:
class XLNetClassifier(torch.nn.Module):
    """Wrapper module around a pretrained ``xlnet-base-cased`` sequence classifier."""

    def __init__(self, labels):
        """Load the pretrained encoder, passing ``labels`` through as ``_num_labels``."""
        super().__init__()
        encoder_options = dict(
            max_length=100,
            output_hidden_states=True,
            summary_type="mean",
            _num_labels=labels,
        )
        self.xlnet_encoder = XLNetForSequenceClassification.from_pretrained(
            'xlnet-base-cased', **encoder_options
        )

    def forward(self, input_ids, token_type_ids, attention_mask, labels):
        """Run the encoder and return the first two outputs as ``(loss, logits)``."""
        loss, logits = self.xlnet_encoder(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            labels=labels,
        )[:2]
        return (loss, logits)

## Starter Code

In [54]:
def main():
    """Train the classifier, reload the best checkpoint and report test metrics.

    Uses the notebook-level ``train``, ``val`` and ``test`` frames, the ``le``
    label encoder and ``device``. After training, the checkpoint written by
    ``train_engine`` is loaded into a fresh model, which is evaluated on the
    test split. The test accuracy and classification report are printed, and
    the true and predicted labels are pickled to ``Test_True.p`` and
    ``Test_Preds.p``.
    """
    BATCH_SIZE = 16
    LR = 2e-5
    EPOCHS = 1
    num_labels = len(Counter(train.target.values))
    model = XLNetClassifier(num_labels)
    model = train_engine(train,val,model.to(device),BATCH_SIZE,LR,EPOCHS,device)

    # Evaluate the checkpoint with the lowest validation loss, not the last-epoch weights.
    model_best = XLNetClassifier(num_labels)
    # map_location lets the checkpoint load on whichever device is active now.
    checkpoint = torch.load("/kaggle/saved_modelv1.pth", map_location=device)
    model_best.load_state_dict(checkpoint["model_state_dict"])
    model_best.to(device)

    preds_all,true_all,test_loss,test_acc = test_engine(model_best,test)
    target_name = list(le.classes_)
    print(test_acc)
    print(classification_report(true_all,preds_all,target_names=target_name))

    # Context managers make sure both files are flushed and closed.
    with open("Test_True.p",'wb') as true_file:
        pickle.dump(true_all,true_file)
    with open("Test_Preds.p",'wb') as preds_file:
        pickle.dump(preds_all,preds_file)

In [55]:
torch.cuda.empty_cache()


In [56]:
main()

0.9913181478715459
                precision    recall  f1-score   support

         Agent       1.00      0.99      0.99     44336
        Device       0.96      1.00      0.98        88
         Event       0.98      1.00      0.99      6765
         Place       0.99      0.99      0.99     16282
       Species       1.00      1.00      1.00      7787
  SportsSeason       0.96      1.00      0.98      2077
TopicalConcept       0.96      0.97      0.96       279
    UnitOfWork       1.00      1.00      1.00       624
          Work       0.99      0.99      0.99      7458

      accuracy                           0.99     85696
     macro avg       0.98      0.99      0.99     85696
  weighted avg       0.99      0.99      0.99     85696



In [0]:
# Some Basic Tests
z = torch.tensor([[1,4],[3,7],[2,5]])
z1 = torch.tensor([1,1,0])
z = z.to(device)
_ , preds = torch.max(z,axis=1)
print(z)
print(preds)
print(preds.detach().cpu())
print(preds)
torch.eq(preds,z1.to(device)).sum().item()

In [0]:

1 2.218397746202519 1.7787151649263537 0.11122064440159676 2.2414355885923207 0.2325870646766169 0.05242849427672362

In [0]:
# Scratch cell: build the classifier outside main() for manual inspection.
num_labels = len(Counter(train.target.values))

# NOTE(review): this config object is never passed to the model, since
# XLNetClassifier loads its own configuration, and assigning a string to
# `from_pretrained` only sets an attribute on this instance.
config = XLNetConfig()
config.from_pretrained = 'xlnet-base-cased'
config.output_hidden_states = True
config.output_attentions = False
config.summary_type = 'mean'
#config.n_layer = 12
#config.n_head = 8
config._num_labels = num_labels

# XLNetClassifier requires the number of labels; calling it with no arguments raises TypeError.
model = XLNetClassifier(num_labels)
model.xlnet_encoder.config.output_hidden_states = True
model.xlnet_encoder.config.summary_type = "mean"

In [0]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [0]:
# Encode one sample sentence, then give every tensor a leading batch dimension of size 1.
Input_ids = tokenizer.encode_plus("Hello, my dog is cute", add_special_tokens=True)
input_ids = torch.tensor(Input_ids["input_ids"]).unsqueeze(0)
atten_mask = torch.tensor(Input_ids["attention_mask"]).unsqueeze(0)
token_type_ids = torch.tensor(Input_ids["token_type_ids"]).unsqueeze(0)
labels = torch.tensor([1]).unsqueeze(0)

In [0]:
outputs = model(input_ids=input_ids,attention_mask=atten_mask,token_type_ids=token_type_ids,labels=labels)

In [0]:
# Print every module yielded by the wrapped encoder's modules() iterator.
for z in model.xlnet_encoder.modules():
  print(z)

In [41]:
t1= torch.tensor([0,1,3,2])
t2 = torch.tensor([1,0,3,2])

In [42]:
t2 = t2.view(4,-1)

In [45]:
t2.shape

torch.Size([4, 1])

In [47]:
torch.eq(t1,t2.squeeze(1))

tensor([False, False,  True,  True])

In [0]:
# Scratch cell: tokenise the validation split and wrap it in a loader with batches of 32.
val_dataset = CreateData(tokenizer,val)
val_loader = DataLoader(val_dataset,batch_size=32)

In [0]:
(torch.tensor([0,1,1,0,1],dtype=torch.long).sum()/5.0).item()