In [1]:
import re
from  tqdm import tqdm

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup, AdamW, AutoTokenizer
from sklearn.metrics import accuracy_score 


from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

In [2]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device("cpu")
if has_cuda:
    print('Thera are  %d GPU(s) available.' % torch.cuda.device_count())
else:
    print('No GPU available, using the CPU instead.')

Thera are  1 GPU(s) available.


In [3]:

# Checkpoint to fine-tune. "microsoft/deberta-v2-xlarge" was an earlier
# candidate and is kept here only as a reference.
model_name = "microsoft/deberta-v3-base"

# Sequence classifier with a 3-way head; attention maps and hidden states are
# not returned by the forward pass.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
model = model.to(device)

# Matching tokenizer; every example is later padded/truncated to `max_length`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 64
tokenizer.model_max_length = max_length

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load the labelled training table.
train_csv_path = '../input/feedback-prize-effectiveness/train.csv'
df = pd.read_csv(train_csv_path)

In [5]:
def normalise(text):
    """Return ``text`` exactly as received.

    This is a hook for text clean-up. All clean-up steps are currently
    switched off, so the function is an identity mapping.
    """
    # Disabled steps, kept for reference:
    #     text = text.lower()
    #     text = text.strip()
    #     text = re.sub("\n", " ", text)
    return text

In [6]:
def _read_train_essay(essay_id):
    """Return the full text of the training essay stored as ``<essay_id>.txt``."""
    # The context manager closes the handle as soon as the text is read,
    # instead of leaving one open file per row for the garbage collector.
    with open(f'../input/feedback-prize-effectiveness/train/{essay_id}.txt') as essay_file:
        return essay_file.read()


# Read every distinct essay once, then broadcast the text to all rows that
# share the same essay_id.
essay_text_by_id = {
    essay_id: _read_train_essay(essay_id) for essay_id in df["essay_id"].unique()
}
df["essay_text"] = df["essay_id"].map(essay_text_by_id)

df['discourse_type'] = df['discourse_type'].apply(normalise)
df['discourse_text'] = df['discourse_text'].apply(normalise)
df['essay_text'] = df['essay_text'].apply(normalise)

# Model input: "<discourse type><sep><discourse text><sep><whole essay>".
df['text_features'] =  df['discourse_type'] + tokenizer.sep_token + df['discourse_text'] + tokenizer.sep_token + df['essay_text']
# Drop the columns that have been folded into `text_features` (plus the ids).
df.drop(['discourse_id', 'essay_id', 'essay_text', 'discourse_text', 'discourse_type'], axis=1, inplace=True )

In [7]:
# Integer class id for each effectiveness rating; the same ids index the
# columns of the model's logits.
classes_to_labels = dict(Adequate=1, Effective=2, Ineffective=0)

effectiveness_names = df['discourse_effectiveness']
df['discourse_effectiveness'] = effectiveness_names.map(classes_to_labels)


In [8]:
# Per-class loss weights. A frequency-based weighting
# (1 - share of the class in the training table) was tried and is switched
# off; every class currently carries the same weight.
w_adequate = w_effective = w_ineffective = 1

In [9]:
# Preview the first rows: the mapped integer label and the combined text feature.
df.head()

Unnamed: 0,discourse_effectiveness,text_features
0,1,"Lead[SEP]Hi, i'm Isaac, i'm going to be writin..."
1,1,"Position[SEP]On my perspective, I think that t..."
2,1,Claim[SEP]I think that the face is a natural l...
3,1,"Evidence[SEP]If life was on Mars, we would kno..."
4,1,Counterclaim[SEP]People thought that the face ...


In [10]:
# Pull the model inputs and targets out of the frame as plain arrays.
sentences = df['text_features'].values
labels = df['discourse_effectiveness'].values

In [11]:
# The text and label arrays were extracted in the previous cell, so the
# training DataFrame itself can be released.
del df

In [12]:
# def Token_example( list_of_texts = None, index = 0):
#     if list_of_texts is None:
#         list_of_texts=['It is he' + str(tokenizer.sep_token) +'He is labrador. His name is Zeus ' ]
#     print('Original:', list_of_texts[index])
#     print()
#     print('With Token:', tokenizer.tokenize(list_of_texts[index]))
#     print()
#     print('With Token IDs:', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(list_of_texts[index])))
#     print()
# Token_example(sentences)

In [13]:


# Tokenise every training example to exactly `max_length` token ids.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the 'longest_first' truncation that the
# tokenizer otherwise applies implicitly (with a warning) when `max_length`
# is given.
input_ids = []
attention_masks = []
for sent in tqdm(sentences):
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-example tensors along the batch dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
y = torch.tensor(labels)

  0%|          | 0/36765 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 36765/36765 [01:39<00:00, 368.95it/s]


In [14]:
# Diagnostic only: list the devices TensorFlow can see. Nothing else in this
# notebook uses TensorFlow.
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
local_devices

2022-08-06 20:48:22.067956: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-06 20:48:22.075734: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 20:48:22.076876: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 20:48:22.077549: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 8022217707440271652,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14983888896
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 862300317275464188
 physical_device_desc: "device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0"]

In [15]:
# Hold out 5% of the labelled data for validation.
dataset = TensorDataset(input_ids, attention_masks, y)
size_train = 0.95
train_size = int(size_train * len(dataset))
test_size = len(dataset) - train_size

# The global seeds are only set in a later cell, so without an explicit
# generator this split would differ on every run. A dedicated, seeded
# generator makes the train/validation membership reproducible.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)


In [16]:
batch_size = 40

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in a fixed, sequential order.
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)


In [17]:
# AdamW over all model parameters (the whole network is fine-tuned).
learning_rate = 2e-5
adam_epsilon = 1e-8
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)



In [18]:
epochs = 2

# The training loop steps the scheduler once per batch, so the schedule has to
# span every batch of every epoch. No warm-up phase is used.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [19]:
import random
import numpy as np


# Seed every random number generator used by the notebook with the same value:
# Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
seed_val = 42

for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed_val)

In [20]:


# class_weights = torch.tensor(
#     [w_adequate, w_effective, w_ineffective]
# ).to(device)

# loss_fct = nn.CrossEntropyLoss(weight=class_weights)

In [32]:
# Fine-tune the classifier for `epochs` passes over `train_dataloader` and,
# after each pass, measure loss and accuracy on `test_dataloader`.
# One dict per epoch is appended to `training_stats`.
#
# NOTE(review): the optimizer and LR scheduler are created in earlier cells and
# keep their state between runs. Re-running this cell on its own presumably
# continues from the already-decayed learning rate of the linear schedule, so
# re-create the optimizer and scheduler first — TODO confirm this is what made
# both epochs report identical validation metrics.
training_stats = []

for epoch_i in range(1, epochs + 1):
    # ---------------------------- training ----------------------------
    total_train_loss = 0
    model.train()

    for batch in tqdm(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing `labels` makes the model return the loss alongside the logits.
        res = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)
        loss = res['loss']

        total_train_loss += loss.item()

        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    print(f'avg_train_loss:{avg_train_loss}')

    # --------------------------- validation ---------------------------
    model.eval()

    total_eval_loss = 0
    # Per-batch results are collected in lists and concatenated once at the
    # end, rather than re-concatenating a growing tensor on every batch.
    batch_logits = []
    batch_label_ids = []
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            res = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        total_eval_loss += res['loss'].item()

        batch_logits.append(res['logits'].detach().to('cpu'))
        batch_label_ids.append(b_labels.to('cpu'))

    list_of_logits = torch.cat(batch_logits)
    list_of_label_ids = torch.cat(batch_label_ids)

    # Predicted class = index of the largest logit in each row.
    pred = list_of_logits.argmax(dim=1)

    acc = accuracy_score(list_of_label_ids, pred)
    # Mean of the per-batch losses.
    avg_val_loss = total_eval_loss / len(test_dataloader)

    training_stats.append({
        'epoch': epoch_i,
        'training_loss': avg_train_loss,
        'valid.loss': avg_val_loss,
        'valid.accuracy': acc,})
    print(training_stats[-1])
    

100%|██████████| 874/874 [05:12<00:00,  2.80it/s]


avg_train_loss:0.5912910996162646
{'epoch': 1, 'training_loss': 0.5912910996162646, 'valid.loss': 0.6629241614238076, 'valid.accuracy': 0.7052746057640021}


100%|██████████| 874/874 [05:12<00:00,  2.80it/s]


avg_train_loss:0.5921527829950407
{'epoch': 2, 'training_loss': 0.5921527829950407, 'valid.loss': 0.6629241614238076, 'valid.accuracy': 0.7052746057640021}


In [33]:
# `save_pretrained` writes a directory of files under each of these names
# (despite the ".h5" suffix).
# NOTE(review): the names say "large" while `model_name` is
# "microsoft/deberta-v3-base" — confirm which one is intended.
model_dir = "model_Deberta_V3_large_lower_weigth.h5"
tokenizer_dir = "BertDebrta_V3_large_lower_weigth.h5"

model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

('BertDebrta_V3_large_lower_weigth.h5/tokenizer_config.json',
 'BertDebrta_V3_large_lower_weigth.h5/special_tokens_map.json',
 'BertDebrta_V3_large_lower_weigth.h5/spm.model',
 'BertDebrta_V3_large_lower_weigth.h5/added_tokens.json',
 'BertDebrta_V3_large_lower_weigth.h5/tokenizer.json')

In [23]:
# Load the unlabelled test table.
test_csv_path = '/kaggle/input/feedback-prize-effectiveness/test.csv'
df = pd.read_csv(test_csv_path)

In [24]:
def _read_test_essay(essay_id):
    """Return the full text of the test essay stored as ``<essay_id>.txt``."""
    # The context manager closes the handle as soon as the text is read,
    # instead of leaving one open file per row for the garbage collector.
    with open(f'../input/feedback-prize-effectiveness/test/{essay_id}.txt') as essay_file:
        return essay_file.read()


# Read every distinct essay once, then broadcast the text to all rows that
# share the same essay_id.
test_essay_text_by_id = {
    essay_id: _read_test_essay(essay_id) for essay_id in df["essay_id"].unique()
}
df["essay_text"] = df["essay_id"].map(test_essay_text_by_id)

df['discourse_type'] = df['discourse_type'].apply(normalise)
df['discourse_text'] = df['discourse_text'].apply(normalise)
df['essay_text'] = df['essay_text'].apply(normalise)

# Same input layout as for training:
# "<discourse type><sep><discourse text><sep><whole essay>".
df['text_features'] =  df['discourse_type'] + tokenizer.sep_token + df['discourse_text'] + tokenizer.sep_token + df['essay_text']
# Drop the columns that have been folded into `text_features` (plus the ids).
df.drop(['discourse_id', 'essay_id', 'essay_text', 'discourse_text', 'discourse_type'], axis=1, inplace=True )

In [25]:
# Keep only the text array for inference, then release the test DataFrame.
sentences = df['text_features'].values
del df

In [26]:


# Tokenise every test example to exactly `max_length` token ids.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the 'longest_first' truncation that the
# tokenizer otherwise applies implicitly (with a warning) when `max_length`
# is given.
input_ids = []
attention_masks = []
for sent in tqdm(sentences):
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-example tensors along the batch dimension.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

100%|██████████| 10/10 [00:00<00:00, 282.03it/s]


In [27]:
# Inference dataset: token ids and attention masks only (no label tensor).
dataset = TensorDataset(input_ids, attention_masks)

In [28]:
batch_size = 36

# Sequential order, so row i of the predictions corresponds to row i of the
# tokenised test inputs.
prediction_sampler = SequentialSampler(dataset)
prediction_dataloader = DataLoader(dataset, sampler=prediction_sampler, batch_size=batch_size)

In [29]:
# Run the fine-tuned model over the test batches and collect class
# probabilities (softmax over the logits) as one CPU tensor, `predictions`,
# with one row per test example and one column per class id.
model.eval()
m = nn.Softmax(dim=1)

# Per-batch results are gathered in a list and concatenated once. This avoids
# the `predictions == None` comparison on a tensor and the repeated
# re-concatenation of a growing tensor.
batch_probabilities = []
for batch in prediction_dataloader:
    b_inputs_ids, b_input_mask = batch
    b_inputs_ids = b_inputs_ids.to(device)
    b_input_mask = b_input_mask.to(device)
    with torch.no_grad():
        outputs = model(b_inputs_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

        outputs = m(outputs.logits)
    batch_probabilities.append(outputs.to('cpu'))

if batch_probabilities:
    predictions = torch.cat(batch_probabilities)
else:
    # No test rows: return an empty (0, num_classes) tensor instead of failing.
    predictions = torch.empty((0, model.config.num_labels))

In [30]:
# Display the class probabilities; the columns follow the class ids
# (0 = Ineffective, 1 = Adequate, 2 = Effective).
predictions

tensor([[0.0108, 0.3510, 0.6382],
        [0.0143, 0.5378, 0.4479],
        [0.0044, 0.2698, 0.7258],
        [0.0074, 0.3230, 0.6696],
        [0.0123, 0.4910, 0.4967],
        [0.0030, 0.1448, 0.8522],
        [0.0069, 0.2467, 0.7465],
        [0.0083, 0.4240, 0.5676],
        [0.0031, 0.1533, 0.8436],
        [0.0113, 0.3947, 0.5939]])

In [31]:
# Build the submission file: one probability column per class, keyed by the
# discourse ids of the sample submission.
# NOTE(review): this assumes sample_submission.csv lists the rows in the same
# order as test.csv — confirm, since the test ids were dropped earlier.
submit = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/sample_submission.csv')

# Convert explicitly to a NumPy array so the columns are plain float arrays,
# however the installed pandas version treats torch tensors.
probabilities = predictions.detach().cpu().numpy()

# Column index = class id used for training
# (Ineffective -> 0, Adequate -> 1, Effective -> 2).
submission = pd.DataFrame({
    'discourse_id': submit['discourse_id'],
    'Adequate': probabilities[:, 1],
    'Effective': probabilities[:, 2],
    'Ineffective': probabilities[:, 0],
})
submission.to_csv("/kaggle/working/submission.csv",index = False)