In [1]:
import re
import gc
from tqdm.notebook import tqdm
import random
import time 
import pandas as pd
import numpy as np 
import psutil

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import get_cosine_schedule_with_warmup, AdamW, AutoTokenizer



from transformers import  AutoModelForSequenceClassification

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss



# seed_val = 177
# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# torch.cuda.manual_seed_all(seed_val)

In [2]:
# Pick the compute device once; every later cell moves tensors/models to `device`.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    # device=None asks for the name of the current CUDA device.
    print(torch.cuda.get_device_name(device=None))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [3]:

# Checkpoint used for both the tokenizer and the classifier.
# Alternative that was tried: "microsoft/deberta-v2-xlarge"
model_name = "microsoft/deberta-v3-base"

# Three-way sequence classifier; attention maps and hidden states are not
# returned. Note that the training cell instantiates a fresh model per fold.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Every encoded example is padded/truncated to this many tokens.
max_length = 128
tokenizer.model_max_length = max_length

In [4]:
# Training annotations: one row per discourse element.
df = pd.read_csv(filepath_or_buffer='../input/feedback-prize-effectiveness/train.csv')
# For quick experiments, subsample to ~10% of the rows:
# df = df.sample(df.shape[0] // 10)

In [5]:
def normalise(text):
    """Return `text` with every newline character replaced by a single space.

    Case and surrounding whitespace are left untouched.
    """
    # Splitting on "\n" and re-joining with " " is the same substitution.
    pieces = re.split("\n", text)
    return " ".join(pieces)

In [6]:
# Read every essay exactly once (several discourse rows share one essay) and
# close each file handle deterministically via the context manager.
essay_texts = {}
for essay_id in df["essay_id"].unique():
    with open(f'../input/feedback-prize-effectiveness/train/{essay_id}.txt') as essay_file:
        essay_texts[essay_id] = essay_file.read()
df["essay_text"] = df["essay_id"].map(essay_texts)

# Flatten newlines in every text field before concatenation.
df['discourse_type'] = df['discourse_type'].apply(normalise)
df['discourse_text'] = df['discourse_text'].apply(normalise)
df['essay_text'] = df['essay_text'].apply(normalise)

# Model input: "<type> [SEP] <discourse text> [SEP] <full essay>" as a single string.
df['text_features'] = (
    df['discourse_type'] + tokenizer.sep_token
    + df['discourse_text'] + tokenizer.sep_token
    + df['essay_text']
)

# The raw columns are no longer needed once `text_features` exists.
df.drop(['discourse_id', 'essay_id', 'essay_text', 'discourse_text', 'discourse_type'], axis=1, inplace=True)

In [7]:
# Integer class ids used as training targets; the submission cell relies on
# this numbering when it picks probability columns.
classes_to_labels = dict(Adequate=1, Effective=2, Ineffective=0)

effectiveness_column = df['discourse_effectiveness']
df['discourse_effectiveness'] = effectiveness_column.map(classes_to_labels)


In [8]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

In [9]:
# Pull the model inputs and integer targets out of the frame as arrays.
labels = df["discourse_effectiveness"].values
sentences = df["text_features"].values

In [10]:
# Tokenize all training texts in one batched call.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the truncation that previously only happened
# through the tokenizer's backward-compatibility fallback.
encoded = tokenizer(
    list(sentences),
    add_special_tokens=True,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Both tensors have one row per text and `max_length` columns.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
y = torch.tensor(labels)

In [11]:
# Cross-validation configuration.
k_fold = 4
random_state = 177
# Pass the seed so the shuffled fold assignment is reproducible across runs
# (it was defined but never handed to the splitter).
Stratified_KF = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=random_state)
batch_size = 10


In [12]:
# Softmax over dim 1 turns a batch of logits into per-class probabilities.
soft_max = torch.nn.Softmax(dim=1)
# Number of training epochs.
epochs = 1



In [13]:
# def Token_example( list_of_texts = None, index = 0):
#     if list_of_texts is None:
#         list_of_texts=['It is he' + str(tokenizer.sep_token) +'He is labrador.    His name is Zeus ' ]
#     print('Original:', list_of_texts[index])
#     print()
#     print('With Token:', tokenizer.tokenize(list_of_texts[index]))
#     print()
#     print('With Token IDs:', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(list_of_texts[index])))
#     print()
# Token_example()

In [14]:

print(psutil.virtual_memory())  # Check free memory before training starts

# Stratified k-fold fine-tuning: one freshly initialised model per fold.
# Folds are the outer loop and epochs the inner loop, so each fold's model is
# trained for all `epochs` (creating the model inside an outer epoch loop would
# restart from the pretrained weights every epoch).
for fold, (train_id, val_id) in tqdm(enumerate(Stratified_KF.split(X=input_ids, y=labels)), total=k_fold):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        output_attentions=False,
        output_hidden_states=False,
    ).to(device)

    # Targets are class indices, so build integer (long) tensors;
    # torch.Tensor(...) would silently create float32 labels.
    y_train_fold = torch.as_tensor(labels[train_id], dtype=torch.long)
    y_valid_fold = torch.as_tensor(labels[val_id], dtype=torch.long)

    train_dataset = TensorDataset(input_ids[train_id], attention_masks[train_id], y_train_fold)
    valid_dataset = TensorDataset(input_ids[val_id], attention_masks[val_id], y_valid_fold)

    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        sampler=SequentialSampler(valid_dataset),
        batch_size=batch_size,
    )

    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    # The optimizer and scheduler live for one fold only, so the schedule must
    # span exactly this fold's optimizer steps for the cosine decay to complete.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    for epoch_i in range(1, epochs + 1):
        print(f'----------------------------epoch:{epoch_i}/{epochs}-------------------------')
        start_time = time.time()

        # ---- training pass ----
        total_train_loss = 0.0
        model.train()
        for batch in tqdm(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            optimizer.zero_grad()
            res = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            loss_value = res['loss']
            loss_value.backward()

#             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # .item() yields a plain float, so the running total does not keep
            # autograd history or GPU memory alive between steps.
            total_train_loss += loss_value.item()

        # ---- validation pass ----
        total_eval_loss = 0.0
        batch_probs = []
        batch_labels = []
        model.eval()
        with torch.no_grad():
            for batch in valid_dataloader:
                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                b_labels = batch[2].to(device)

                res = model(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels,
                )
                total_eval_loss += res['loss'].item()
                batch_probs.append(soft_max(res['logits']).to('cpu').numpy())
                batch_labels.append(b_labels.to('cpu').numpy())

        # Concatenate once instead of growing arrays with np.append per batch.
        list_of_logits = np.concatenate(batch_probs, axis=0)
        list_of_labels = np.concatenate(batch_labels, axis=0)

        # The model returns a per-batch loss, so average over the number of
        # batches rather than the number of samples.
        avg_train_loss = total_train_loss / len(train_dataloader)
        avg_val_loss = total_eval_loss / len(valid_dataloader)
        # Explicit label set keeps log_loss well-defined even if a validation
        # fold happens to miss a class.
        log_loss_val = log_loss(list_of_labels, list_of_logits, labels=[0, 1, 2])
        memory = psutil.virtual_memory().percent

        f_time = time.time() - start_time
        print(f'epoch: {epoch_i}, path: {fold+1}/{k_fold}, used_memory:{memory}%, time:{f_time:.2f}s, log_loss:{log_loss_val:.5f} '+
              f'loss_train:{avg_train_loss:.5f}, loss_valid:{avg_val_loss:.5f}')

    # Persist this fold's fine-tuned weights, then drop the references so the
    # next fold starts with the memory released.
    model.save_pretrained(f"model_Deberta_base128k{fold}.h5")
    del model, optimizer, scheduler
    gc.collect()

In [15]:
# Store the tokenizer files next to the saved fold checkpoints.
tokenizer.save_pretrained(save_directory="token_model_Deberta_base128.h5")

In [16]:
# From here on `df` holds the test-set rows instead of the training rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/feedback-prize-effectiveness/test.csv')

In [17]:
# Read every test essay exactly once (several discourse rows share one essay)
# and close each file handle deterministically via the context manager.
essay_texts = {}
for essay_id in df["essay_id"].unique():
    with open(f'../input/feedback-prize-effectiveness/test/{essay_id}.txt') as essay_file:
        essay_texts[essay_id] = essay_file.read()
df["essay_text"] = df["essay_id"].map(essay_texts)

# Flatten newlines in every text field before concatenation.
df['discourse_type'] = df['discourse_type'].apply(normalise)
df['discourse_text'] = df['discourse_text'].apply(normalise)
df['essay_text'] = df['essay_text'].apply(normalise)

# Same input layout as for training:
# "<type> [SEP] <discourse text> [SEP] <full essay>" as a single string.
df['text_features'] = (
    df['discourse_type'] + tokenizer.sep_token
    + df['discourse_text'] + tokenizer.sep_token
    + df['essay_text']
)

# The raw columns are no longer needed once `text_features` exists.
df.drop(['discourse_id', 'essay_id', 'essay_text', 'discourse_text', 'discourse_type'], axis=1, inplace=True)

In [18]:
# Keep only the text array for inference; the frame itself is released.
sentences = df["text_features"].values
del df

In [19]:


# Tokenize all test texts in one batched call.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the truncation that previously only happened
# through the tokenizer's backward-compatibility fallback.
encoded = tokenizer(
    list(sentences),
    add_special_tokens=True,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Both tensors have one row per text and `max_length` columns.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

In [20]:
# Pair each test example's token ids with its attention mask (no labels at inference time).
dataset = TensorDataset(input_ids, attention_masks)

In [21]:
# Inference batch size (overrides the value used during training).
batch_size = 32

# Sequential sampling keeps predictions in the same order as the dataset rows.
prediction_sampler = SequentialSampler(dataset)
prediction_dataloader = DataLoader(dataset, batch_size=batch_size, sampler=prediction_sampler)

In [22]:
# The training cell deletes `model` after saving each fold, so nothing is left
# in memory to call here. Reload the saved fold checkpoints one at a time and
# average their class probabilities into a single (n_samples, 3) CPU tensor.
m = nn.Softmax(dim=1)
predictions = None
for fold in range(k_fold):
    model = AutoModelForSequenceClassification.from_pretrained(f"model_Deberta_base128k{fold}.h5").to(device)
    model.eval()

    fold_outputs = []
    with torch.no_grad():
        for b_inputs_ids, b_input_mask in prediction_dataloader:
            outputs = model(
                b_inputs_ids.to(device),
                token_type_ids=None,
                attention_mask=b_input_mask.to(device),
            )
            # Move each batch to the CPU right away so GPU memory stays flat.
            fold_outputs.append(m(outputs.logits).to('cpu'))
    fold_predictions = torch.cat(fold_outputs)

    # `is None` is an identity test; `== None` on a tensor is a comparison.
    if predictions is None:
        predictions = fold_predictions
    else:
        predictions = predictions + fold_predictions

    # Release this fold's model before loading the next one.
    del model
    gc.collect()

predictions = predictions / k_fold

In [None]:
# Display the predicted class probabilities (columns follow the label ids 0, 1, 2).
predictions

In [None]:
submit = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/sample_submission.csv')

# Convert to a NumPy array explicitly so the columns are plain float arrays no
# matter how the installed pandas version treats torch tensors.
probabilities = predictions.detach().cpu().numpy()

# Column indices follow the label encoding used for training:
# 0 = Ineffective, 1 = Adequate, 2 = Effective.
submission = pd.DataFrame({
    'discourse_id': submit['discourse_id'],
    'Adequate': probabilities[:, 1],
    'Effective': probabilities[:, 2],
    'Ineffective': probabilities[:, 0],
})
submission.to_csv("/kaggle/working/submission.csv", index=False)