## Imports

In [1]:
import sys
sys.path.append('../input/torchcontrib/contrib-master/')
import torchcontrib

In [2]:
import os
import torch
import wandb
import pandas as pd
import numpy as np

from kaggle_secrets import UserSecretsClient
from transformers import AdamW, AutoTokenizer,AutoConfig,AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from commonlit_nn_kit import get_scheduler, train_and_evaluate, Saver, RobertaLastHiddenStateRegressor,RobertaPoolerOutputRegressor,\
                            RobertaMaskedLastHiddenStateRegressor,RobertaMaskFilledAttentionHeadRegressor,\
                            RobertaMaskAddedAttentionHeadRegressor,RobertaLastHiddenStateMeanPooler
from commonlit_nn_kit import compute_rmse_loss, compute_rmse_score, UnoStacker, get_optimizer_parameters,compute_mse_loss
from commonlit_nn_kit import clear_cuda, seed_everything, forward_pass_uno_text_batch,create_uno_text_dataloader,train_and_evaluate_swa

from torch.utils.data import WeightedRandomSampler,DataLoader,Dataset


In [3]:
class bcolors:
    """ANSI escape sequences used to colour / style text printed by this notebook.

    Prepend one of these constants to a string before printing it. Nothing in
    the class resets the style automatically: output stays styled until
    ``ENDC`` is printed.
    """
    # Foreground colours (bright variants, codes 91-96).
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Reset every colour / style attribute back to the terminal default.
    ENDC = '\033[0m'
    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

## Configs 

In [1]:
# Master switch: when True there is no real hold-out fold, so validation runs
# rarely, only the final model is saved and the "best validation" checkpoints
# are disabled. When False the flags flip the other way round.
train_on_full_data = False

config = {
    # --- run mode / data -------------------------------------------------
    'train_on_sample': False,
    'num_epochs': 3,
    'batch_size': 16,
    'random_state': 1000,

    # --- backbone ----------------------------------------------------------
    'tokenizer_name': '../input/clrp-roberta-base/clrp_roberta_base',
    'base_model': '../input/clrp-roberta-base/clrp_roberta_base',

    # --- optimisation ------------------------------------------------------
    'lr': 4e-5,
    'group_mode': 'be_wd',
    'scheduler_type': 'cosine_schedule_with_warmup',
    'max_length': 248,
    'accumulation_steps': 1,

    # The spelling of this key mirrors the `validate_every_n_iteraion` keyword
    # of train_and_evaluate_swa, so it is kept as is.
    'validate_every_n_iteraion': 1000 if train_on_full_data else 10,

    # --- checkpointing / misc ----------------------------------------------
    'should_save_final_model': train_on_full_data,
    'pretrained_on_external_data': False,
    'multiplicative_factor': 0.97,
    'swa_freq': 10,
    'swa': True,
    'external_data': 'sample_wiki_abstracts',
    'train_on_full_data': train_on_full_data,
    'should_save_best_valid_loss_model': not train_on_full_data,
    'should_save_best_valid_score_model': not train_on_full_data,

    # One full 5-fold experiment is run per entry of this list.
    'model_class': ["RobertaLastHiddenStateRegressor", "RobertaLastHiddenStateMeanPooler"],
}

# Scale the learning rate linearly with the number of gradient-accumulation steps.
if config['accumulation_steps'] > 1:
    config['lr'] *= config['accumulation_steps']

# Tiny settings for a quick smoke run on a sample of the data.
if config['train_on_sample']:
    config.update({
        'sample_size': 25,
        'valid_size': 15,
        'test_size': 7,
        'batch_size': 4,
        'validate_every_n_iteraion': 1,
        'scorer_cv': 2,
    })



# Load Data

In [6]:
# Training table with a pre-assigned `fold` column; the cells below also read
# its `excerpt`, `target`, `standard_error` and `url_legal` columns.
data = pd.read_csv('../input/commonlit-splits/commonlittrain_stratified_simple.csv')



In [7]:
def get_fold_stats(data, n_folds=5):
    """Summarise target / error / length statistics for the data and every CV split.

    The split convention matches the training loop of this notebook: for fold
    ``k`` the rows with ``fold == k`` form the validation part and all other
    rows form the training part.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``target``, ``standard_error``, ``excerpt``,
        ``url_legal`` and ``fold``.
    n_folds : int, default 5
        Folds ``0 .. n_folds - 1`` are reported.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'whole data'`` followed by one row per fold
        (``'fold0'``, ``'fold1'``, ...). The un-prefixed columns describe the
        whole data / the training part; the ``'valid ...'`` columns describe
        the validation part and are NaN in the ``'whole data'`` row.
    """

    def _describe(frame, prefix=''):
        # Whitespace-separated token count per excerpt, computed once per frame.
        word_counts = frame['excerpt'].str.split().str.len()
        return {
            f'{prefix}target mean': frame['target'].mean(),
            f'{prefix}target std': frame['target'].std(),
            f'{prefix}error mean': frame['standard_error'].mean(),
            f'{prefix}error std': frame['standard_error'].std(),
            f'{prefix}mean count words': word_counts.mean(),
            f'{prefix}std count words': word_counts.std(),
            # Despite the column name this is the number of rows whose
            # `url_legal` is missing; the name is kept for compatibility.
            f'{prefix}url legal': frame['url_legal'].isna().sum(),
        }

    labels = ['whole data']
    records = [_describe(data)]

    for fold in range(n_folds):
        is_valid = data['fold'] == fold
        labels.append(f'fold{fold}')
        records.append({
            **_describe(data[~is_valid]),
            **_describe(data[is_valid], prefix='valid '),
        })

    # Build the table in one go: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic anyway.
    return pd.DataFrame(records, index=labels)
        
        
        

    

In [8]:
# Show the summary-statistics table for the whole dataset and for each fold.
get_fold_stats(data)

# Start the training party🎉 

In [9]:
%%capture
# Helper from commonlit_nn_kit; presumably frees cached GPU memory left over
# from earlier runs — confirm in the kit.
clear_cuda()

# Train on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
from collections import defaultdict

In [12]:
# NOTE(review): SEED is not referenced by any other cell visible here — the
# training loop seeds each fold with config['random_state'] + fold instead.
# Confirm whether it is still needed.
SEED = 1000

In [2]:

# Explicit name -> class registry for the architectures that can be selected
# through config['model_class']. A lookup table is used instead of eval() so
# that a typo (or an arbitrary expression) in the config fails with a clear
# error rather than being executed.
MODEL_CLASSES = {
    'RobertaLastHiddenStateRegressor': RobertaLastHiddenStateRegressor,
    'RobertaPoolerOutputRegressor': RobertaPoolerOutputRegressor,
    'RobertaMaskedLastHiddenStateRegressor': RobertaMaskedLastHiddenStateRegressor,
    'RobertaMaskFilledAttentionHeadRegressor': RobertaMaskFilledAttentionHeadRegressor,
    'RobertaMaskAddedAttentionHeadRegressor': RobertaMaskAddedAttentionHeadRegressor,
    'RobertaLastHiddenStateMeanPooler': RobertaLastHiddenStateMeanPooler,
}

# Fail before any training starts if the config names an unknown architecture.
unknown_models = [name for name in config['model_class'] if name not in MODEL_CLASSES]
if unknown_models:
    raise ValueError(f"Unknown model class name(s) in config['model_class']: {unknown_models}")

# The tokenizer depends on neither the fold nor the model head: load it once.
tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'])

# One complete 5-fold run per architecture; `exp` prefixes the checkpoint names.
for exp, model_class in enumerate(config['model_class']):

    # Best validation score / loss of every fold for the current architecture.
    valids = defaultdict(list)

    for fold in range(5):

        # A different but reproducible seed for every fold.
        seed_everything(seed=config['random_state'] + fold)

        model = MODEL_CLASSES[model_class](config['base_model'], head_hidden_dim=512, dropout_prob=0.0,
                                           roberta_hidden_dropout_prob=0.0, layer_norm_eps=1e-7)
        model.to(device)

        parameters = get_optimizer_parameters(group_mode=config['group_mode'], lr=config['lr'], model=model,
                                              multiplicative_factor=config['multiplicative_factor'],
                                              weight_decay=0.01, train_pooler=False)

        # AdamW wrapped in torchcontrib's SWA optimizer.
        optimizer = AdamW(params=parameters)
        optimizer = torchcontrib.optim.SWA(optimizer)

        if config['train_on_full_data']:
            # Shuffle into a local frame: rebinding the global `data` here made
            # every fold shuffle the previous fold's shuffle and left the
            # notebook-level frame permuted after the cell had run.
            shuffled = data.sample(frac=1, random_state=config['random_state'] + fold)
            # All rows but the last two are used for training; the two leftover
            # rows only exist to feed the validation dataloader. The index is
            # reset to stay consistent with the fold branch below.
            train_data = shuffled[:-2].reset_index(drop=True)
            valid_data = shuffled[-2:].reset_index(drop=True)
        else:
            print(bcolors.OKBLUE + f"starting fold -> {fold}")
            # Rows of the current fold are held out for validation; rows whose
            # target or standard_error is exactly 0 are dropped from both parts.
            train_data = data[data['fold'] != fold]
            train_data = train_data[(train_data['standard_error'] != 0) & (train_data['target'] != 0)].reset_index(drop=True)
            valid_data = data[data['fold'] == fold]
            valid_data = valid_data[(valid_data['standard_error'] != 0) & (valid_data['target'] != 0)].reset_index(drop=True)

        # Switch the console colour for everything printed from here on.
        print(bcolors.OKCYAN)
        valid_loss_saver = Saver(metric_name='rmse_loss', is_lower_better=True, config=config,
                                 save_name=f'{exp}-rmse_loss_fold_{fold}',
                                 should_save=config['should_save_best_valid_loss_model'])
        valid_score_saver = Saver(metric_name='rmse_score', is_lower_better=True, config=config,
                                  save_name=f'{exp}-rmse_score_fold_{fold}',
                                  should_save=config['should_save_best_valid_score_model'])
        final_model_saver = Saver(metric_name='final_model', is_lower_better=True, config=config,
                                  save_name=f'{exp}-full_data_fold_{fold}',
                                  should_save=config['should_save_final_model'])

        train_dataloader = create_uno_text_dataloader(data=train_data, batch_size=config['batch_size'],
                                                      shuffle=True, sampler=None, apply_preprocessing=False)
        valid_dataloader = create_uno_text_dataloader(data=valid_data, batch_size=config['batch_size'],
                                                      shuffle=False, sampler=None, apply_preprocessing=False)

        # NOTE(review): this counts one scheduler step per batch. If
        # train_and_evaluate_swa steps the scheduler only once every
        # `accumulation_steps` batches, the total is overestimated — confirm.
        scheduler = get_scheduler(scheduler_type=config['scheduler_type'], optimizer=optimizer,
                                  num_warmup_steps=50,
                                  num_training_steps=config['num_epochs'] * len(train_dataloader))

        print(f'Number of batches in the train data: {len(train_dataloader)}')
        print(f'Number of batches in the valid data: {len(valid_dataloader)}')

        # Trains with an MSE loss, validates with RMSE loss / score; the result
        # is indexed below by 'best_score' and 'best_loss'.
        fold_valid = train_and_evaluate_swa(num_epochs=config['num_epochs'], train_dataloader=train_dataloader,
                                            valid_dataloader=valid_dataloader, tokenizer=tokenizer,
                                            model=model, optimizer=optimizer, scheduler=scheduler,
                                            forward_pass_fn_train=forward_pass_uno_text_batch,
                                            forward_pass_fn_valid=forward_pass_uno_text_batch,
                                            compute_loss_fn_train=compute_mse_loss,
                                            compute_loss_fn_valid=compute_rmse_loss,
                                            compute_metric_fn=compute_rmse_score, stacker_class=UnoStacker,
                                            max_length=config['max_length'],
                                            accumulation_steps=config['accumulation_steps'],
                                            validate_every_n_iteraion=config['validate_every_n_iteraion'],
                                            valid_loss_saver=valid_loss_saver,
                                            valid_score_saver=valid_score_saver,
                                            final_model_saver=final_model_saver,
                                            swa_freq=config['swa_freq'], device=device)

        print(bcolors.OKBLUE + f"best valid score = {fold_valid['best_score']}\n")
        print(bcolors.OKBLUE + f"best valid loss = {fold_valid['best_loss']}\n\n")

        valids['best_score'].append(fold_valid['best_score'])
        valids['best_loss'].append(fold_valid['best_loss'])

        clear_cuda()

    # Cross-validation summary for the architecture that just finished.
    print("Mean Valid Score", np.mean(valids['best_score']))
    print("Mean Valid Loss", np.mean(valids['best_loss']))