In [None]:
# List the GPUs visible to this session.
!nvidia-smi -L

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import logging
import os
import random
import time
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from torch.autograd import Variable
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
from transformers import XLNetTokenizer, XLNetModel, XLNetConfig
from transformers import XLMRobertaTokenizer, XLMRobertaModel, AdamW, get_linear_schedule_with_warmup

warnings.filterwarnings('ignore')

logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

In [None]:
# Locations of the input data and of everything this notebook writes.
data_path = '../input/newsclass01'
output_path = '.'

# Seed every random number generator the pipeline relies on.
# torch.manual_seed alone does not seed Python's `random` module, which torchtext
# appears to use for dataset splitting and shuffling — TODO confirm against the
# installed torchtext version.
SEED = 17
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the first CUDA device; fall back to the CPU when none is available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Trade cuDNN autotuning for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(device)

In [None]:
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
xlnet_tokenizer = XLNetTokenizer(tokenizer.vocab_file)
MAX_SEQ_LEN = 512
BATCH_SIZE = 2
PAD_INDEX = xlnet_tokenizer.convert_tokens_to_ids(xlnet_tokenizer.pad_token)
UNK_INDEX = xlnet_tokenizer.convert_tokens_to_ids(xlnet_tokenizer.unk_token)
label_field = Field(sequential=False, use_vocab=False, batch_first=True)
text_field = Field(use_vocab=False, tokenize=xlnet_tokenizer.encode, include_lengths=False,
                   batch_first=True, fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = {'titletext' : ('titletext', text_field), 'label' : ('label', label_field)}


# Read preprocessed CSV into TabularDataset and split it into train, test and valid.
dataset = TabularDataset(path=f"{output_path}/prep_news.csv", format='CSV', fields=fields, skip_header=False)
train_data, valid_data, test_data = dataset.split(split_ratio=[0.70, 0.2, 0.1], stratified=True, strata_field='label')

# Create train and validation iterators.
train_iter, valid_iter = BucketIterator.splits((train_data, valid_data), batch_size=BATCH_SIZE, device=device, shuffle=True, sort_key=lambda x: len(x.titletext), sort=True, sort_within_batch=False)
full_iter = BucketIterator(dataset, batch_size=BATCH_SIZE, device=device, shuffle=True, sort_key=lambda x: len(x.titletext), sort=True, sort_within_batch=False)
# Test iterator, no shuffling or sorting required.
test_iter = Iterator(test_data, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)

In [None]:
df = pd.read_csv(f"{data_path}/train.csv")
df = df.rename(columns={"source": "label"})
df['titletext'] = df['title'] + ". " + df['text']
df['titletext'] = df['titletext'].apply(lambda x: " ".join(x.split()[:512]))
df.to_csv(f"{output_path}/prep_news.csv")

In [None]:
pd.read_csv(f"{output_path}/prep_news.csv").head(100).to_csv(f"{output_path}/news1.csv")

In [None]:
class XLNetClassifier(torch.nn.Module):
    """Randomly initialised XLNet encoder followed by a small classification head.

    The encoder output is averaged over dimension 1 (every position, padding
    included) and passed through dropout -> linear -> layer norm -> tanh ->
    dropout -> linear to produce one logit per class.

    Attribute names (``xlnet``, ``d1``, ``l1``, ``bn1``, ``d2``, ``l2``) are kept
    unchanged on purpose: checkpoints are stored with ``torch.save(model, ...)``,
    so pickled models and their state dicts depend on these names.

    Args:
        dropout_rate: dropout probability used before both linear layers.
        hidden_size: width of the intermediate layer of the head.
        num_classes: number of output logits.
    """

    def __init__(self, dropout_rate=0.3, hidden_size=64, num_classes=7):
        super().__init__()
        # NOTE(review): `output_path` is not an architecture option used anywhere in
        # this file — presumably it is just stored on the config; confirm it is needed.
        config = XLNetConfig(vocab_size=xlnet_tokenizer.vocab_size, output_path=os.path.join(output_path, "output"))
        self.xlnet = XLNetModel(config)
        self.d1 = torch.nn.Dropout(dropout_rate)
        # Take the input width from the encoder config instead of hard-coding 1024,
        # so the head stays consistent if the config's hidden size changes.
        self.l1 = torch.nn.Linear(config.d_model, hidden_size)
        # Despite the "bn" name this is a LayerNorm.
        self.bn1 = torch.nn.LayerNorm(hidden_size)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids.

        Args:
            input_ids: token ids forwarded to the XLNet encoder.
            attention_mask: mask forwarded to the XLNet encoder.

        Returns:
            The output of the final linear layer (unnormalised logits).
        """
        encoded = self.xlnet(input_ids=input_ids, attention_mask=attention_mask)
        # Mean-pool the encoder states over dimension 1.
        x = torch.mean(encoded['last_hidden_state'], 1)
        x = self.l1(self.d1(x))
        x = torch.tanh(self.bn1(x))
        x = self.l2(self.d2(x))
        return x

In [None]:
import time
def eval_step(model, source, target):
    """Run one forward pass and return the cross-entropy loss of the batch.

    Args:
        model: callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning the predictions fed to the loss.
        source: tensor of token ids; entries equal to ``PAD_INDEX`` are masked out.
        target: targets passed to ``torch.nn.CrossEntropyLoss``.

    Returns:
        The loss tensor computed by ``torch.nn.CrossEntropyLoss``.
    """
    # 1 where the token is real, 0 where it is padding.
    attention_mask = source.ne(PAD_INDEX).to(torch.uint8)
    predictions = model(input_ids=source, attention_mask=attention_mask)
    criterion = torch.nn.CrossEntropyLoss()
    return criterion(predictions, target)

def train_step(model, source, target, optimizer, scheduler=None):
    """Perform a single optimisation step on one batch.

    Args:
        model: model passed on to ``eval_step``.
        source: batch of token ids.
        target: batch of targets.
        optimizer: optimizer whose ``step`` and ``zero_grad`` are called.
        scheduler: optional scheduler; stepped once after the optimizer when given.

    Returns:
        The loss tensor of this batch, as returned by ``eval_step``.
    """
    batch_loss = eval_step(model, source, target)
    batch_loss.backward()

    optimizer.step()
    if scheduler is not None:
        scheduler.step()

    # Clear the gradients so the next step starts from zero.
    optimizer.zero_grad()
    return batch_loss

def epoch_eval(iterator):
    """Compute the mean loss of the module-level ``model`` over ``iterator``.

    The model is switched to eval mode and the pass runs under
    ``torch.no_grad()``: no parameter update happens here, so building the
    autograd graph would only cost memory and time.

    Args:
        iterator: iterable with a length, yielding ``((source, target), _)`` batches.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``.
    """
    epoch_start = time.time()
    model.eval()
    num_batches = len(iterator)
    cumulative_loss = 0.0
    with torch.no_grad():
        for index, ((source, target), _) in enumerate(iterator):
            cumulative_loss += eval_step(model, source, target).item()
            elapsed = time.time() - epoch_start
            # Elapsed seconds / projected total seconds, followed by the running mean loss.
            print(f'\rVal {index}/{num_batches} {int(elapsed)}/{int(elapsed / (index + 1) * num_batches)}s loss: {cumulative_loss / (index + 1)}', end = '')
    # Leading '\r' (as in epoch_train) so the summary overwrites the progress line
    # instead of being appended to it.
    print(f'\rVal {int(time.time() - epoch_start)}s')
    return cumulative_loss / num_batches

def epoch_train(iterator, optimizer, scheduler=None):
    """Train the module-level ``model`` for one pass over ``iterator``.

    Args:
        iterator: iterable with a length, yielding ``((source, target), _)`` batches.
        optimizer: optimizer forwarded to ``train_step``.
        scheduler: optional scheduler forwarded to ``train_step``.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``.
    """
    started_at = time.time()
    model.train()
    running_loss = 0.0
    for step, ((source, target), _) in enumerate(iterator):
        step_loss = train_step(model, source, target, optimizer, scheduler)
        running_loss += step_loss.item()
        elapsed = time.time() - started_at
        # Projected duration of the whole epoch, extrapolated from the batches done so far.
        projected_total = int(elapsed / (step + 1) * len(iterator))
        print(f'\rTrain {step}/{len(iterator)} {int(elapsed)}/{projected_total}s loss: {running_loss / (step + 1)}', end = '')
    print(f'\rTrain {int(time.time() - started_at)}s')
    return running_loss / len(iterator)

def train_fully(model, optimizer, train_iter, valid_iter, scheduler=None, num_epochs=5,
                valid_period=len(train_iter), output_path=output_path):
    """Train for ``num_epochs`` epochs, checkpointing whenever validation improves.

    Each epoch runs ``epoch_train`` on ``train_iter`` and ``epoch_eval`` on
    ``valid_iter`` (both operate on the module-level ``model``). When the
    validation loss beats the best value seen so far, ``model`` is saved to
    ``output_path`` under a timestamped file name.

    Args:
        model: model object written to disk by ``torch.save``.
        optimizer: optimizer forwarded to ``epoch_train``.
        train_iter: training batches.
        valid_iter: validation batches.
        scheduler: optional scheduler forwarded to ``epoch_train``.
        num_epochs: number of epochs to run.
        valid_period: accepted but not used by this function.
        output_path: directory prefix for the checkpoint files.
    """
    best_valid_loss = float('Inf')

    for epoch_number in range(1, num_epochs + 1):
        started_at = time.time()
        train_loss = epoch_train(train_iter, optimizer, scheduler)
        val_loss = epoch_eval(valid_iter)
        elapsed_seconds = int(time.time() - started_at)
        print('Epoch', f'{epoch_number}/{num_epochs}', 'Train Loss:', train_loss, 'Val Loss:', val_loss, 'for', f'{elapsed_seconds}s')

        if val_loss < best_valid_loss:
            best_valid_loss = val_loss
            checkpoint_file = f'/model_{int(time.time())}.pth'
            torch.save(model, output_path + checkpoint_file)

def train(model, optimizer, train_iter, valid_iter, scheduler=None, num_epochs=5,
          valid_period=len(train_iter), output_path=output_path, pretrain=False):
    """Run ``train_fully``, optionally with the encoder frozen.

    With ``pretrain=True`` the encoder parameters get ``requires_grad = False``
    for the duration of the run, so only the remaining parameters receive
    gradients, and are re-enabled afterwards — also when training is
    interrupted or fails.

    The encoder is looked up as ``model.roberta`` first and ``model.xlnet``
    second, so both a model exposing ``roberta`` and ``XLNetClassifier`` (which
    stores its encoder as ``xlnet``) are supported.

    Args:
        model: model to train.
        optimizer: optimizer forwarded to ``train_fully``.
        train_iter: training batches.
        valid_iter: validation batches.
        scheduler: optional scheduler forwarded to ``train_fully``.
        num_epochs: number of epochs.
        valid_period: forwarded to ``train_fully``.
        output_path: checkpoint directory forwarded to ``train_fully``.
        pretrain: freeze the encoder while training when true.

    Raises:
        AttributeError: if ``pretrain`` is true and the model has neither a
            ``roberta`` nor an ``xlnet`` attribute.
    """
    backbone = None
    if pretrain:
        backbone = getattr(model, 'roberta', None)
        if backbone is None:
            backbone = getattr(model, 'xlnet', None)
        if backbone is None:
            raise AttributeError(
                f"{type(model).__name__} has no 'roberta' or 'xlnet' encoder to freeze for pre-training"
            )
        for param in backbone.parameters():
            param.requires_grad = False

    try:
        train_fully(model, optimizer, train_iter, valid_iter, scheduler, num_epochs, valid_period, output_path)
    finally:
        # Always unfreeze, so an interrupted run does not leave the encoder frozen.
        if backbone is not None:
            for param in backbone.parameters():
                param.requires_grad = True

    if pretrain:
        print('Pre-training done!')
    else:
        print('Training done!')

def production_train(model, optimizer, full_iter, scheduler=None, num_epochs=5,
                     valid_period=len(train_iter), output_path=output_path, pretrain=False):
    """Train on ``full_iter`` without validation, saving a checkpoint every epoch.

    Training itself is delegated to ``epoch_train``, which operates on the
    module-level ``model``; the ``model`` argument is what gets written to disk.

    Args:
        model: model object saved after every epoch.
        optimizer: optimizer forwarded to ``epoch_train``.
        full_iter: batches to train on.
        scheduler: optional scheduler forwarded to ``epoch_train``.
        num_epochs: number of epochs to run.
        valid_period: accepted but not used by this function.
        output_path: directory prefix for the checkpoint files.
        pretrain: accepted but not used by this function.
    """
    for epoch_number in range(1, num_epochs + 1):
        started_at = time.time()
        train_loss = epoch_train(full_iter, optimizer, scheduler)
        elapsed_seconds = int(time.time() - started_at)
        print('Epoch', f'{epoch_number}/{num_epochs}', 'Train Loss:', train_loss, 'for', f'{elapsed_seconds}s')
        # Timestamped name, so every epoch leaves its own checkpoint.
        checkpoint_file = f'/model_{int(time.time())}.pth'
        torch.save(model, output_path + checkpoint_file)

In [None]:
# Number of epochs of the production run and batches per epoch over the full dataset.
NUM_EPOCHS = 4
steps_per_epoch = len(full_iter)
# Resume from a previously saved checkpoint.
# torch.load unpickles the whole model object: only load files you trust.
model = torch.load(f='../input/news-text-classification-model-1617170880/model_1617170880.pth')

In [None]:
optimizer = AdamW(model.parameters(), lr=1e-8)
# Linear warm-up over the first epoch, then linear decay until the last training step.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=steps_per_epoch*1,
    num_training_steps=steps_per_epoch*NUM_EPOCHS,
)

print("======================= Start production training ==============================")

# `pretrain=True` is accepted by production_train but not used by it.
production_train(
    model=model,
    full_iter=full_iter,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=NUM_EPOCHS,
    pretrain=True,
)

In [None]:
import gc
gc.collect()
torch.cuda.memory_allocated()

In [None]:
del model
del optimizer
del scheduler

In [None]:
# NOTE(review): `mksdwadadad` is not defined anywhere in the visible notebook, so this
# cell raises NameError when run. It looks like a deliberate "stop here" guard that
# halts Run All before the evaluation cells below — confirm, then delete it or replace
# it with an explicit guard.
del mksdwadadad

In [None]:
def evaluate(model, test_loader):
    """Predict on ``test_loader``, print classification metrics and plot a confusion matrix.

    Args:
        model: callable taking the token ids and an ``attention_mask`` keyword
            and returning one score per class.
        test_loader: iterable with a length, yielding ``((source, target), _)`` batches.

    Prints the macro F1 score and the scikit-learn classification report, then
    draws the confusion matrix as a seaborn heatmap. Returns nothing.
    """
    predicted_labels, true_labels = [], []

    model.eval()
    with torch.no_grad():
        for batch_number, ((source, target), _) in enumerate(test_loader):
            print('\rEvaluating', f'{batch_number}/{len(test_loader)}', end='')
            # 1 where the token is real, 0 where it is padding.
            attention_mask = source.ne(PAD_INDEX).to(torch.uint8)
            scores = model(source, attention_mask=attention_mask)
            # Highest-scoring class per example.
            predicted_labels += scores.argmax(dim=-1).tolist()
            true_labels += target.tolist()

    print('Classification Report:')
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    print('f1_macro:', macro_f1)
    print(classification_report(true_labels, predicted_labels))

    ax = plt.subplot()
    sns.heatmap(
        confusion_matrix(true_labels, predicted_labels),
        annot=True,
        ax=ax,
        cmap='Blues',
        fmt="d",
    )
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

In [None]:
# Evaluate the model currently bound to `model` on the test iterator.
# NOTE(review): an earlier cell runs `del model`; on a top-to-bottom run `model`
# would be undefined here — confirm the intended cell order.
evaluate(model=model, test_loader=test_iter)

In [None]:
# Load the checkpoint model_1617170880.pth from the working directory and evaluate it.
# torch.load unpickles the whole model object: only load files you trust.
model = torch.load(f='./model_1617170880.pth')
evaluate(model=model, test_loader=test_iter)

In [None]:
# Load the checkpoint model_1617165611.pth from the working directory and evaluate it.
# torch.load unpickles the whole model object: only load files you trust.
model = torch.load(f='./model_1617165611.pth')
evaluate(model=model, test_loader=test_iter)

In [None]:
def form_submission():
    """Predict labels for the prepared test set and write a submission CSV.

    Reads ``prepared_test.csv`` from ``data_path``, runs the module-level
    ``model`` over it in file order (no shuffling, no sorting) and writes a CSV
    with an ``Id`` index and a ``Predicted`` column to ``output_path``. The file
    name carries the current timestamp.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows read for the ids.
    """
    test_path = f"{data_path}/prepared_test.csv"
    test_fields = {'titletext' : ('titletext', text_field)}
    test_dataset = TabularDataset(path=test_path, format='CSV', fields=test_fields, skip_header=False)
    test_iterator = Iterator(test_dataset, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)

    # Take the ids from the same file the iterator reads, so they line up row by row
    # with the predictions. `test_df` was previously an undefined name here.
    # Assumes prepared_test.csv carries an `Id` column — TODO confirm.
    test_df = pd.read_csv(test_path)

    y_pred = []
    model.eval()
    with torch.no_grad():
        for index, (source, _) in enumerate(test_iterator):
            print('\rPredicting', f'{index}/{len(test_iterator)}', end='')
            # 1 where the token is real, 0 where it is padding.
            mask = (source != PAD_INDEX).type(torch.uint8)
            output = model(source, attention_mask=mask)
            y_pred.extend(torch.argmax(output, axis=-1).tolist())

    if len(y_pred) != len(test_df):
        raise ValueError(
            f"got {len(y_pred)} predictions for {len(test_df)} rows in {test_path}"
        )

    submission = pd.DataFrame({"Predicted": y_pred}, index=test_df["Id"]).rename_axis("Id")
    submission.to_csv(f"{output_path}/submission_{time.time()}.csv")
form_submission()