In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Files
import input_net
import utils

# Packages
from os import path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# Transformers
from transformers import AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer
from transformers import BartConfig, BartForSequenceClassification, BartTokenizer
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from transformers import CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer
from transformers import DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer
from transformers import XLMConfig, XLMForSequenceClassification, XLMTokenizer
from transformers import XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from transformers import XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer
from sklearn.model_selection import train_test_split

from transformers import AdamW, get_linear_schedule_with_warmup

Using TensorFlow backend.


In [3]:
args = {
    'model_type': 'bert',
    'do_train': True,
    'do_eval': True,
    'max_seq_length': 70,
    'batch_size': 5, 
    'epochs': 1,
    'learning_rate': 4e-5,
    'num_training_steps': 1000,
    'num_warmup_steps': 100,
    'max_grad_norm': 1.0
}

In [4]:
MODEL_CLASSES = {
    'albert': (AlbertForSequenceClassification, AlbertTokenizer, 'albert-large-v2'),
    'bart': (BartForSequenceClassification, BartTokenizer, 'bart-large'),
    'bert': (BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),
    'camembert': (CamembertForSequenceClassification, CamembertTokenizer, 'camembert-base'),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, 'distilbert-base-uncased'),
    'flaubert': (FlaubertForSequenceClassification, FlaubertTokenizer, 'flaubert-base-uncased'),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, 'xlm-mlm-en-2048'),
    'xlm roberta': (XLMRobertaForSequenceClassification, XLMRobertaTokenizer, 'xlm-roberta-base'),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased')
}

model_class, tokenizer_class, pretrained_model = MODEL_CLASSES[args['model_type']]

In [5]:
TRAIN = "data/train.csv"
TEST = "data/test.csv"
INPUT_NET = 'data/input.csv'

tokenizer = tokenizer_class.from_pretrained(pretrained_model, do_lower_case=True)

if not path.exists(INPUT_NET):
    df = input_net.create_input(TRAIN, INPUT_NET, tokenizer, args['max_seq_length'])
else:
    df = pd.read_csv(INPUT_NET)

In [6]:
def batch_generator(data, target, batch_size):
    data = np.array(data)
    target = np.array(target)
    nsamples = len(data)
    print("data shape", data.shape)
    print("len target", len(target))
    print("nsamples", nsamples)
    perm = np.random.permutation(nsamples)
    for i in range(0, nsamples, batch_size):
        batch_idx = perm[i:i+batch_size]
        print(batch_idx)
        if target is not None:
            #print("data:", data.loc[batch_idx,:])
            yield data[batch_idx,:], target[batch_idx]
        #else:
        #    yield data[batch_idx], None

def training(model, train_data, train_target, batch_size, args):
    criterion = nn.CrossEntropyLoss(reduction='mean')
    optimizer = AdamW(model.parameters(), lr=args['learning_rate'], correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['num_warmup_steps'],
                                                num_training_steps=args['num_training_steps'])
    ncorrect = 0
    for X, y in batch_generator(train_data, train_target, batch_size):
        model.train()
        X_i, X_s, X_p, y = utils.ToTensor(X,y)
        
        #model.zero_grad()
        print(X_i.shape, X_s.shape, X_p.shape)
        out = model(input_ids=X_i, token_type_ids=X_s, attention_mask=X_p, labels=y)[1]
        
        loss = criterion(out, y) 
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])  # Gradient clipping 
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        out = F.softmax(out, dim=1)

        ncorrect += (torch.max(out, 1)[1] == y).sum().item()
        print("train:", loss.item())
    acc = ncorrect/len(train_data) * 100
    return acc, loss
        
#def validation(model, eval_data, batch_size):


    

In [7]:
def build(model, learn_data, batch_size, epochs, args):
    X_train, X_val, y_train, y_val = train_test_split(learn_data.iloc[:,:-1], learn_data.iloc[:,-1],                                                            test_size=0.2, random_state=42)
    train_acc = [None]*epochs
    train_loss = [None]*epochs
    val_acc = [None]*epochs
    val_los = [None]*epochs
    for epoch in range(epochs):
        acc, loss = training(model, X_train, y_train, batch_size, args)
             
        train_acc[epoch] = acc
        train_loss[epoch] = loss
        
        #acc, loss = validation(model, val_data)
        val_acc[epoch] = acc
        val_loss[epoch] = loss
        print(acc,loss)

In [8]:
model = model_class.from_pretrained(pretrained_model, num_labels=2)
print("model loaded")
build(model, df, batch_size = args['batch_size'], epochs = args['epochs'], args=args)

model loaded
data shape (323432, 210)
len target 323432
nsamples 323432
[319181 235731 110790 102605  85824]
torch.Size([5, 70]) torch.Size([5, 70]) torch.Size([5, 70])
train: 0.2968354821205139
[149969 203843 189113 206610 118151]
torch.Size([5, 70]) torch.Size([5, 70]) torch.Size([5, 70])
train: 0.9239784479141235
[110909 276938 196154 245802  98115]
torch.Size([5, 70]) torch.Size([5, 70]) torch.Size([5, 70])
train: 0.7392318248748779
[274079  96312 130126 213554 251797]
torch.Size([5, 70]) torch.Size([5, 70]) torch.Size([5, 70])
train: 0.6944327354431152
[126920 270204 244563 116988  42804]
torch.Size([5, 70]) torch.Size([5, 70]) torch.Size([5, 70])
train: 1.2916147708892822
[314139  47743 289357 189450 101149]
torch.Size([5, 70]) torch.Size([5, 70]) torch.Size([5, 70])
train: 0.9970682263374329
[  4315 250910  90441  26448 230236]
torch.Size([5, 70]) torch.Size([5, 70]) torch.Size([5, 70])
train: 0.8874604105949402
[   385  71579 266565 278637 124635]
torch.Size([5, 70]) torch.Size

RuntimeError: index out of range: Tried to access index 29168 out of table with 28995 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:418

In [9]:
#29361, 28995

In [11]:
print(len(df))

404290


In [15]:
Predictions = model(input_ids=tokens_tensor[:5],token_type_ids=segments_tensor[:5],attention_mask=attention_tensor[:5])

NameError: name 'tokens_tensor' is not defined

In [31]:
F.softmax(model(input_ids=tokens_tensor[:5],token_type_ids=segments_tensor[:5],attention_mask=attention_tensor[:5])[0], dim=1)

tensor([[0.7353, 0.2647],
        [0.7387, 0.2613],
        [0.7226, 0.2774],
        [0.6201, 0.3799],
        [0.7045, 0.2955]], grad_fn=<SoftmaxBackward>)

In [59]:
Predictions[1]

tensor([[-0.3982,  0.6718],
        [-0.4118,  0.6909],
        [-0.3936,  0.6663],
        [-0.4014,  0.6766],
        [-0.4046,  0.6803]], grad_fn=<AddmmBackward>)