In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [41]:
# Files
import input_net
import utils

# Packages
from os import path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer
from transformers import BartConfig, BartForSequenceClassification, BartTokenizer
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
from transformers import CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer
from transformers import DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer
from transformers import XLMConfig, XLMForSequenceClassification, XLMTokenizer
from transformers import XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from transformers import XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer
from sklearn.model_selection import train_test_split

# Optimizer -> busca info de com s'utilitza
from transformers import AdamW, get_linear_schedule_with_warmup

In [3]:
TRAIN = "data/train.csv"
TEST = "data/test.csv"
INPUT_NET = 'data/input.csv'

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_len = 70

if not path.exists(INPUT_NET):
    df = input_net.create_input(TRAIN, INPUT_NET, tokenizer, max_len)
else:
    df = pd.read_csv(INPUT_NET)

tokens_tensor, segments_tensor, attention_tensor, targets_tensor = utils.ToTensor(df)

404290 404290 404290


In [45]:
MODEL_CLASSES = {
    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
    'bart': (BartConfig, BartForSequenceClassification, BartTokenizer),
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'camembert': (CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
    'flaubert': (FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'xlm roberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

# Mirar aixo:: AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)

In [None]:
args = {
    'model_type': 'xlm',
    'model_name': 'bert-base-cased',
    'do_train': True,
    'do_eval': True,
    'max_seq_length': 70,
    'batch_size': 16, 

    'num_train_epochs': 4,
    'weight_decay': 0,
    'learning_rate': 4e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,
}

In [46]:
'''
tokenizer = tokenizer_class.from_pretrained(args['model_name'])

model = model_class.from_pretrained(args['model_name'])

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
'''

In [29]:
df.columns = ["I"+str(i) for i in range(70)] + ["S"+str(i) for i in range(70)] + ["P"+str(i) for i in range(70)] + ["is_duplicate"]
#df

In [30]:
df

Unnamed: 0,I0,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,P61,P62,P63,P64,P65,P66,P67,P68,P69,is_duplicate
0,101,2054,2003,1996,3357,2011,3357,5009,2000,15697,...,0,0,0,0,0,0,0,0,0,0
1,101,2054,2003,1996,2466,1997,12849,10606,16506,1006,...,0,0,0,0,0,0,0,0,0,0
2,101,2129,2064,1045,3623,1996,3177,1997,2026,4274,...,0,0,0,0,0,0,0,0,0,0
3,101,2339,2572,1045,10597,2200,9479,1029,2129,2064,...,0,0,0,0,0,0,0,0,0,0
4,101,2029,2028,21969,1999,2300,21864,2243,2135,5699,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,101,2129,2116,3145,22104,2024,2045,1999,1996,14513,...,0,0,0,0,0,0,0,0,0,0
404286,101,2079,2017,2903,2045,2003,2166,2044,2331,1029,...,0,0,0,0,0,0,0,0,0,1
404287,101,2054,2003,2028,9226,1029,102,2054,1005,1055,...,0,0,0,0,0,0,0,0,0,0
404288,101,2054,2003,1996,22480,3296,3465,1997,2542,2096,...,0,0,0,0,0,0,0,0,0,0


In [44]:
criterion = nn.CrossEntropyLoss(reduction='sum')

def batch_generator(data, target, batch_size):
    nsamples = len(data)
    perm = np.random.permutation(nsamples)

    for i in range(0, nsamples, batch_size):
        batch_idx = perm[i:i+batch_size]
        if target is not None:
            yield data[batch_idx], target[batch_idx]
        #else:
        #    yield data[batch_idx], None

def train(model, X, y, batch_size, optimizer):
    model.train()
    for batch in batch_generator(train_data, train_target, batch_size):
        X = torch.tensor(X, dtype=torch.long, device=device) #al codi original X era un numpy.darray, aqui un pd.df
        y = torch.tensor(y, dtype=torch.long, device=device)
        
        model.zero_grad() #pred = model(input_ids=tokens_tensor[i_min:i_max],token_type_ids=segments_tensor[i_min:i_max],attention_mask=attention_tensor[i_min:i_max], labels=)
        out = model(X)
        loss = criterion(out, y) #no se si aixo ha de ser diferent
        loss.backward()
        optimizer.step()
        
def validation(model, X, y, batch_size):
    model.eval()
    

In [43]:
def build(model, learn_data, optimizer, batch_size, epochs):
    X_train, X_val, y_train, y_val = train_test_split(learn_data.iloc[:,:-1], learn_data[:,-1], test_size=0.2, random_state=42)
    
    train_acc = [None]*epochs
    train_loss = [None]*epochs
    val_acc = [None]*epochs
    val_los = [None]*epochs
    for epoch in range(epochs):
        acc, loss = train(model, X_train, y_train, batch_size, optimizer)
        train_acc[epoch] = acc
        train_loss[epoch] = loss
        
        acc, loss = val(model, val_data)
        val_acc[epoch] = acc
        val_loss[epoch] = loss
        

In [45]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
#optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) #comentat pq no se on va

#hyp to optimise
batch_size = 16#,32
epochs = 4

build(model, df, optimizer, batch_size, epochs)

NameError: name 'optimizer' is not defined

In [11]:
Predictions = model(input_ids=tokens_tensor[:5],token_type_ids=segments_tensor[:5],attention_mask=attention_tensor[:5])

In [12]:
F.softmax(model(input_ids=tokens_tensor[:5],token_type_ids=segments_tensor[:5],attention_mask=attention_tensor[:5])[0], dim=1)

tensor([[0.2488, 0.7512],
        [0.2490, 0.7510],
        [0.2397, 0.7603],
        [0.2346, 0.7654],
        [0.2338, 0.7662]], grad_fn=<SoftmaxBackward>)