In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TextClassificationPipeline

In [2]:
import torch
from torch.utils.data import Dataset   

In [3]:
# Hub identifier used for both the tokenizer and the sequence-classification model.
checkpoint_name = "lincoln/flaubert-mlsum-topic-classification"

loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
loaded_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

In [7]:
# Smoke test: tokenize one sentence and run a forward pass with a dummy target
# so that the returned output also carries a loss.
inputs = loaded_tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([[1]])  # shape (1, 1): batch size 1, class index 1

# Bare last expression so the notebook displays the full model output.
loaded_model(labels=labels, **inputs)

SequenceClassifierOutput(loss=tensor(3.3437, grad_fn=<NllLossBackward0>), logits=tensor([[ 1.5939, -0.4985, -2.0455,  0.0836, -1.6791, -0.2638,  0.7003,  0.5694,
          1.4755,  0.2995]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Re-create the toy batch: one tokenized sentence plus a dummy class-1 target.
inputs = loaded_tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([[1]])  # shape (1, 1), i.e. batch size 1

In [None]:
# Wrap the loaded model and tokenizer in a text-classification pipeline so raw
# strings can be classified directly.
nlp = TextClassificationPipeline(
    tokenizer=loaded_tokenizer,
    model=loaded_model,
)

# Quick check on a single French sentence; the result is displayed by the notebook.
sample_sentence = "Le Bayern Munich prend la grenadine."
nlp(sample_sentence, truncation=True)



In [None]:
# Forward pass on the toy batch; keep the result for later inspection.
outputs = loaded_model(labels=labels, **inputs)

In [None]:
import os

from custom_dataloader import MyCustomDataset

# Folder holding the exported annotation files. The default is the original
# author's local checkout; set the MINING_ARGUMENT_DATA_DIR environment variable
# to run the notebook on another machine without editing this cell.
path_to_folder = os.environ.get(
    "MINING_ARGUMENT_DATA_DIR",
    "/home/sguilluy/Mining-Argument-Structures/data_preprocessing/from_prodigy/files",
)

train_dataset = MyCustomDataset(path_to_folder)

In [None]:
# Number of items exposed by the custom dataset (quick sanity check of the load).
len(train_dataset)

In [None]:
# Classify every sentence of every dataset item and collect the predicted labels.
# Assumes each dataset item is a plain string (it is split with `str.split`).
list_labels = []
for ele in train_dataset:
    # Naive sentence split on "."; drop empty / whitespace-only fragments (e.g. the
    # trailing "" produced when the text ends with a period) so they are not
    # classified and counted as if they were real sentences.
    sentences = [fragment for fragment in ele.split(".") if fragment.strip()]
    if not sentences:
        # Nothing to classify for this item; avoid calling the pipeline on [].
        continue
    # NOTE(review): truncation is disabled here, unlike the single-sentence check
    # earlier in the notebook — a fragment longer than the model's maximum input
    # length would presumably fail; confirm this is intended.
    list_results = nlp(sentences, truncation=False)
    list_labels.extend(result["label"] for result in list_results)

In [None]:
from collections import Counter

# Histogram of predicted labels: label -> number of sentences assigned to it.
label_counter = Counter(list_labels)
count_dict = dict(label_counter)

print(count_dict)

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# NOTE(review): despite its name, this variable holds a DistilBERT model loaded from
# the 'distilbert-base-uncased' checkpoint, not a FlauBERT one — confirm intent.
model_flaubert_fine_tunned = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model_flaubert_fine_tunned.to(device)
model_flaubert_fine_tunned.train()  # enable training mode before the optimisation loop

# NOTE(review): the loop below indexes every batch by 'input_ids', 'attention_mask'
# and 'labels', while the labelling cell treats dataset items as plain strings
# (`ele.split(".")`). Verify that `train_dataset` really yields tokenized dicts
# with labels before running this cell.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model_flaubert_fine_tunned.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        # Clear gradients accumulated by the previous step.
        optim.zero_grad()

        # Move the batch tensors to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_flaubert_fine_tunned(input_ids, attention_mask=attention_mask, labels=labels)

        # When labels are supplied, the first element of the output is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

# Switch the trained model to evaluation mode. The original cell called
# `model.eval()`, but no name `model` is defined in this notebook, so it raised
# NameError after training had finished.
model_flaubert_fine_tunned.eval()

# Extending the Auto classes

In [None]:
from transformers import AutoConfig, AutoModel

# Load the configuration published with the FlauBERT topic-classification checkpoint.
config_flaubert = AutoConfig.from_pretrained("lincoln/flaubert-mlsum-topic-classification")

# NOTE(review): `config_flaubert` is a config *instance*. The transformers docs
# describe `AutoConfig.register(model_type, config)` as taking a config *class*
# whose `model_type` matches the given string — confirm this call does not raise.
AutoConfig.register("config_flaubert", config_flaubert)

# NOTE(review): likewise `AutoModel.register(config_class, model_class)` is
# documented as taking classes, while both arguments here are instances. Also the
# model object was created from a DistilBERT checkpoint whereas the config comes
# from a FlauBERT one — verify what this registration is meant to achieve.
AutoModel.register(config_flaubert, model_flaubert_fine_tunned)
