In [1]:
from baseline_transformer_POS_conv import *
import copy

In [2]:
# Training hyper-parameters shared by the DataLoaders and the training loop.
epochs = 100
batch_size = 16

# Report whether a CUDA GPU is visible (prints True when one is available),
# then select it, falling back to the CPU otherwise.
print(torch.cuda.is_available())
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

True


In [3]:
# Corpus locations (UD French-Sequoia, CoNLL-U format).
TRAIN_CONLLU = "UD_French-Sequoia/fr_sequoia-ud-train.conllu"
DEV_CONLLU = "UD_French-Sequoia/fr_sequoia-ud-dev.conllu"

# First pass over the training file: the (sentences, POS tags) pair returned by
# load_data_1 is only used here to derive the vocabularies and, in later cells,
# to display an example sentence.
sentences, pos_tags = load_data_1(TRAIN_CONLLU)

# Character vocabulary: every character seen in training gets an index >= 2,
# in order of first appearance; indices 0 and 1 are reserved for the padding
# and unknown-character symbols.
char_counts = Counter(
    char
    for sentence in sentences
    for word in sentence
    for char in word
)
char_to_ix = dict(zip(char_counts, range(2, 2 + len(char_counts))))
char_to_ix['<PAD>'] = 0
char_to_ix['<UNK>'] = 1

# Tag vocabulary: tags are numbered from 0 in order of first appearance, so the
# i-th key of tag_to_ix is the tag mapped to index i.
tag_counts = Counter(tag for tags in pos_tags for tag in tags)
tag_to_ix = dict(zip(tag_counts, range(len(tag_counts))))

# Length of the longest training word (in characters).
max_word_len = max(map(len, (word for sentence in sentences for word in sentence)))

# Second pass: load the train and dev splits through load_data, which also
# receives the character map and the maximum word length.
train_sentences, train_pos_tags = load_data(TRAIN_CONLLU, char_to_ix, max_word_len)
validation_sentences, validation_pos_tags = load_data(DEV_CONLLU, char_to_ix, max_word_len)

# Training set: reshuffled at every epoch.
dataset = POSDataset(train_sentences, train_pos_tags, tag_to_ix, max_word_len, char_to_ix)
data_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

# Validation set: fixed order so that successive evaluations are comparable.
validation_dataset = POSDataset(
    validation_sentences, validation_pos_tags, tag_to_ix, max_word_len, char_to_ix
)
validation_data_loader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)


In [4]:

# Model hyper-parameters
num_chars = len(char_to_ix)      # size of the character vocabulary, <PAD>/<UNK> included
char_embedding_dim = 512
num_filters = 512
kernel_size = 20
nhead = 4
nhid = 1024
nlayers = 3
tagset_size = len(tag_to_ix)     # number of distinct POS tags seen in training

# Build the model and move it to the target device right away, so that the
# optimizer created below is constructed over parameters that already live on
# that device (recommended by the torch.optim documentation).
model = POSTransformerModel(
    num_chars, char_embedding_dim, num_filters, kernel_size, nhead, nhid, nlayers, tagset_size
).to(device)

# Loss and optimizer
# Targets equal to -1 are excluded from the loss.
# NOTE(review): presumably -1 is the label padding value emitted by collate_fn — confirm.
loss_function = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.SGD(model.parameters(), lr=0.01)





In [5]:

# Early-stopping state: stop after `patience` consecutive epochs without an
# improvement of the validation accuracy.
patience = 5
best_val_accuracy = 0
epochs_no_improve = 0

# Move the model to the target device once, before training starts
# (no-op if it is already there) instead of repeating the call every epoch.
model.to(device)

# Training
for epoch in range(epochs):
    # Re-enable training mode at the start of each epoch.
    # NOTE(review): presumably evaluate_model switches the model to eval mode — confirm.
    model.train()
    total_loss = 0
    for sentence_in, targets in data_loader:
        # Move the batch to the same device as the model.
        sentence_in, targets = sentence_in.to(device), targets.to(device)
        optimizer.zero_grad()
        tag_scores = model(sentence_in)
        # Flatten scores to (N, n_tags) and targets to (N,) for the loss.
        loss = loss_function(tag_scores.view(-1, len(tag_to_ix)), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation loss and accuracy for this epoch.
    val_loss, val_accuracy = evaluate_model(model, validation_data_loader, loss_function, device, tag_to_ix)
    print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    if val_accuracy > best_val_accuracy:
        # New best: keep an independent snapshot of the model.
        best_val_accuracy = val_accuracy
        best_model = copy.deepcopy(model)
        print(f"best model accuracy: {best_val_accuracy:.4f}")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    # Report the mean training loss before the early-stopping check so that the
    # final epoch is logged as well.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(data_loader)}")

    # Early stopping when the validation accuracy has stopped improving.
    if epochs_no_improve >= patience:
        print("Arrêt précoce : L'accuracy de validation ne s'améliore plus")
        break


Epoch 1, Validation Loss: 1.0391, Validation Accuracy: 0.7038
best model accuracy: 0.7038
Epoch 1, Loss: 1.5551999041012354
Epoch 2, Validation Loss: 0.5470, Validation Accuracy: 0.8327
best model accuracy: 0.8327
Epoch 2, Loss: 0.741947934457234
Epoch 3, Validation Loss: 0.5262, Validation Accuracy: 0.8364
best model accuracy: 0.8364
Epoch 3, Loss: 0.5565359190106391
Epoch 4, Validation Loss: 0.4154, Validation Accuracy: 0.8729
best model accuracy: 0.8729
Epoch 4, Loss: 0.4598973872406142
Epoch 5, Validation Loss: 0.3919, Validation Accuracy: 0.8815
best model accuracy: 0.8815
Epoch 5, Loss: 0.39572088250092097
Epoch 6, Validation Loss: 0.3224, Validation Accuracy: 0.9000
best model accuracy: 0.9000
Epoch 6, Loss: 0.35176483988761903
Epoch 7, Validation Loss: 0.4571, Validation Accuracy: 0.8625
Epoch 7, Loss: 0.31519896537065506
Epoch 8, Validation Loss: 0.2656, Validation Accuracy: 0.9207
best model accuracy: 0.9207
Epoch 8, Loss: 0.28615368349211556
Epoch 9, Validation Loss: 0.2683,

KeyboardInterrupt: 

In [None]:
# Qualitative check and test-set evaluation of the last-epoch model.
model.eval()

# Index -> tag lookup, built once. tag_to_ix numbers tags in insertion order,
# so the i-th key is the tag mapped to index i.
ix_to_tag = list(tag_to_ix)
pad_ix, unk_ix = char_to_ix['<PAD>'], char_to_ix['<UNK>']

with torch.no_grad():
    # Encode the first training sentence as character indices; characters
    # missing from the vocabulary map to <UNK>.
    char_indices = [[char_to_ix.get(char, unk_ix) for char in word] for word in sentences[0]]
    # Truncate / right-pad every word to exactly max_word_len indices.
    char_indices = [word[:max_word_len] + [pad_ix] * (max_word_len - len(word)) for word in char_indices]

    # Convert to tensor and add the batch dimension.
    inputs = torch.tensor(char_indices, dtype=torch.long).unsqueeze(0).to(device)

    # Highest-scoring tag for each word, converted to plain Python ints.
    tag_scores = model(inputs)
    predicted_tags = [ix_to_tag[ix] for ix in tag_scores[0].argmax(dim=1).tolist()]

    print(f"Sentence: {' '.join(sentences[0])}")
    print(f"Predicted POS Tags: {predicted_tags}")
    true_tags = list(pos_tags[0])
    print(f"Vraies étiquettes POS: {true_tags}")


# Load the test split and wrap it in a fixed-order DataLoader.
test_sentences, test_pos_tags = load_data("UD_French-Sequoia/fr_sequoia-ud-test.conllu", char_to_ix, max_word_len)

test_dataset = POSDataset(test_sentences, test_pos_tags, tag_to_ix, max_word_len, char_to_ix)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Compute the test accuracy.
loss, accuracy = evaluate_model(model, test_data_loader, loss_function, device, tag_to_ix)
print(f"Test Accuracy : {accuracy:.4f}")

Sentence: gutenberg
Predicted POS Tags: ['PROPN']
Vraies étiquettes POS: ['PROPN']
Test Accuracy : 0.9167


In [None]:
# Qualitative check and test-set evaluation of the best checkpoint
# (the snapshot with the highest validation accuracy kept during training).
best_model.eval()

# Index -> tag lookup, built once. tag_to_ix numbers tags in insertion order,
# so the i-th key is the tag mapped to index i.
ix_to_tag = list(tag_to_ix)
pad_ix, unk_ix = char_to_ix['<PAD>'], char_to_ix['<UNK>']

with torch.no_grad():
    # Encode the first training sentence as character indices; characters
    # missing from the vocabulary map to <UNK>.
    char_indices = [[char_to_ix.get(char, unk_ix) for char in word] for word in sentences[0]]
    # Truncate / right-pad every word to exactly max_word_len indices.
    char_indices = [word[:max_word_len] + [pad_ix] * (max_word_len - len(word)) for word in char_indices]

    # Convert to tensor and add the batch dimension.
    inputs = torch.tensor(char_indices, dtype=torch.long).unsqueeze(0).to(device)

    # Highest-scoring tag for each word, converted to plain Python ints.
    tag_scores = best_model(inputs)
    predicted_tags = [ix_to_tag[ix] for ix in tag_scores[0].argmax(dim=1).tolist()]

    print(f"Sentence: {' '.join(sentences[0])}")
    print(f"Predicted POS Tags: {predicted_tags}")
    true_tags = list(pos_tags[0])
    print(f"Vraies étiquettes POS: {true_tags}")


# Load the test split and wrap it in a fixed-order DataLoader
# (reloaded here so this cell does not depend on the previous evaluation cell).
test_sentences, test_pos_tags = load_data("UD_French-Sequoia/fr_sequoia-ud-test.conllu", char_to_ix, max_word_len)

test_dataset = POSDataset(test_sentences, test_pos_tags, tag_to_ix, max_word_len, char_to_ix)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Compute the test accuracy.
loss, accuracy = evaluate_model(best_model, test_data_loader, loss_function, device, tag_to_ix)
print(f"Test Accuracy : {accuracy:.4f}")

Sentence: gutenberg
Predicted POS Tags: ['PROPN']
Vraies étiquettes POS: ['PROPN']
Test Accuracy : 0.9406
