In [3]:
import torch.optim as optim
import gensim
import nltk
from nltk.tokenize import word_tokenize
from modules.preprocess import *
from modules.utils import build_dataset, text_to_word2vec, euclid_dis, contrastive_loss, calculate_accuracy, train_epoch, eval_model
from modules.dataloader import PairedWord2VecDataset
from modules.model import BaseNet1D, SiameseNetwork
import gensim.downloader as api
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
import random
from modules.rnn_model import BaseNetRNN, SiameseRNN
from modules.transformer_model import BaseNetTransformer, SiameseTransformer

In [6]:
base_net = BaseNetTransformer(embedding_dim=300, hidden_dim=64, num_layers=1, out_features=32)
siamese_model = SiameseTransformer(base_net)

model_path = "best_model.pth"
checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
state_dict = {key.replace("module.", ""): value for key, value in checkpoint.items()}
siamese_model.load_state_dict(state_dict)



<All keys matched successfully>

In [9]:
base_net = siamese_model.base_network

In [14]:
data1 = torch.rand(1,300,10000)

In [4]:
dataset = build_dataset(path="data", num_samples=500, rnd_state=10)

In [5]:
dataset = text_edit(dataset, grp_num=False, rm_newline=True, rm_punctuation=True, lowercase=True, lemmatize=False, html_=True, expand=True)

In [6]:
X = [x['text'] for x in dataset.values() if x['section_1'] in ['actualites', 'sports']]
Y = [x['section_label'] for x in dataset.values() if x['section_1'] in ['actualites', 'sports']]

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

In [8]:
model_name = 'fasttext-wiki-news-subwords-300'
word2vec_model = api.load(model_name)
text = "Ceci est un texte exemple"
vector = text_to_word2vec(text, word2vec_model)

In [15]:
train_dataset = PairedWord2VecDataset(X_train, Y_train, text_to_word2vec, word2vec_model, 50)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

test_dataset = PairedWord2VecDataset(X_test, Y_test, text_to_word2vec, word2vec_model, 25)
test_loader = DataLoader(test_dataset, batch_size=4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = optim.RMSprop(siamese_model.parameters(), lr=0.01)

In [16]:
epochs = 10
best_accuracy = 0
for epoch in range(epochs):
    train_loss = train_epoch(siamese_model, train_loader, optimizer, device)
    val_accuracy = eval_model(siamese_model, train_loader, device)
    print(f"Epoch {epoch}, Train Loss: {train_loss}, Validation Accuracy: {val_accuracy}")
    
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(siamese_model.state_dict(), 'best_model.pth')
        print("Model saved as best model")

Epoch 0, Train Loss: 23.029746296314094, Validation Accuracy: 0.52
Model saved as best model
Epoch 1, Train Loss: 0.3612966663562335, Validation Accuracy: 0.52
Epoch 2, Train Loss: 0.27961852573431456, Validation Accuracy: 0.52
Epoch 3, Train Loss: 0.3258175185093513, Validation Accuracy: 0.52


KeyboardInterrupt: 