#### Run Main First

In [1]:
%run Main.ipynb

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score
import torch.nn.functional as F
import json
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
import gensim.downloader
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
pretrained_wv = gensim.downloader.load('glove-twitter-100')

In [4]:
from tqdm import tqdm

def document_vector(doc, wv):
  """Create document vectors by averaging word vectors."""
  words = word_tokenize(doc)
  word_vectors = np.array([wv[word] for word in words if word in wv])
  
  if len(word_vectors) == 0:
      return np.zeros(wv.vector_size)
  return np.mean(word_vectors, axis=0)
  

X = np.array([document_vector(text, pretrained_wv) for text in tqdm(X)])

100%|██████████| 170898/170898 [04:12<00:00, 677.12it/s]


In [7]:
# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Create TensorDataset objects
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Create DataLoader objects
batch_size = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [12]:
class MultiLayerPerceptron2(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_classes):
        super(MultiLayerPerceptron2, self).__init__()
        self.fc1 = nn.Linear(vocab_size, hidden_dim*2)
        self.dropout1 = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(hidden_dim*2, hidden_dim)
        self.dropout2 = nn.Dropout(p=0.1)
        self.fc3 = nn.Linear(hidden_dim, num_classes)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        return self.fc3(x)

num_epochs = 20
vocab_size = X_tensor.shape[1]
num_classes = len(set(y.values))

model = MultiLayerPerceptron2(vocab_size, 2, num_classes)
print(model)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.005)

for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        inputs = inputs.float()
        labels = labels.long()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    model.eval()
    correct = 0
    total = 0
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
        for inputs, Value in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            all_labels.extend(Value)
            all_predictions.extend(predicted)

    precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_predictions, average='weighted')
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

MultiLayerPerceptron2(
  (fc1): Linear(in_features=100, out_features=4, bias=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=4, out_features=2, bias=True)
  (dropout2): Dropout(p=0.1, inplace=False)
  (fc3): Linear(in_features=2, out_features=2, bias=True)
)
Epoch [1/20], Loss: 0.8531
Precision: 0.8543, Recall: 0.8557, F1 Score: 0.8507
Epoch [2/20], Loss: 0.0963
Precision: 0.8739, Recall: 0.8756, F1 Score: 0.8741
Epoch [3/20], Loss: 0.3717
Precision: 0.8799, Recall: 0.8804, F1 Score: 0.8801
Epoch [4/20], Loss: 0.1232
Precision: 0.8798, Recall: 0.8812, F1 Score: 0.8802
Epoch [5/20], Loss: 0.3332
Precision: 0.8800, Recall: 0.8816, F1 Score: 0.8800
Epoch [6/20], Loss: 0.0794
Precision: 0.8870, Recall: 0.8842, F1 Score: 0.8851
Epoch [7/20], Loss: 0.1277
Precision: 0.8864, Recall: 0.8860, F1 Score: 0.8862
Epoch [8/20], Loss: 0.1637
Precision: 0.8875, Recall: 0.8885, F1 Score: 0.8878
Epoch [9/20], Loss: 0.3319
Precision: 0.8847, Recall: 0.8860, F1 Score: 0.8841


In [49]:
with torch.no_grad():
    model.eval()
    output = model(torch.tensor(document_vector("Oui, les lapins peuvent être blancs ! En fait, il existe de nombreuses races de lapins qui présentent une variété de couleurs de pelage, y compris le blanc. Le lapin blanc est l'une des couleurs les plus courantes parmi les races de lapins domestiques. De plus, dans la nature, certaines espèces de lapins sauvages peuvent également avoir des variations de couleur qui incluent le blanc.", pretrained_wv), dtype=torch.float32))

import torch.nn.functional as F

# Appliquer softmax pour obtenir des probabilités
probabilities = F.softmax(output, dim=0)

# Afficher les probabilités
print("Probabilités de chaque classe :", probabilities)

Probabilités de chaque classe : tensor([0.3181, 0.6819])
