#### Run Main First

In [1]:
%run Main.ipynb

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score
import torch.nn.functional as F
import json
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
import gensim.downloader
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
pretrained_wv = gensim.downloader.load('glove-twitter-100')

In [3]:
from tqdm import tqdm

def document_vector(doc, wv):
  """Create document vectors by averaging word vectors."""
  words = word_tokenize(doc)
  word_vectors = np.array([wv[word] for word in words if word in wv])
  
  if len(word_vectors) == 0:
      return np.zeros(wv.vector_size)
  return np.mean(word_vectors, axis=0)
  

X = np.array([document_vector(text, pretrained_wv) for text in tqdm(X)])

100%|██████████| 170898/170898 [03:25<00:00, 832.25it/s] 


In [4]:
# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Create TensorDataset objects
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Create DataLoader objects
batch_size = 128
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [13]:
import torch.nn.init as init
class MultiLayerPerceptron2(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_classes):
        super(MultiLayerPerceptron2, self).__init__()
        self.fc1 = nn.Linear(vocab_size, hidden_dim*2)
        self.norm1 = nn.BatchNorm1d(hidden_dim * 2)
        self.dropout1 = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(hidden_dim*2, hidden_dim*2)
        self.norm2 = nn.BatchNorm1d(hidden_dim*2)
        self.dropout2 = nn.Dropout(p=0.1)
        self.fc3 = nn.Linear(hidden_dim*2, hidden_dim)
        self.dropout3 = nn.Dropout(p=0.1)
        self.fc4 = nn.Linear(hidden_dim, num_classes)
        
        # Initialisation des poids
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.kaiming_uniform_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        x = F.leaky_relu(self.fc1(x))
        x = self.norm1(x)
        x = self.dropout1(x)
        x = F.leaky_relu(self.fc2(x))
        x = self.norm2(x)
        x = self.dropout2(x)
        x = F.leaky_relu(self.fc3(x))
        x = self.dropout3(x)
        return self.fc4(x)
"""
class Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(Classifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=0.5)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        embedded = self.embedding(x.long())
        embedded = self.dropout(embedded)
        lstm_output, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        return torch.sigmoid(self.fc(hidden.squeeze(0)))
"""
num_epochs = 20
vocab_size = X_tensor.shape[1]
num_classes = len(set(y.values))

model = MultiLayerPerceptron2(vocab_size, 2, num_classes) #0.87 0.87 0.87
#model = Classifier(vocab_size, 2, vocab_size, num_classes) #
print(model)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.005)

for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        inputs = inputs.float()
        labels = labels.long()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    model.eval()
    correct = 0
    total = 0
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
        for inputs, Value in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            all_labels.extend(Value)
            all_predictions.extend(predicted)

    precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_predictions, average='weighted')
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

MultiLayerPerceptron2(
  (fc1): Linear(in_features=100, out_features=4, bias=True)
  (norm1): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=4, out_features=4, bias=True)
  (norm2): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.1, inplace=False)
  (fc3): Linear(in_features=4, out_features=2, bias=True)
  (dropout3): Dropout(p=0.1, inplace=False)
  (fc4): Linear(in_features=2, out_features=2, bias=True)
)
Epoch [1/20], Loss: 0.3013
Precision: 0.8728, Recall: 0.8430, F1 Score: 0.8479
Epoch [2/20], Loss: 0.1620
Precision: 0.8787, Recall: 0.8797, F1 Score: 0.8790
Epoch [3/20], Loss: 0.1292
Precision: 0.8760, Recall: 0.8772, F1 Score: 0.8741
Epoch [4/20], Loss: 0.2647
Precision: 0.8876, Recall: 0.8883, F1 Score: 0.8879
Epoch [5/20], Loss: 0.4104
Precision: 0.8924, Recall: 0.8920, F1 Score: 0.8922
Epoch [6/20], Loss: 0.4332
Precisi

In [20]:
# Calcul du vecteur de document et ajout d'une dimension
doc_vector = document_vector("Je suis", pretrained_wv)
doc_vector = torch.tensor(doc_vector, dtype=torch.float32).unsqueeze(0)  # Ajout d'une dimension avec unsqueeze(0)

# Passage au modèle
with torch.no_grad():
    model.eval()
    output = model(doc_vector)

# Appliquer softmax pour obtenir des probabilités
probabilities = F.softmax(output, dim=1)  # Utilisation de dim=1 car nous voulons appliquer softmax sur les classes

# Afficher les probabilités
print("Probabilités de chaque classe : Humain : {}, GPT {}", probabilities.data)

Probabilités de chaque classe : Humain : {}, GPT {} tensor([[6.0200e-42, 1.0000e+00]])
