In [1]:
import numpy as np
import chromadb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score,roc_auc_score


In [None]:
DB_PATH = "/opt/airflow/data/chroma_db"
client = chromadb.PersistentClient(path=DB_PATH)

train_col = client.get_collection("avis_train")
test_col  = client.get_collection("avis_test")

train_data = train_col.get(include=["metadatas", "embeddings"])
test_data  = test_col.get(include=["metadatas", "embeddings"])

label_map = {"negative": 0, "neutral": 1, "positive": 2}

In [None]:
import numpy as np
import chromadb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix



X_train = torch.tensor(np.array(train_data["embeddings"]), dtype=torch.float32)
y_train = torch.tensor([label_map[m["label"]] for m in train_data["metadatas"]], dtype=torch.long)

X_test = torch.tensor(np.array(test_data["embeddings"]), dtype=torch.float32)
y_test = torch.tensor([label_map[m["label"]] for m in test_data["metadatas"]], dtype=torch.long)


class AeroMultiScaleCNN(nn.Module):
    def __init__(self):
        super(AeroMultiScaleCNN, self).__init__()
        self.unflatten = nn.Unflatten(1, (1, 384))
        
        self.conv1 = nn.Conv1d(1, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(1, 64, kernel_size=5, padding=2)
        
        self.pool = nn.MaxPool1d(2)
        self.dropout = nn.Dropout(0.4)
        
        
        self.fc = nn.Sequential(
            nn.Linear(128 * 192, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 3)
        )

    def forward(self, x):
        x = self.unflatten(x)
        branch1 = self.pool(F.relu(self.conv1(x)))
        branch2 = self.pool(F.relu(self.conv2(x)))
        
        combined = torch.cat((branch1, branch2), dim=1)
        combined = combined.view(combined.size(0), -1)
        
        return self.fc(self.dropout(combined))


model = AeroMultiScaleCNN()

weights = torch.tensor([0.6, 2.0, 2.2], dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

print(f"Entraînement lancé sur {len(X_train)} avis...")

for epoch in range(140):
    model.train()
    optimizer.zero_grad()
    
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 20 == 0:
        print(f"Époque [{epoch+1}/180] | Perte: {loss.item():.4f}")


model.eval()
with torch.no_grad():
    y_pred_raw = model(X_test)
    _, y_pred = torch.max(y_pred_raw, 1)

print("\n" + "="*50)
print("             RAPPORT FINAL AEROSTREAM")
print("="*50)
print(classification_report(y_test.numpy(), y_pred.numpy(), 
                            target_names=['Négatif', 'Neutre', 'Positif']))

print("\nMATRICE DE CONFUSION :")
print(confusion_matrix(y_test.numpy(), y_pred.numpy()))

# Sauvegarde pour l'API REST
torch.save(model.state_dict(), "/opt/airflow/models/best_cnn_aerostream.pt")
print("\nModèle sauvegardé : best_cnn_aerostream.pt")

Entraînement lancé sur 11712 avis...


RuntimeError: unflatten: Provided sizes [1, 384] don't multiply up to the size of dim 1 (1024) in the input tensor

In [None]:
from sklearn.neural_network import MLPClassifier




model = MLPClassifier(hidden_layer_sizes=(128,64), max_iter=500, random_state=42)

model.fit(train_data, y_train)


y_pred_train=model.predict(train_data)
y_pred_test=model.predict(test_data)

print("Model Accuracy:\n")
print(f"accuracy_score train:\n{accuracy_score(y_train,y_pred_train)}\n")
print(f"accuracy_score test:\n{accuracy_score(y_test,y_pred_test)}\n")
print("="*50)
print(f"Gab:{accuracy_score(y_train,y_pred_train)-accuracy_score(y_test,y_pred_test)}")
print("="*50)
print("Confusion Matrix:\n")
print(f"confusion_matrix train : \n{confusion_matrix(y_train,y_pred_train)}")
print("="*50)
print(f"confusion_matrix test :\n{confusion_matrix(y_test,y_pred_test)}")
print("="*50)
print("="*50)
print("classification report:\n")
print(f"classification_report train \n:{classification_report(y_train,y_pred_train)}")
print("="*50)
print(f"classification_report test \n:{classification_report(y_test,y_pred_test)}")
print("\n")


Model Accuracy:

accuracy_score train:
0.9956454918032787

accuracy_score test:
0.7807377049180327

Gab:0.21490778688524592
Confusion Matrix:

confusion_matrix train : 
[[7331   10    2]
 [   8 2449   22]
 [   2    7 1881]]
confusion_matrix test :
[[1609  149   77]
 [ 179  353   88]
 [  77   72  324]]
classification report:

classification_report train 
:              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      7343
     neutral       0.99      0.99      0.99      2479
    positive       0.99      1.00      0.99      1890

    accuracy                           1.00     11712
   macro avg       0.99      0.99      0.99     11712
weighted avg       1.00      1.00      1.00     11712

classification_report test 
:              precision    recall  f1-score   support

    negative       0.86      0.88      0.87      1835
     neutral       0.61      0.57      0.59       620
    positive       0.66      0.68      0.67       473

    accuracy   

In [1]:
! pip install faker


Collecting faker
  Downloading faker-39.0.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-39.0.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: faker
Successfully installed faker-39.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
