<a href="https://colab.research.google.com/github/TruongNVMM/Machine-Learning/blob/main/Email_Classification_Vector_Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write under that path.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
!pip install -qq faiss-cpu
!pip install -qq transformers
!pip install -qq tqdm

In [None]:
import json
from datetime import datetime

import faiss
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the dataset CSV from the mounted Drive folder and preview its first rows.
data_path = "/content/drive/MyDrive/Data Science Projects/Machine Learning Projects/Email_classification/2cls_spam_text_cls.csv"
data = pd.read_csv(filepath_or_buffer=data_path)
data.head()

In [None]:
# Pull the text column and the category column out of the frame as plain Python lists.
messages, labels = (
    data[column].values.tolist() for column in ("Message", "Category")
)

In [None]:
# Load the tokenizer and encoder weights for the embedding model.
Model_Name = "intfloat/multilingual-e5-base"
tokenizer = AutoTokenizer.from_pretrained(Model_Name)
model = AutoModel.from_pretrained(Model_Name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# The model is only used for inference here, so put it in evaluation mode.
model.eval()

def average_pool(last_hidden_states, attention_mask):
  """Mean-pool token vectors over dim 1, ignoring masked-out positions.

  Positions where `attention_mask` is 0 are zeroed before summing, and the sum
  is divided by the number of unmasked positions in each row.

  Args:
    last_hidden_states: Tensor of token vectors (assumed (batch, seq, hidden)
      -- TODO confirm against the encoder in use).
    attention_mask: Tensor of 0/1 flags (assumed (batch, seq)).

  Returns:
    One pooled vector per row of the batch.
  """
  keep = attention_mask.unsqueeze(-1).bool()
  visible_states = last_hidden_states.masked_fill(~keep, 0.0)
  token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
  return visible_states.sum(dim=1) / token_counts

In [None]:
def get_embeddings(texts, model, tokenizer, device, batch_size=32):
  """Embed `texts` in batches and return the vectors as one NumPy array.

  Each text is prefixed with "passage: ", tokenized (truncated to 512 tokens,
  padded within the batch), run through `model` without gradient tracking,
  mean-pooled with `average_pool` and L2-normalised row by row.

  Args:
    texts: Sliceable sequence of texts to embed.
    model: Encoder whose output exposes `last_hidden_state`.
    tokenizer: Tokenizer matching `model`.
    device: Device the tokenized tensors are moved to before the forward pass.
    batch_size: Number of texts per forward pass.

  Returns:
    The per-batch arrays stacked vertically, one row per input text.
  """
  chunks = []
  batch_starts = range(0, len(texts), batch_size)

  for start in tqdm(batch_starts, desc="Generating embeddings"):
    prefixed = [f"passage: {text}" for text in texts[start:start + batch_size]]

    encoded = tokenizer(prefixed, max_length=512, padding=True, truncation=True, return_tensors='pt')
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
      hidden_states = model(**encoded).last_hidden_state
      pooled = average_pool(hidden_states, encoded["attention_mask"])
      unit_vectors = F.normalize(pooled, p=2, dim=1)
      chunks.append(unit_vectors.cpu().numpy())

  return np.vstack(chunks)


# Map the category strings to integer class ids.
lb = LabelEncoder()
y = lb.fit_transform(labels)

# Embed every message once; rows of X_embeddings follow the order of `messages`.
X_embeddings = get_embeddings(messages, model, tokenizer, device)

# One record per message, carrying its position, text and both label forms.
metadata = [
    {"index": row, "message": text, "label": category, "label_encoded": encoded}
    for row, (text, category, encoded) in enumerate(zip(messages, labels, y))
]

In [None]:
# Preview a few records instead of rendering the entire list as cell output.
metadata[:5]

In [None]:
Test_Size = 0.1
SEED = 42

# Split row positions rather than the data itself, so the embeddings and the
# metadata records can both be selected with the same indices.
train_indices, test_indices = train_test_split(
    range(len(messages)),
    test_size=Test_Size,
    stratify=y,
    random_state=SEED,
)
X_train_emb, X_test_emb = X_embeddings[train_indices], X_embeddings[test_indices]

train_metadata = [metadata[row] for row in train_indices]
test_metadata = [metadata[row] for row in test_indices]

# Inner-product index over the training vectors. get_embeddings L2-normalises
# its output, so the inner product here is the cosine similarity.
embedding_dim = X_train_emb.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(X_train_emb.astype("float32"))

In [None]:
# Preview a few held-out records instead of rendering the entire list as cell output.
test_metadata[:5]

In [None]:
def classify_knn(query_text, model, tokenizer, device, index, train_metadata, k=1):
  """Classify one text by majority vote over its k nearest indexed neighbours.

  The text is prefixed with "query: ", embedded the same way as the indexed
  passages (mean pooling + L2 normalisation) and searched against `index`.

  Args:
    query_text: Raw text to classify.
    model: Encoder whose output exposes `last_hidden_state`.
    tokenizer: Tokenizer matching `model`.
    device: Device the tokenized tensors are moved to before the forward pass.
    index: Object exposing `search(vectors, k) -> (scores, indices)`.
    train_metadata: Sequence of dicts with "label" and "message" keys, indexed
      by the positions that `index.search` returns.
    k: Number of neighbours to retrieve.

  Returns:
    A `(final_prediction, neighbor_info)` tuple: the most frequent neighbour
    label, and one dict per neighbour with "score", "label" and a "message"
    preview cut to 100 characters.

  Raises:
    ValueError: If the search yields no valid neighbour (e.g. an empty index).
  """
  query_with_prefix = f"query: {query_text}"
  batch_dict = tokenizer([query_with_prefix], max_length=512, padding=True, truncation=True, return_tensors="pt")
  # Distinct loop names so the `k` parameter is not visually shadowed.
  batch_dict = {name: tensor.to(device) for name, tensor in batch_dict.items()}

  with torch.no_grad():
    outputs = model(**batch_dict)
    query_embedding = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
    query_embedding = F.normalize(query_embedding, p=2, dim=1)
    query_embedding = query_embedding.cpu().numpy().astype("float32")

  scores, indices = index.search(query_embedding, k)

  predictions = []
  neighbor_info = []

  for neighbor_idx, neighbor_score in zip(indices[0], scores[0]):
    # FAISS pads the result with -1 when fewer than k vectors are available;
    # indexing the metadata with -1 would silently pick its last record.
    if neighbor_idx < 0:
      continue

    neighbor = train_metadata[neighbor_idx]
    neighbor_label = neighbor["label"]
    neighbor_message = neighbor["message"]
    preview = (neighbor_message[:100] + "...") if len(neighbor_message) > 100 else neighbor_message

    predictions.append(neighbor_label)
    neighbor_info.append(
        {
            "score": float(neighbor_score),
            "label": neighbor_label,
            "message": preview
        }
    )

  if not predictions:
    raise ValueError("index.search returned no valid neighbours for the query")

  # Vote once, after all neighbours have been collected. np.unique returns the
  # labels sorted and argmax takes the first maximum, so a tie goes to the
  # label that sorts first.
  unique_labels, counts = np.unique(predictions, return_counts=True)
  final_prediction = unique_labels[np.argmax(counts)]

  return final_prediction, neighbor_info

In [None]:
def evaluate_knn_accuracy(test_embeddings, test_metadata, index, train_metadata, k_values=(1, 3, 5)):
  """Measure k-NN majority-vote accuracy on precomputed test embeddings.

  For every value in `k_values`, each test vector is searched against `index`,
  the labels of the returned neighbours are put to a vote, and the winning
  label is compared with the sample's true label exactly once.

  Args:
    test_embeddings: 2-D array with one embedding per row.
    test_metadata: Sequence of dicts with "label", "message" and "index" keys,
      aligned row by row with `test_embeddings`.
    index: Object exposing `search(vectors, k) -> (scores, indices)`.
    train_metadata: Sequence of dicts with "label" and "message" keys, indexed
      by the positions that `index.search` returns.
    k_values: Iterable of neighbourhood sizes to evaluate.

  Returns:
    A `(results, all_errors)` tuple. `results` maps each k to its accuracy
    (0.0 when there are no test samples). `all_errors` maps each k to a list
    with one record per misclassified sample.
  """
  results = {}
  all_errors = {}
  total = len(test_embeddings)

  for k in k_values:
    correct = 0
    errors = []

    for i in tqdm(range(total), desc=f"Evaluating k={k}"):
      query_embedding = test_embeddings[i:i+1].astype("float32")
      true_label = test_metadata[i]["label"]
      true_message = test_metadata[i]["message"]

      scores, indices = index.search(query_embedding, k)

      predictions = []
      neighbor_details = []

      for neighbor_idx, neighbor_score in zip(indices[0], scores[0]):
        # FAISS pads the result with -1 when fewer than k vectors are
        # available; such slots carry no neighbour and are skipped.
        if neighbor_idx < 0:
          continue

        neighbor = train_metadata[neighbor_idx]
        predictions.append(neighbor["label"])
        neighbor_details.append(
            {
                "label": neighbor["label"],
                "message": neighbor["message"],
                "score": float(neighbor_score)
            }
        )

      # Vote once per sample, after all neighbours are collected. Voting inside
      # the neighbour loop would score the same sample up to k times.
      if predictions:
        unique_labels, counts = np.unique(predictions, return_counts=True)
        predicted_label = unique_labels[np.argmax(counts)]
      else:
        # No valid neighbour at all: there is nothing to vote on, so the
        # sample is recorded as an error with an empty distribution.
        unique_labels, counts = [], []
        predicted_label = None

      if predicted_label == true_label:
        correct += 1
      else:
        errors.append(
            {
                "index": i,
                "original_index": test_metadata[i]["index"],
                "message": true_message,
                "true_label": true_label,
                "predicted_label": predicted_label,
                "neighbors": neighbor_details,
                "label_distribution": {label: int(count) for label, count in zip(unique_labels, counts)}
            }
        )

    # Guard the ratios so an empty test set reports 0 instead of raising.
    accuracy = correct / total if total else 0.0
    error_count = total - correct
    error_pct = (error_count / total) * 100 if total else 0.0

    results[k] = accuracy
    all_errors[k] = errors

    print(f"Accuracy with k={k}: {accuracy:.4f}")
    print(f"Number of errors with k={k}: {error_count}/{total} ({error_pct:.2f}%)")

  return results, all_errors

In [None]:
%%time
print("Evaluating accuracy on test set...")
accuracy_results, error_results = evaluate_knn_accuracy(X_test_emb, test_metadata, index, train_metadata, k_values=[1, 3, 5])

print("\n" + "="*50)
print("Accuracy Results")
print("="*50)

for k, accuracy in accuracy_results.items():
  print(f"Top-{k} accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("="*50)

import json
from datetime import datetime

error_analysis = {
    "timestamp": datetime.now().isoformat(),
    "model": Model_Name,
    "test_size": len(X_test_emb),
    "accuracy_results": accuracy_results,
    "errors_by_k": {}
}

for k,errors in error_results.items():
  error_analysis["errors_by_k"][f"k_{k}"] = {
      "total_errors": len(errors),
      "error_rate": len(errors)/len(X_test_emb),
      "errors": errors
  }

output_file = "/content/drive/MyDrive/Data Science Projects/Machine Learning Projects/Email_classification/error_analysis.json"
with open(output_file, "w", encoding="utf-8") as f:
  json.dump(error_analysis, f, ensure_ascii=False, indent=2)

print(f"\n***Error analysis saved to: {output_file}***")
print()
print(f"***Summary:")
for k, errors in error_results.items():
  print(f" k={k}: {len(errors)} errors out of {len(X_test_emb)} samples")