### Set Up the Environment and Install Dependencies

In [39]:
!pip install -qq faiss-cpu
!pip install --upgrade transformers==4.39.3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [40]:
import json
import time
from datetime import datetime

import faiss
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

### Reading and Preparing Data

In [41]:
DATASET_PATH = "./Data/cls_spam_text_cls.csv"
df = pd.read_csv(DATASET_PATH)

# Pull the message text and its category out of the frame as two parallel Python lists
messages, labels = (
    df[column].values.tolist() for column in ("Message", "Category")
)

### Preparing embedding model

In [42]:
# Checkpoint that provides both the tokenizer and the encoder
MODEL_NAME = "intfloat/multilingual-e5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the encoder, move it to the chosen device and switch it to evaluation mode
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

def average_pool(last_hidden_states, attention_mask):
    """Average the hidden states along dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are set to 0.0 before summing;
    the sum is then divided by the per-row sum of the mask (the number of kept
    positions for a 0/1 mask).
    """
    is_masked_out = attention_mask.unsqueeze(-1).bool().logical_not()
    summed = last_hidden_states.masked_fill(is_masked_out, 0.0).sum(dim=1)
    kept_per_row = attention_mask.sum(dim=1).unsqueeze(-1)
    return summed / kept_per_row



### Vectorize Data and Generate Metadata

In [43]:
# Create sentence embeddings
def get_embeddings(texts, model, tokenizer, device, batch_size=32, prefix="passage: "):
    """Embed a list of documents, producing one L2-normalised vector per document.

    Args:
        texts: Sequence of documents. Each item is formatted into an f-string,
            so non-string items are stringified.
        model: Encoder called as ``model(**batch)``; its output must expose
            ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a dict of tensors that contains
            an ``"attention_mask"`` entry.
        device: Device the tokenized batch is moved to before the forward pass.
        batch_size: Number of documents per forward pass; must be at least 1.
        prefix: Text prepended to every document before tokenization. The
            default, ``"passage: "``, is the value that used to be hard-coded.

    Returns:
        np.ndarray: One row per input text, in input order. For empty ``texts``
        a float32 array with zero rows is returned; its width is
        ``model.config.hidden_size`` when that attribute exists, otherwise 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # np.vstack([]) raises, so an empty corpus is answered without running the model.
    if len(texts) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        batch_texts = texts[start:start + batch_size]
        batch_texts_with_prefix = [f"{prefix}{text}" for text in batch_texts]
        batch_dict = tokenizer(batch_texts_with_prefix, max_length=512,
                               padding=True, truncation=True, return_tensors="pt")
        batch_dict = {name: tensor.to(device) for name, tensor in batch_dict.items()}

        # Inference only: no gradients are needed for the forward pass
        with torch.no_grad():
            outputs = model(**batch_dict)
            batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
            # Unit-length rows, so an inner product between two rows is their cosine similarity
            batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
            embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(embeddings)


# Preparing labels
le = LabelEncoder()
y = le.fit_transform(labels)

# Create embeddings for all messages
X_embeddings = get_embeddings(messages, model, tokenizer, device)

# Create metadata for each document.
# y.tolist() yields plain Python ints, so every record stays JSON-serializable
# (json.dump rejects numpy integer scalars).
metadata = [
    {"index": i, "Message": message, "label": label, "label_encoded": encoded}
    for i, (message, label, encoded) in enumerate(zip(messages, labels, y.tolist()))
]



Generating embeddings: 100%|██████████| 175/175 [01:33<00:00,  1.87it/s]


In [44]:
# Shape check: (number of embedded messages, embedding width)
X_embeddings.shape

(5572, 768)

In [45]:
# Slicing with [:1] keeps the row axis, so a single embedding is still a 2-D array
X_embeddings[:1].shape

(1, 768)

In [51]:
# Look at one metadata record to confirm its fields
metadata[2]

{'index': 2,
 'Message': "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'label': 'spam',
 'label_encoded': 1}

### Building Vector Databases and Splitting Data

In [55]:
# Hold-out configuration for the train/test split
TEST_SIZE = 0.1
SEED = 42

# Split row positions (stratified on the encoded labels) so that embeddings,
# metadata and labels can all be selected with the same two position lists.
train_indices, test_indices = train_test_split(
    range(len(messages)),
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y,
)

# Embeddings for each side of the split
X_train_emb, X_test_emb = X_embeddings[train_indices], X_embeddings[test_indices]

# Metadata records for each side of the split
train_metadata = [metadata[pos] for pos in train_indices]
test_metadata = [metadata[pos] for pos in test_indices]

# Encoded labels for each side of the split
y_train = [y[pos] for pos in train_indices]
y_test = [y[pos] for pos in test_indices]

# Inner-product FAISS index holding the training vectors only
embedding_dim = X_train_emb.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(X_train_emb.astype(np.float32))




### Building Classification and Evaluation Logic

In [56]:
# Implementing classification with embedding similarity
def classify_with_knn(query_text, model, tokenizer, device, index, train_metadata, k=1):
    """Classify a text by majority vote among its k nearest indexed messages.

    Args:
        query_text: Text to classify; it is embedded with a ``"query: "`` prefix.
        model: Encoder called as ``model(**batch)``; its output must expose
            ``last_hidden_state``.
        tokenizer: Callable tokenizer returning a dict of tensors that contains
            an ``"attention_mask"`` entry.
        device: Device the tokenized query is moved to.
        index: Object with a ``search(queries, k)`` method returning
            ``(scores, indices)``, as a FAISS index does.
        train_metadata: Sequence of dicts with ``"label"`` and ``"Message"``
            keys, positionally aligned with the vectors stored in ``index``.
        k: Number of neighbours requested from the index (at least 1).

    Returns:
        tuple: ``(final_prediction, neighbor_info)``. ``neighbor_info`` is a
        list of ``{"score", "label", "message"}`` dicts in the order returned
        by the index, with messages longer than 100 characters truncated. It
        holds fewer than ``k`` entries when the index has fewer than ``k``
        vectors.

    Raises:
        ValueError: If ``k`` is smaller than 1, or the index returns no valid
            neighbour (for example because it is empty).
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # NOTE(review): the stored vectors are built by get_embeddings() with a
    # "passage: " prefix, and evaluate_knn_accuracy() scores test vectors that
    # were embedded the same way, whereas this function embeds with "query: ".
    # The reported accuracy is therefore measured under a different prefix
    # than the one used here - confirm this is intended.
    query_with_prefix = f"query: {query_text}"
    batch_dict = tokenizer([query_with_prefix],
                           max_length=512,
                           padding=True,
                           truncation=True,
                           return_tensors="pt")

    batch_dict = {name: tensor.to(device) for name, tensor in batch_dict.items()}

    with torch.no_grad():
        outputs = model(**batch_dict)
        query_embedding = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
        query_embedding = F.normalize(query_embedding, p=2, dim=1)
        query_embedding = query_embedding.cpu().numpy().astype("float32")

    # Search in FAISS index
    scores, indices = index.search(query_embedding, k)

    # Get predictions from top-k neighbors
    predictions = []
    neighbor_info = []

    for neighbor_idx, neighbor_score in zip(indices[0], scores[0]):
        # FAISS pads the result with -1 when fewer than k vectors are available;
        # indexing train_metadata with -1 would silently vote with its last record.
        if neighbor_idx < 0:
            continue

        record = train_metadata[int(neighbor_idx)]
        neighbor_label = record["label"]
        neighbor_message = record["Message"]

        predictions.append(neighbor_label)
        neighbor_info.append({
            "score": float(neighbor_score),
            "label": neighbor_label,
            "message": (neighbor_message[:100] + "...") if len(neighbor_message) > 100 else neighbor_message
        })

    if not predictions:
        raise ValueError("the index returned no valid neighbours; is it empty?")

    # Majority vote for final prediction. np.unique returns the labels sorted and
    # np.argmax picks the first maximum, so a tie goes to the label that sorts first.
    unique_labels, counts = np.unique(predictions, return_counts=True)
    final_prediction = unique_labels[np.argmax(counts)]
    return final_prediction, neighbor_info

def evaluate_knn_accuracy(test_embeddings, test_labels, test_metadata, index,
                          train_metadata, k_values=(1, 3, 5)):
    """Measure k-NN majority-vote accuracy for several k using precomputed embeddings.

    Args:
        test_embeddings: 2-D array with one query vector per row.
        test_labels: Not read by this function; kept so existing calls keep
            working. The true label is taken from ``test_metadata``.
        test_metadata: Sequence of dicts with ``"index"``, ``"Message"`` and
            ``"label"`` keys, one per row of ``test_embeddings``.
        index: Object with a ``search(queries, k)`` method returning
            ``(scores, indices)``, as a FAISS index does.
        train_metadata: Sequence of dicts with ``"label"`` and ``"Message"``
            keys, positionally aligned with the vectors stored in ``index``.
        k_values: Iterable of neighbour counts to evaluate (each at least 1).

    Returns:
        tuple: ``(results, all_errors)`` where ``results`` maps each k to its
        accuracy and ``all_errors`` maps each k to a list of dicts describing
        every misclassified sample and its neighbours.

    Raises:
        ValueError: If there are no test rows, if ``test_metadata`` and
            ``test_embeddings`` differ in length, if any k is smaller than 1,
            or if the index returns no valid neighbour for a query.
    """
    total = len(test_embeddings)
    if total == 0:
        raise ValueError("test_embeddings is empty; there is nothing to evaluate")
    if len(test_metadata) != total:
        raise ValueError(
            f"test_metadata has {len(test_metadata)} records but test_embeddings has {total} rows"
        )

    k_values = list(k_values)
    if any(k < 1 for k in k_values):
        raise ValueError(f"every k must be at least 1, got {k_values}")

    # Cast once instead of once per sample and per k
    queries = test_embeddings.astype("float32")

    results = {}
    all_errors = {}

    for k in k_values:
        correct = 0
        errors = []

        for i in tqdm(range(total), desc=f"Evaluating k={k}"):
            true_label = test_metadata[i]["label"]
            true_message = test_metadata[i]["Message"]

            # Search in FAISS index (the i:i+1 slice keeps the query 2-D)
            scores, indices = index.search(queries[i:i+1], k)

            # Get predictions from top-k neighbors
            predictions = []
            neighbor_details = []
            for neighbor_idx, neighbor_score in zip(indices[0], scores[0]):
                # FAISS pads the result with -1 when fewer than k vectors are available;
                # indexing train_metadata with -1 would silently vote with its last record.
                if neighbor_idx < 0:
                    continue

                record = train_metadata[int(neighbor_idx)]
                predictions.append(record["label"])
                neighbor_details.append({
                    "label": record["label"],
                    "message": record["Message"],
                    "score": float(neighbor_score)
                })

            if not predictions:
                raise ValueError("the index returned no valid neighbours; is it empty?")

            # Majority vote. np.unique returns the labels sorted and np.argmax picks
            # the first maximum, so a tie goes to the label that sorts first.
            unique_labels, counts = np.unique(predictions, return_counts=True)
            predicted_label = unique_labels[np.argmax(counts)]

            if predicted_label == true_label:
                correct += 1
            else:
                # Collect error information
                errors.append({
                    "index": i,
                    "original_index": test_metadata[i]["index"],
                    "message": true_message,
                    "true_label": true_label,
                    "predicted_label": predicted_label,
                    "neighbors": neighbor_details,
                    "label_distribution": {label: int(count) for label, count in zip(unique_labels, counts)}
                })

        accuracy = correct / total
        error_count = total - correct

        results[k] = accuracy
        all_errors[k] = errors

        print(f"Accuracy with k={k}: {accuracy:.4f}")
        print(f"Number of errors with k={k}: {error_count}/{total} ({(error_count/total)*100:.2f}%)")

    return results, all_errors


### Model Evaluation on Test Set

In [57]:
# Accuracy Evaluation on test set
# (json, time and datetime are imported with the other modules in the import cell)
# perf_counter() is the clock intended for measuring elapsed time
start_time = time.perf_counter()

print("Evaluating accuracy on test set...")
accuracy_results, error_results = evaluate_knn_accuracy(
    X_test_emb,
    y_test,
    test_metadata,
    index,
    train_metadata,
    k_values=[1, 3, 5]
)

# Show results: one k-NN majority-vote accuracy per evaluated k
end_time = time.perf_counter()
print(f"\nEvaluation completed in {end_time - start_time:.2f} seconds")
print("\n" + "="*50)
print("ACCURACY RESULTS")
print("="*50)
for k, accuracy in accuracy_results.items():
    print(f"Top-{k} accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("="*50)

# Collect the run summary and every misclassified sample, grouped by k
error_analysis = {
    "timestamp": datetime.now().isoformat(),
    "model": MODEL_NAME,
    "test_size": len(X_test_emb),
    "accuracy_results": accuracy_results,
    "errors_by_k": {}
}

for k, errors in error_results.items():
    error_analysis["errors_by_k"][f"k={k}"] = {
        "total_errors": len(errors),
        "error_rate": len(errors) / len(X_test_emb),
        "errors": errors
    }

# Save JSON file with error log (ensure_ascii=False keeps non-ASCII message text readable)
output_file = "error_analysis.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(error_analysis, f, ensure_ascii=False, indent=2)

print(f"\n***Error analysis saved to: {output_file}***")
print("\n***Summary:")
for k, errors in error_results.items():
    print(f"  k={k}: {len(errors)} errors out of {len(X_test_emb)} samples")


Evaluating accuracy on test set...


Evaluating k=1: 100%|██████████| 558/558 [00:00<00:00, 2916.02it/s]


Accuracy with k=1: 0.9857
Number of errors with k=1: 8/558 (1.43%)


Evaluating k=3: 100%|██████████| 558/558 [00:00<00:00, 2881.30it/s]


Accuracy with k=3: 0.9928
Number of errors with k=3: 4/558 (0.72%)


Evaluating k=5: 100%|██████████| 558/558 [00:00<00:00, 2988.13it/s]

Accuracy with k=5: 0.9910
Number of errors with k=5: 5/558 (0.90%)

Evaluation completed in 0.58 seconds

ACCURACY RESULTS
Top-1 accuracy: 0.9857 (98.57%)
Top-3 accuracy: 0.9928 (99.28%)
Top-5 accuracy: 0.9910 (99.10%)

***Error analysis saved to: error_analysis.json***

***Summary:
  k=1: 8 errors out of 558 samples
  k=3: 4 errors out of 558 samples
  k=5: 5 errors out of 558 samples





### Build a Complete Classification Pipeline

In [58]:
# Classification pipeline for user input
def spam_classifier_pipeline(user_input, k=3):
    """
    Classify one text end to end and print a short report.

    The text is passed to ``classify_with_knn`` together with the notebook's
    ``model``, ``tokenizer``, ``device``, ``index`` and ``train_metadata``; the
    prediction and each returned neighbour are printed.

    Args:
        user_input (str): Text to classify
        k (int): Number of nearest neighbors to consider

    Returns:
        dict: ``"prediction"``, ``"neighbors"`` and ``"label_distribution"``
        (how many of the returned neighbours carry each label)
    """
    # Header of the report
    print()
    print(f"***Classifying: \"{user_input}\"", end="\n\n")
    print(f"***Using top-{k} nearest neighbors", end="\n\n")

    # Get prediction and neighbors
    prediction, neighbors = classify_with_knn(
        user_input, model, tokenizer, device, index, train_metadata, k=k
    )

    # Display results
    print(f"***Prediction: {prediction.upper()}", end="\n\n")
    print("***Top neighbors:")
    for rank, neighbor in enumerate(neighbors, start=1):
        print(f"{rank}. Label: {neighbor['label']} | Score: {neighbor['score']:.4f}")
        print(f"   Message: {neighbor['message']}", end="\n\n")

    # Count label distribution among the returned neighbours
    neighbor_labels = [neighbor["label"] for neighbor in neighbors]
    label_counts = {}
    for name in set(neighbor_labels):
        label_counts[name] = neighbor_labels.count(name)

    return {
        "prediction": prediction,
        "neighbors": neighbors,
        "label_distribution": label_counts
    }


### Pipeline Testing

In [59]:
# Test pipeline with different examples
# ("$" needs no escaping in a Python string; "\$" is an invalid escape sequence
# that leaves a stray backslash in the text being classified.)
test_examples = [
    "I am actually thinking a way of doing something useful",
    "FREE!! Click here to win $1000 NOW! Limited time offer!",
]

for i, example in enumerate(test_examples, 1):
    print(f"\n--- Example {i}: \"{example}\" ---")
    result = spam_classifier_pipeline(example, k=3)

# Interactive testing – change the text and the k value below, then re-run the cell
print("\n--- Interactive Testing ---")
user_text = "Win a free iPhone! Click here now!"
k_value = 5
result = spam_classifier_pipeline(user_text, k=k_value)



--- Example 1: "I am actually thinking a way of doing something useful" ---

***Classifying: "I am actually thinking a way of doing something useful"

***Using top-3 nearest neighbors

***Prediction: HAM

***Top neighbors:
1. Label: ham | Score: 0.8424
   Message: yeah, that's what I was thinking

2. Label: ham | Score: 0.8412
   Message: that would be good … I'll phone you tomo lunchtime, shall I, to organise something?

3. Label: ham | Score: 0.8344
   Message: See? I thought it all through


--- Example 2: "FREE!! Click here to win \$1000 NOW! Limited time offer!" ---

***Classifying: "FREE!! Click here to win \$1000 NOW! Limited time offer!"

***Using top-3 nearest neighbors

***Prediction: SPAM

***Top neighbors:
1. Label: spam | Score: 0.8566
   Message: Win a £1000 cash prize or a prize worth £5000

2. Label: spam | Score: 0.8499
   Message: FREE entry into our £250 weekly competition just text the word WIN to 80086 NOW. 18 T&C www.txttowin...

3. Label: spam | Score: 0.8489
  