**IMPORT LIBRARIES**

In [1]:
!pip install -q faiss-cpu
!pip install -q tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import faiss
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

**READ THE DATASET**

In [3]:
# Absolute path of the CSV file holding the labelled messages.
DATASET_PATH = "/content/SPAM text message 20170820 - Data.csv"
# Load the file into a DataFrame; later cells read its 'Category' and 'Message' columns.
df = pd.read_csv(DATASET_PATH)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Pull the raw texts and their class names out of the frame as plain Python lists
messages = df["Message"].tolist()
labels = df["Category"].tolist()

**EMBEDDING MODEL AND VECTOR DATABASE**

**1. Embedding Model**

In [5]:
# Checkpoint to load ("intfloat/multilingual-e5-base" was the alternative tried before)
MODEL_NAME = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# With two or more visible GPUs, wrap the model so batches are split across them
if torch.cuda.device_count() >= 2:
    print(f"Using {torch.cuda.device_count()} GPU!")
    model = nn.DataParallel(model)

# Move the weights to the selected device and switch to evaluation mode
model.to(device)
model.eval()

# Mean-pool token vectors into a single sentence vector
def average_pool(last_hidden_states, attention_mask):
    """Average hidden states over the positions selected by the attention mask.

    Positions whose mask entry is zero are overwritten with 0.0 before the
    hidden states are summed along dim 1; the sum is then divided by the
    per-row total of the mask.

    Args:
        last_hidden_states: tensor of token vectors, summed along dim 1.
        attention_mask: tensor whose non-zero entries mark the positions to keep.

    Returns:
        Tensor with dim 1 of ``last_hidden_states`` reduced away.
    """
    padding = attention_mask.unsqueeze(-1).bool().logical_not()
    summed = last_hidden_states.masked_fill(padding, 0.0).sum(dim=1)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return summed / token_counts

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

**2. Vectorize Database and Initialize Metadata**

In [6]:
# Encode texts into L2-normalised sentence vectors
def get_embeddings(texts, model, tokenizer, device, batch_size=32):
    """Embed ``texts`` batch by batch and return the vectors as one NumPy array.

    Every text is prefixed with "passage: ", tokenized (truncated to 512
    tokens and padded within its batch), passed through ``model`` with
    gradients disabled, pooled with ``average_pool`` and L2-normalised.

    Args:
        texts: sliceable sequence of texts to embed.
        model: callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` attribute.
        tokenizer: callable producing a mapping of tensors that includes
            "attention_mask".
        device: device the tokenized tensors are moved to.
        batch_size: number of texts processed per forward pass.

    Returns:
        The per-batch arrays stacked vertically, in the order of ``texts``.
    """
    chunks = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc='Generating embeddings'):
        prefixed = [f"passage: {text}" for text in texts[start:start + batch_size]]

        encoded = tokenizer(
            prefixed,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        # Put every tensor on the same device as the model
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
            pooled = average_pool(hidden, encoded["attention_mask"])
            unit_vectors = F.normalize(pooled, p=2, dim=1)
            chunks.append(unit_vectors.cpu().numpy())

    return np.vstack(chunks)

# Encode the string class names as integers
le = LabelEncoder()
y = le.fit_transform(labels)

# Embed every message in the dataset
X_embeddings = get_embeddings(messages, model, tokenizer, device)

# One record per message, in the same order as the rows of X_embeddings
metadata = [
    dict(index=position, message=text, label=name, label_encoded=code)
    for position, (text, name, code) in enumerate(zip(messages, labels, y))
]

Generating embeddings: 100%|██████████| 175/175 [01:13<00:00,  2.38it/s]


**3. Vector Database and Data Split**

In [7]:
# Hold-out configuration
TEST_SIZE = 0.1
SEED = 42

# Stratified split over row positions, so embeddings, labels and metadata
# can all be sliced with the same index lists
train_indices, test_indices = train_test_split(
    range(len(messages)), test_size=TEST_SIZE, stratify=y, random_state=SEED
)

# Slice embeddings and encoded labels by position
X_train_emb, X_test_emb = X_embeddings[train_indices], X_embeddings[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

# Pick the matching metadata records
train_metadata = [metadata[position] for position in train_indices]
test_metadata = [metadata[position] for position in test_indices]

# Inner-product FAISS index over the training vectors only
embedding_dim = X_train_emb.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(X_train_emb.astype("float32"))

**CLASSIFICATION LOGIC AND EVALUATION**

**1. Classification with KNN**

In [8]:
def classify_with_knn(query_text, model, tokenizer, device, index, train_metadata, k=1):
    """Classify one text by majority vote over its nearest indexed neighbours.

    The text is prefixed with "query: ", embedded with ``model`` and
    ``average_pool``, L2-normalised and searched against ``index``.

    NOTE(review): the held-out evaluation in this notebook scores test
    messages embedded by ``get_embeddings`` (prefix "passage: "), whereas this
    function embeds with "query: " — confirm the reported accuracy is
    representative of this path.

    Args:
        query_text: text to classify.
        model: callable accepting the tokenizer output as keyword arguments
            and returning an object with ``last_hidden_state``.
        tokenizer: callable producing a mapping of tensors that includes
            "attention_mask".
        device: device the tokenized tensors are moved to.
        index: object with ``search(queries, k) -> (scores, indices)``.
        train_metadata: list of dicts with "label" and "message" keys, aligned
            with the vectors stored in ``index``.
        k: number of neighbours requested from the index.

    Returns:
        ``(final_prediction, neighbor_info)`` where ``neighbor_info`` is a list
        of dicts with "score", "label" and "message" (messages longer than 100
        characters are cut and suffixed with "..."). Ties in the vote go to the
        label that sorts first, because ``np.unique`` returns sorted labels and
        ``np.argmax`` picks the first maximum.

    Raises:
        ValueError: if the index returns no valid neighbour.
    """
    # Get query embedding
    query_with_prefix = f"query: {query_text}"
    batch_dict = tokenizer([query_with_prefix],
                           max_length=512,
                           padding=True,
                           truncation=True,
                           return_tensors="pt")
    # Distinct comprehension names so the parameter `k` is not visually shadowed
    batch_dict = {name: tensor.to(device) for name, tensor in batch_dict.items()}

    with torch.no_grad():
        outputs = model(**batch_dict)
        query_embedding = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
        query_embedding = F.normalize(query_embedding, p=2, dim=1)
        query_embedding = query_embedding.cpu().numpy().astype("float32")

    # Search in FAISS index
    scores, indices = index.search(query_embedding, k)

    # Get predictions from top-k neighbors
    predictions = []
    neighbor_info = []

    for neighbor_idx, neighbor_score in zip(indices[0], scores[0]):
        # FAISS pads the result with -1 when fewer than k vectors are
        # available; indexing train_metadata[-1] would silently vote with the
        # last training record, so padded slots are skipped.
        if neighbor_idx < 0:
            continue

        neighbor_label = train_metadata[neighbor_idx]["label"]
        neighbor_message = train_metadata[neighbor_idx]["message"]

        predictions.append(neighbor_label)
        neighbor_info.append({
            "score": float(neighbor_score),
            "label": neighbor_label,
            "message": neighbor_message[:100] + "..." if len(neighbor_message) > 100 else neighbor_message
        })

    if not predictions:
        raise ValueError("No neighbors found: the index returned no valid results")

    # Majority vote for final prediction
    unique_labels, counts = np.unique(predictions, return_counts=True)
    final_prediction = unique_labels[np.argmax(counts)]

    return final_prediction, neighbor_info

**2. KNN Evaluation**

In [9]:
def evaluate_knn_accuracy(test_embeddings, test_labels, test_metadata, index, train_metadata, k_values=(1, 3, 5)):
    """Evaluate k-NN majority-vote accuracy for several k using precomputed embeddings.

    For each k, every test vector is searched in ``index`` and the labels of
    the returned training records are majority-voted. Ties go to the label
    that sorts first (``np.unique`` returns sorted labels, ``np.argmax`` picks
    the first maximum).

    Args:
        test_embeddings: array with one row per test sample; rows are cast to
            float32 before searching.
        test_labels: accepted for interface compatibility but not read; the
            ground truth is taken from ``test_metadata[i]['label']``.
        test_metadata: list of dicts with 'index', 'label' and 'message',
            aligned with ``test_embeddings``.
        index: object with ``search(queries, k) -> (scores, indices)``.
        train_metadata: list of dicts with 'label' and 'message', aligned with
            the vectors stored in ``index``.
        k_values: iterable of neighbour counts to evaluate.

    Returns:
        ``(results, all_errors)``: ``results[k]`` is the accuracy (0.0 for an
        empty test set) and ``all_errors[k]`` is a list of dicts describing
        each misclassified sample.
    """
    results = {}
    all_errors = {}
    total = len(test_embeddings)

    for k in k_values:
        correct = 0
        errors = []

        for i in tqdm(range(total), desc=f"Evaluating k={k}"):
            query_embedding = test_embeddings[i:i+1].astype('float32')
            true_label = test_metadata[i]['label']
            true_message = test_metadata[i]['message']

            # Search in FAISS index
            scores, indices = index.search(query_embedding, k)

            # FAISS pads with -1 when fewer than k vectors are available;
            # train_metadata[-1] would silently vote with the last training
            # record, so padded slots are dropped.
            neighbor_details = [
                {
                    'label': train_metadata[neighbor_idx]['label'],
                    'message': train_metadata[neighbor_idx]['message'],
                    'score': float(neighbor_score)
                }
                for neighbor_idx, neighbor_score in zip(indices[0], scores[0])
                if neighbor_idx >= 0
            ]
            predictions = [neighbor['label'] for neighbor in neighbor_details]

            # Majority vote (no neighbours at all counts as a miss)
            if predictions:
                unique_labels, counts = np.unique(predictions, return_counts=True)
                # Plain str/int keep the error report free of NumPy scalars
                predicted_label = str(unique_labels[np.argmax(counts)])
                label_distribution = {
                    str(label): int(count) for label, count in zip(unique_labels, counts)
                }
            else:
                predicted_label = None
                label_distribution = {}

            if predicted_label == true_label:
                correct += 1
            else:
                # Collect error information
                errors.append({
                    'index': i,
                    'original_index': test_metadata[i]['index'],
                    'message': true_message,
                    'true_label': true_label,
                    'predicted_label': predicted_label,
                    'neighbors': neighbor_details,
                    'label_distribution': label_distribution
                })

        # Guard against an empty test set instead of raising ZeroDivisionError
        accuracy = correct / total if total else 0.0
        error_count = total - correct
        error_pct = (error_count / total) * 100 if total else 0.0

        results[k] = accuracy
        all_errors[k] = errors

        print(f"Accuracy with k={k}: {accuracy:.4f}")
        print(f"Number of errors with k={k}: {error_count}/{total} ({error_pct:.2f}%)")

    return results, all_errors

**EVALUATION ON TEST SET**

In [10]:
%%time
print("Evaluating accuracy on test set...")
accuracy_results, error_results = evaluate_knn_accuracy(
    X_test_emb,
    y_test,
    test_metadata,
    index,
    train_metadata,
    k_values=[1, 3, 5]
)

# Print the results
print("\n" + "="*50)
print("ACCURACY RESULTS")
print("="*50)
for k, accuracy in accuracy_results.items():
    print(f"Top-{k} accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("="*50)

# Save error analyses to file
import json
from datetime import datetime

error_analysis = {
    "timestamp": datetime.now().isoformat(),
    "model": MODEL_NAME,
    "test_size": len(X_test_emb),
    "accuracy_results": accuracy_results,
    "errors_by_k": {}
}

for k, errors in error_results.items():
    error_analysis["errors_by_k"][f"k_{k}"] = {
        "total_errors": len(errors),
        "error_rate": len(errors) / len(X_test_emb),
        "errors": errors
    }

# Save JSON file records errors
output_file = "error_analysis.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(error_analysis, f, ensure_ascii=False, indent=2)

print(f"\n***Error analysis saved to: {output_file}***")
print()
print(f"***Summary:")
for k, errors in error_results.items():
    print(f"   k={k}: {len(errors)} errors out of {len(X_test_emb)} samples")


Evaluating accuracy on test set...


Evaluating k=1: 100%|██████████| 558/558 [00:01<00:00, 298.09it/s]


Accuracy with k=1: 0.9910
Number of errors with k=1: 5/558 (0.90%)


Evaluating k=3: 100%|██████████| 558/558 [00:02<00:00, 243.49it/s]


Accuracy with k=3: 0.9928
Number of errors with k=3: 4/558 (0.72%)


Evaluating k=5: 100%|██████████| 558/558 [00:02<00:00, 204.65it/s]

Accuracy with k=5: 0.9910
Number of errors with k=5: 5/558 (0.90%)

ACCURACY RESULTS
Top-1 accuracy: 0.9910 (99.10%)
Top-3 accuracy: 0.9928 (99.28%)
Top-5 accuracy: 0.9910 (99.10%)

***Error analysis saved to: error_analysis.json***

***Summary:
   k=1: 5 errors out of 558 samples
   k=3: 4 errors out of 558 samples
   k=5: 5 errors out of 558 samples
CPU times: user 1.96 s, sys: 33.4 ms, total: 1.99 s
Wall time: 6.95 s





**BUILDING PIPELINE**

In [11]:
def spam_classifier_pipeline(user_input, k=3):
    """Classify ``user_input`` with the k-NN classifier and print a short report.

    Calls ``classify_with_knn`` with the notebook-level ``model``,
    ``tokenizer``, ``device``, ``index`` and ``train_metadata``, prints the
    prediction in upper case followed by each neighbour's label, score and
    message, and returns the details.

    Args:
        user_input: text to classify.
        k: number of nearest neighbours to use.

    Returns:
        Dict with 'prediction', 'neighbors' (as returned by
        ``classify_with_knn``) and 'label_distribution' (label -> number of
        neighbours carrying it).
    """
    # Report header; every line is followed by a blank line
    print()
    print(f"***Classifying: '{user_input}'", end="\n\n")
    print(f"***Using top-{k} nearest neighbors", end="\n\n")

    # Run the classifier
    prediction, neighbors = classify_with_knn(
        user_input, model, tokenizer, device, index, train_metadata, k=k
    )

    # Show the verdict and the supporting neighbours
    print(f"***Prediction: {prediction.upper()}", end="\n\n")
    print("***Top neighbors:")

    label_counts = {}
    rank = 0
    for neighbor in neighbors:
        rank += 1
        print(f"{rank}. Label: {neighbor['label']} | Score: {neighbor['score']:.4f}")
        print(f"   Message: {neighbor['message']}", end="\n\n")

        # Tally how many neighbours carry each label
        label_counts[neighbor['label']] = label_counts.get(neighbor['label'], 0) + 1

    return {
        'prediction': prediction,
        'neighbors': neighbors,
        'label_distribution': label_counts
    }

**TEST PIPELINE**

In [12]:
# Messages to run through the pipeline; the commented-out entries are further candidates
test_examples = [
    "I am actually thinking a way of doing something useful",
    "FREE!! Click here to win $1000 NOW! Limited time offer!",
    # "Hey, can you pick me up at 5pm today?",
    # "URGENT: Your account will be suspended unless you verify your details NOW",
    # "Thanks for the meeting today, let's schedule the next one for next week",
    # "Congratulations! You've won a prize! Call this number to claim it"
]

print("Testing pipeline with different examples:", end="\n\n")

# Classify each example with its 3 nearest neighbours; `result` keeps the last outcome
for i, example in enumerate(test_examples, start=1):
    print(f"\n***Example {i}:")
    result = spam_classifier_pipeline(example, k=3)
    print()

Testing pipeline with different examples:


***Example 1:

***Classifying: 'I am actually thinking a way of doing something useful'

***Using top-3 nearest neighbors

***Prediction: HAM

***Top neighbors:
1. Label: ham | Score: 0.8410
   Message: that would be good … I'll phone you tomo lunchtime, shall I, to organise something?

2. Label: ham | Score: 0.8341
   Message: K, I'll work something out

3. Label: ham | Score: 0.8313
   Message: And how you will do that, princess? :)



***Example 2:

***Classifying: 'FREE!! Click here to win $1000 NOW! Limited time offer!'

***Using top-3 nearest neighbors

***Prediction: SPAM

***Top neighbors:
1. Label: spam | Score: 0.8797
   Message: You have WON a guaranteed £1000 cash or a £2000 prize.To claim yr prize call our customer service re...

2. Label: spam | Score: 0.8744
   Message: You are a £1000 winner or Guaranteed Caller Prize, this is our Final attempt to contact you! To Clai...

3. Label: spam | Score: 0.8744
   Message: You have WON