# Support vector machine (SVM)

## 1. Set up project

### 1.1. Environment setup

In [1]:
import os
import sys
# Make the parent directory of the working directory importable (presumably
# so the local `sailor` package used below resolves). The membership guard
# keeps re-running this cell from adding the same entry twice.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Load the IPython extension named "dotenv" and run its %dotenv line magic.
# NOTE(review): presumably this is python-dotenv reading a .env file into the
# process environment; the AI_* variables fetched with os.getenv further down
# depend on it, so a missing .env leaves them unset.
%load_ext dotenv
%dotenv

cannot find .env file


### 1.2. Build train data

In [2]:
from sailor import SailorDataEngineer, RouteGenConfig

_config = RouteGenConfig(
  api_key=os.getenv("AI_API_KEY"), # type: ignore
  model=os.getenv("AI_MODEL"), # type: ignore
  base_url=os.getenv("AI_MODEL_URL"),
  cache_dir="../build/cache")
_engineer = SailorDataEngineer(_config)

_route_context = "flight agency"
routes_context = await _engineer.generate_data(route_context=_route_context, cache_key="train_flight_agency")

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

### 1.3. Train test split

In [None]:
from sailor import NavigationContext
from sklearn.model_selection import train_test_split

# Hold out 20% of the generated sessions for evaluation.
# NOTE(review): no random_state is passed, so the partition may differ from
# one run to the next.
_train_sessions, _test_sessions = train_test_split(
    routes_context.sessions,
    test_size=0.2,
)

# The training context keeps the complete route list but only the training
# share of the sessions.
train_context = NavigationContext(
    routes=routes_context.routes,
    sessions=_train_sessions,
)

## 2. Set up model

In [None]:
import numpy as np
from sailor import RouteVectorizer
from sklearn.svm import LinearSVC

# Fit the vectorizer on the training context; it hands back the feature
# matrix together with the label for each row.
vectorizer = RouteVectorizer()
route_vectors, labels = vectorizer.fit(train_context)

# Linear SVM classifier. class_weight="balanced" reweights classes inversely
# to their frequency; max_iter is set to 2000 iterations.
_svm_params = {"class_weight": "balanced", "max_iter": 2000}
model = LinearSVC(**_svm_params)
model.fit(route_vectors, labels)

def recommend_route(query: str, top_n=3):
    """Rank candidate routes for a free-text query with the fitted SVM.

    The query is vectorized, scored with the model's ``decision_function``,
    the raw scores are squashed into (0, 1) with a logistic sigmoid, and the
    ``top_n`` highest-scoring indices are mapped back through
    ``vectorizer.inverse_transform``.

    Args:
        query: Text describing what the user wants to do.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(route, score)`` tuples ordered from highest to lowest
        score, where ``route`` is whatever ``vectorizer.inverse_transform``
        yields for the index.
    """
    margins = model.decision_function(vectorizer.transform(query))[0]

    # The sigmoid is monotonic, so it rescales the scores without changing
    # their order. NOTE(review): these are squashed margins, not calibrated
    # probabilities.
    squashed = 1 / (1 + np.exp(-margins))

    best_first = np.argsort(squashed)[::-1]
    ranked = []
    for idx in best_first[:top_n]:
        ranked.append((vectorizer.inverse_transform(idx), squashed[idx]))
    return ranked


## 3. Validate vectorizer 

### 3.1. Test vectorizer

In [None]:
import time

def _test_query(query):
    """Run one query through ``recommend_route`` and print the ranked results.

    Prints a header line with the elapsed time in milliseconds, followed by
    one line per recommendation showing the route id, its path and the score
    rounded to three decimals.

    Args:
        query: Text passed unchanged to ``recommend_route``.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() is wall-clock time, which can be adjusted
    # while the call runs and ticks coarsely on some platforms.
    started = time.perf_counter()
    results = recommend_route(query)
    latency = (time.perf_counter() - started) * 1000
    print(f"Results ({latency:.2f}ms):")

    for route, score in results:
        print(f"- {route.id} | {route.path} (score: {score:.3f})")

# For every held-out session, print the query with the route it is expected
# to resolve to, then the model's actual recommendations.
for session in _test_sessions:
    # First route whose id matches the session, or None when nothing matches.
    # A plain for/break scan would leave the loop variable on the last route
    # in that case and report it as the expected one.
    route = next(
        (r for r in routes_context.routes if r.id == session.route_id),
        None,
    )

    query = session.intention.context
    if route is None:
        print(f"Query: '{query}'; Expected route: <no route with id {session.route_id}>;")
    else:
        print(f"Query: '{query}'; Expected route: {route.id} | {route.path};")
    _test_query(query)

### 3.2. Analyze size

In [None]:
# Number of entries in the vocabulary held by the wrapped vectorizer.
_vocabulary = vectorizer._vectorizer.vocabulary_
print("Vocabulary size:", len(_vocabulary))

# NOTE(review): sys.getsizeof is shallow. If route_vectors is a sparse matrix
# or another container object, the figure excludes the underlying data
# buffers; confirm the type before relying on this number.
_route_vectors = vectorizer.route_vectors
print("Vector size:", sys.getsizeof(_route_vectors))


### 3.3. Analyze route similarity

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the vectorizer's route vectors, drawn
# as a heat map.
similarity_matrix = cosine_similarity(vectorizer.route_vectors)
plt.figure(figsize=(10, 8))
plt.imshow(similarity_matrix, cmap="Blues")
plt.colorbar(label="Similarity Score")
plt.title("Route Similarity Matrix")

# Paths of the routes referenced by at least one held-out session. Sessions
# point at routes through `route_id`, so membership is tested on ids; testing
# a route object against the list of session objects is not expected to ever
# match, which would leave both axes without labels.
_test_route_ids = {s.route_id for s in _test_sessions}
_test_routes = [r.path for r in routes_context.routes if r.id in _test_route_ids]

# NOTE(review): tick i is labelled with the i-th test route, which assumes
# the matrix rows follow that same order. Confirm against RouteVectorizer.
plt.xticks(ticks=range(len(_test_routes)), labels=_test_routes, rotation=90)
plt.yticks(ticks=range(len(_test_routes)), labels=_test_routes)

plt.show()