In [1]:
import json
import faiss
import numpy as np
from fastapi import FastAPI
from pydantic import BaseModel
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load device specs
dataset = load_dataset("json", data_files="gsmarena_data.json")
devices = dataset["train"]
print(dataset['train'][0])

# Setting up the model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

{'brand': 'tcl', 'device': 'TCL 30 V 5G', 'specifications': {'Battery': [{'name': 'Type', 'value': 'Li-Po 4500 mAh'}, {'name': 'Charging', 'value': '18W wired'}], 'Body': [{'name': 'Dimensions', 'value': '165.9 x 76 x 9.1 mm (6.53 x 2.99 x 0.36 in)'}, {'name': 'Weight', 'value': '200.4 g (7.09 oz)'}, {'name': 'Build', 'value': 'Glass front, plastic frame, plastic back'}, {'name': 'SIM', 'value': 'Nano-SIM'}], 'Camera': None, 'Comms': [{'name': 'WLAN', 'value': 'Wi-Fi 802.11 a/b/g/n/ac, dual-band, Wi-Fi Direct'}, {'name': 'Bluetooth', 'value': '5.1, A2DP, LE'}, {'name': 'Positioning', 'value': 'GPS'}, {'name': 'NFC', 'value': 'No'}, {'name': 'Radio', 'value': 'No'}, {'name': 'USB', 'value': 'USB Type-C 3.0, OTG'}], 'Display': [{'name': 'Type', 'value': 'IPS LCD'}, {'name': 'Size', 'value': '6.67 inches, 107.4 cm2 (~85.2% screen-to-body ratio)'}, {'name': 'Resolution', 'value': '1080 x 2400 pixels, 20:9 ratio (~395 ppi density)'}, {'name': 'Protection', 'value': 'Corning Gorilla Glass 3'

In [3]:
def format_spec(device):
    specs = []
    for category in device["specifications"]:
        lst = device['specifications'][category]
        if lst is not None:
            for obj in lst:
                specs.append(f"{obj['name']}: {obj['value']}")
    return "\n".join(specs)

device_texts = [format_spec(d) for d in devices]
print(device_texts)

['Type: Li-Po 4500 mAh\nCharging: 18W wired\nDimensions: 165.9 x 76 x 9.1 mm (6.53 x 2.99 x 0.36 in)\nWeight: 200.4 g (7.09 oz)\nBuild: Glass front, plastic frame, plastic back\nSIM: Nano-SIM\nWLAN: Wi-Fi 802.11 a/b/g/n/ac, dual-band, Wi-Fi Direct\nBluetooth: 5.1, A2DP, LE\nPositioning: GPS\nNFC: No\nRadio: No\nUSB: USB Type-C 3.0, OTG\nType: IPS LCD\nSize: 6.67 inches, 107.4 cm2 (~85.2% screen-to-body ratio)\nResolution: 1080 x 2400 pixels, 20:9 ratio (~395 ppi density)\nProtection: Corning Gorilla Glass 3\nSensors: Fingerprint (rear-mounted), accelerometer, proximity, compass, gyro, barometer\nAnnounced: 2022, January 04\nStatus: Available. Released 2022, January 28\nTriple: 50 MP, (wide), PDAF\r\n5 MP, (ultrawide)\r\n2 MP, (macro)\nFeatures: LED flash, HDR, panorama\nVideo: 1080p@30fps\nCard slot: microSDXC (dedicated slot)\nInternal: 128GB 4GB RAM\nColors: Midnight Gray\nPrice: About 270 EUR\nTechnology: GSM / HSPA / LTE / 5G\n2G bands: GSM 850 / 900 / 1800 / 1900\n3G bands: HSDPA 

In [4]:
device_texts

['Type: Li-Po 4500 mAh\nCharging: 18W wired\nDimensions: 165.9 x 76 x 9.1 mm (6.53 x 2.99 x 0.36 in)\nWeight: 200.4 g (7.09 oz)\nBuild: Glass front, plastic frame, plastic back\nSIM: Nano-SIM\nWLAN: Wi-Fi 802.11 a/b/g/n/ac, dual-band, Wi-Fi Direct\nBluetooth: 5.1, A2DP, LE\nPositioning: GPS\nNFC: No\nRadio: No\nUSB: USB Type-C 3.0, OTG\nType: IPS LCD\nSize: 6.67 inches, 107.4 cm2 (~85.2% screen-to-body ratio)\nResolution: 1080 x 2400 pixels, 20:9 ratio (~395 ppi density)\nProtection: Corning Gorilla Glass 3\nSensors: Fingerprint (rear-mounted), accelerometer, proximity, compass, gyro, barometer\nAnnounced: 2022, January 04\nStatus: Available. Released 2022, January 28\nTriple: 50 MP, (wide), PDAF\r\n5 MP, (ultrawide)\r\n2 MP, (macro)\nFeatures: LED flash, HDR, panorama\nVideo: 1080p@30fps\nCard slot: microSDXC (dedicated slot)\nInternal: 128GB 4GB RAM\nColors: Midnight Gray\nPrice: About 270 EUR\nTechnology: GSM / HSPA / LTE / 5G\n2G bands: GSM 850 / 900 / 1800 / 1900\n3G bands: HSDPA 

In [5]:
device_embeddings = model.encode(device_texts, show_progress_bar=True)

# Create FAISS index
dimension = device_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(device_embeddings)
index.add(device_embeddings)

Batches: 100%|██████████| 53/53 [01:33<00:00,  1.77s/it]


In [50]:
def search_devices(query, battery_min=None, k=5):
    # Semantic search
    query_embed = model.encode([query])
    D, I = index.search(query_embed, k)
    print(D, I)
    # Apply filters
    results = []
    # print()
    for idx in I[0]:
        device = devices[idx.item()]
        if battery_min and device['specifications']['Battery'] < battery_min:
            continue
        results.append(device)

    return results[:k]
    # return None


In [7]:
class Query(BaseModel):
    text: str
    max_price: float = None
    min_battery: int = None

In [51]:
results = search_devices(
    "rugged phone with great battery life",
    battery_min=5000
)

[[0.5593261  0.55302477 0.54663783 0.5276793  0.522277  ]] [[314 318 317   9 315]]


TypeError: '<' not supported between instances of 'list' and 'int'

In [44]:
print(results)

[]


In [52]:
devices[314]['specifications']['Battery'][0]

{'name': 'Type', 'value': 'Removable Li-Ion 680 mAh battery'}