In [3]:
from datasets import load_dataset

categories_dataset = load_dataset("csv",data_files="Categories_FikseHub.csv",split="train")

print(categories_dataset)

Dataset({
    features: ['Type of Repairer', 'Type of category', 'Type of garment in category', 'Service', 'Description', 'Price', 'Estimated time in hours'],
    num_rows: 257
})


In [4]:
from transformers import AutoTokenizer, AutoModel

model_checkpoint= "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModel.from_pretrained(model_checkpoint)

In [5]:
def concatenate_text(examples):
    # Join all relevant columns into a single text string per example (batched)
    texts = []
    for i in range(len(examples["Type of Repairer"])):
        text = (
            str(examples["Type of Repairer"][i]) + "\n"
            + str(examples["Type of category"][i]) + "\n"
            + str(examples["Type of garment in category"][i]) + "\n"
            + str(examples["Service"][i]) + "\n"
            + str(examples["Description"][i]) + "\n"
            + str(examples["Price"][i]) + "\n"
            + str(examples["Estimated time in hours"][i])
        )
        texts.append(text)
    return {"text": texts}

searching_dataset = categories_dataset.map(concatenate_text, batched=True)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

def embed_batch(batch):
    embeddings_tensor = get_embeddings(batch["text"])
    embeddings_np = embeddings_tensor.detach().cpu().numpy()
    return {"embeddings": embeddings_np}

searching_dataset = searching_dataset.map(embed_batch, batched=True, batch_size=16)


Map:   0%|          | 0/257 [00:00<?, ? examples/s]

Map:   0%|          | 0/257 [00:00<?, ? examples/s]

In [6]:
print(searching_dataset)

Dataset({
    features: ['Type of Repairer', 'Type of category', 'Type of garment in category', 'Service', 'Description', 'Price', 'Estimated time in hours', 'text', 'embeddings'],
    num_rows: 257
})


In [7]:
searching_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Type of Repairer', 'Type of category', 'Type of garment in category', 'Service', 'Description', 'Price', 'Estimated time in hours', 'text', 'embeddings'],
    num_rows: 257
})

In [10]:
import pandas as pd
def search_categories(query, dataset, k=5):
    
    query_embedding = get_embeddings([query]).detach().cpu().numpy()
    
    scores, samples = searching_dataset.get_nearest_examples("embeddings", query_embedding, k=k)
    
    df = pd.DataFrame.from_dict(samples).assign(scores=scores)
    df["scores"] = scores
    columns_to_show = [
        "Type of Repairer",
        "Type of category",
        "Type of garment in category",
        "Service",
        "Description",
        "Price",
        "Estimated time in hours",
        "scores"
    ]
    
    # Return the sorted results
    return df[columns_to_show].sort_values("scores", ascending=False)

In [16]:
# Usage example:
results_df = search_categories("Only fabric", searching_dataset)
print(results_df)

  Type of Repairer Type of category Type of garment in category  \
4           Tailor          Clothes                      Skirts   
3           Tailor          Clothes                         Top   
2           Tailor          Clothes                     Dresses   
1           Tailor          Clothes                     Dresses   
0           Tailor          Clothes                      Jacket   

             Service                             Description  Price  \
4    Take out - Wais  Single layer with only existing fabric  499.0   
3           Take out                    Only existing fabric  349.0   
2   Take in - Sleeve                    Only existing fabric  369.0   
1  Take out - Sleeve                    Only existing fabric  369.0   
0           Take out                    Only existing fabric  699.0   

  Estimated time in hours     scores  
4                    None  30.558393  
3                    None  30.401035  
2                    None  29.805447  
1             