In [None]:
!pip install pandas datasets sentence-transformers rapidfuzz


Collecting rapidfuzz
  Downloading rapidfuzz-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 22.7 MB/s eta 0:00:00
Installing collected packages: rapidfuzz
Successfully installed rapidfuzz-3.9.5


In [None]:
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from rapidfuzz import process

# Load the dataset from the Hugging Face Hub
dataset = load_dataset("WhiteAngelss/magaza-urun-listesi-with-links")

# Convert the train split to a pandas DataFrame and split the single
# semicolon-delimited column into its three fields.
# n=2 caps the split at three parts, so a ';' occurring inside the last field
# (the link) cannot produce extra columns and break the 3-column assignment.
df = pd.DataFrame(dataset['train'])
df[['Mağaza', 'Ürün', 'Link']] = df['Mağaza;Ürün;Link'].str.split(';', n=2, expand=True)

# Show the first few rows of the dataset
print(df.head())

# Product names, in row order
product_list = df['Ürün'].tolist()

# Load the SentenceTransformer model
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Compute one embedding per entry of product_list (same order as the list)
product_embeddings = model.encode(product_list, convert_to_tensor=True)


                                   Mağaza;Ürün;Link      Mağaza        Ürün  \
0                     Nike;Top;https://www.nike.com        Nike         Top   
1                Nike;Ayakkabı;https://www.nike.com        Nike    Ayakkabı   
2                   Nike;Forma;https://www.nike.com        Nike       Forma   
3               Nike;Bilgi Yok;https://www.nike.com        Nike   Bilgi Yok   
4  MediaMarkt;Televizyon;https://www.mediamarkt.com  MediaMarkt  Televizyon   

                         Link  
0        https://www.nike.com  
1        https://www.nike.com  
2        https://www.nike.com  
3        https://www.nike.com  
4  https://www.mediamarkt.com  


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [None]:
def correct_product_combined(user_input, product_list, product_embeddings, model, top_n=5):
    """Map a user-typed product name to the closest entry of ``product_list``.

    Two-stage lookup: rapidfuzz first shortlists up to ``top_n`` candidates by
    string similarity, then the candidate whose embedding has the highest
    cosine similarity to the embedding of ``user_input`` is returned.

    Args:
        user_input: Text typed by the user.
        product_list: Candidate product names.
        product_embeddings: Precomputed embeddings of ``product_list`` in the
            same order. They are reused when ``product_list`` is a list/tuple
            and the embeddings are a tensor with one row per product;
            otherwise (e.g. ``None`` or a length mismatch) the shortlist is
            encoded on the fly with ``model``.
        model: Object exposing ``encode(..., convert_to_tensor=True)``.
        top_n: Maximum number of fuzzy-match candidates to re-rank.

    Returns:
        The selected product name, or ``None`` when there is nothing to choose
        from (empty ``product_list`` or an empty shortlist).
    """
    # Nothing to match against: bail out instead of failing later on an
    # argmax over an empty similarity matrix.
    if hasattr(product_list, "__len__") and len(product_list) == 0:
        return None

    # Stage 1: fuzzy shortlist. Each match is (choice, score, index_or_key).
    # No scorer is passed, so rapidfuzz's default scorer is used (WRatio per
    # the rapidfuzz docs), not a plain Levenshtein distance.
    best_matches = process.extract(user_input, product_list, limit=top_n)
    if not best_matches:
        # Covers un-sized iterables that turned out empty and top_n == 0.
        return None

    top_n_products = [match[0] for match in best_matches]

    # Embed the user's input
    user_input_embedding = model.encode(user_input, convert_to_tensor=True)

    # Stage 2 input: candidate embeddings. Reuse the precomputed rows when
    # they can be trusted to align with product_list by position; for
    # non-sequence inputs the third tuple element is a key, not a row index.
    can_reuse = (
        isinstance(product_list, (list, tuple))
        and product_embeddings is not None
        and hasattr(product_embeddings, "to")
        and len(product_embeddings) == len(product_list)
    )
    if can_reuse:
        row_ids = [match[2] for match in best_matches]
        # Keep both operands of the cosine similarity on the same device.
        top_n_embeddings = product_embeddings[row_ids].to(user_input_embedding.device)
    else:
        top_n_embeddings = model.encode(top_n_products, convert_to_tensor=True)

    # Cosine similarity between the input and each shortlisted candidate
    similarities = util.pytorch_cos_sim(user_input_embedding, top_n_embeddings)

    # Position (within the shortlist) of the most similar candidate
    best_match_idx = similarities.argmax().item()

    return top_n_products[best_match_idx]

# Example usage
user_input = "tp"
corrected_product = correct_product_combined(
    user_input,
    product_list,
    product_embeddings,
    model,
)
print(f"Kullanıcının girdisi: {user_input}, Düzeltilmiş hali: {corrected_product}")


Kullanıcının girdisi: tp, Düzeltilmiş hali: Tablet
