In [None]:
pip install spacy nltk
python -m spacy download en_core_web_sm

python -c "import nltk; nltk.download('stopwords')"

In [9]:
import hashlib
import spacy
from nltk.corpus import stopwords
import re

# Load English NLP model
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))

# Synonym mapping (extendable)
synonym_map = {
    "show": "get",
    "list": "get",
    "who are": "get",
    "selling more": "top",
    "performing well": "top",
    "ten": "10",
    "best": "top",
    "give": "get"
}



def normalize_query(user_query):
    """Normalize user queries to generate a unique cache key."""
    
    # Convert to lowercase
    query = user_query.lower()
    
    # Apply synonym mapping
    for key, value in synonym_map.items():
        query = query.replace(key, value)

    # Remove special characters and extra spaces
    query = re.sub(r"[^\w\s]", "", query).strip()

    # Tokenize and lemmatize
    doc = nlp(query)
    keywords = [token.lemma_ for token in doc if token.text not in stop_words and token.is_alpha]

    # Sort keywords to maintain consistency
    normalized = "_".join(sorted(set(keywords)))

    # Generate a unique cache key (hashing ensures consistent lookup)
    cache_key = hashlib.md5(normalized.encode()).hexdigest()

    return cache_key, normalized

# Example Queries
queries = [
    "Give me top 10 retailers",
    "show me top 10 retailers",
    "list top 10 retailers",
    "who are the 10 retailers selling more",
    "10 retailers in top list",
    "ten retailers performing well",
    "best 10 sellers in the market"
]

# Normalize all queries
for q in queries:
    print(f"Original: {q} \nNormalized Key: {normalize_query(q)}\n")


Original: Give me top 10 retailers 
Normalized Key: ('f3a71e3238cac77be6040ad8267f8630', 'get_retailer_top')

Original: show me top 10 retailers 
Normalized Key: ('f3a71e3238cac77be6040ad8267f8630', 'get_retailer_top')

Original: list top 10 retailers 
Normalized Key: ('f3a71e3238cac77be6040ad8267f8630', 'get_retailer_top')

Original: who are the 10 retailers selling more 
Normalized Key: ('f3a71e3238cac77be6040ad8267f8630', 'get_retailer_top')

Original: 10 retailers in top list 
Normalized Key: ('f3a71e3238cac77be6040ad8267f8630', 'get_retailer_top')

Original: ten retailers performing well 
Normalized Key: ('100ff4454fff20c53112c4195d8aa809', 'retailer_top')

Original: best 10 sellers in the market 
Normalized Key: ('7c155639f72671438f9ed7a7878bd0ce', 'market_seller_top')



In [8]:
import hashlib
import spacy
from nltk.corpus import stopwords
import re

# Load English NLP model
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))

# Synonym mapping (extendable)
synonym_map = {
    "show": "get",
    "list": "get",
    "who are": "get",
    "selling more": "top",
    "performing well": "top",
    "ten": "10",
    "best": "top",
}

def normalize_query(user_query):
    """Normalize user queries while keeping numerics for caching."""
    
    # Convert to lowercase
    query = user_query.lower()
    
    # Apply synonym mapping
    for key, value in synonym_map.items():
        query = query.replace(key, value)

    # Extract numbers separately
    numbers = re.findall(r'\d+', query)  # Finds all numeric values
    
    # Remove special characters except numbers
    query = re.sub(r"[^\w\s]", "", query).strip()

    # Tokenize and lemmatize
    doc = nlp(query)
    keywords = [token.lemma_ for token in doc if token.text not in stop_words and (token.is_alpha or token.is_digit)]

    # Combine keywords and numbers
    keywords += numbers  # Ensure numbers are included in the key

    # Sort keywords to maintain consistency
    normalized = "_".join(sorted(set(keywords)))

    # Generate a unique cache key (hashing ensures consistent lookup)
    cache_key = hashlib.md5(normalized.encode()).hexdigest()

    return cache_key, normalized

# Example Queries
queries = [
    "Give me top 10 retailers",
    "show me top 10 retailers",
    "list top 10 retailers",
    "who are the 10 retailers selling more",
    "10 retailers in top list",
    "ten retailers performing well",
    "best 5 sellers in the market",
    "show top 20 products",
]

# Normalize all queries
for q in queries:
    print(f"Original: {q} \nNormalized Key: {normalize_query(q)}\n")


Original: Give me top 10 retailers 
Normalized Key: ('f6203084ae20217a004ab068b08b25aa', '10_give_retailer_top')

Original: show me top 10 retailers 
Normalized Key: ('da6fa66d4c05a94c79031eca0439ed24', '10_get_retailer_top')

Original: list top 10 retailers 
Normalized Key: ('da6fa66d4c05a94c79031eca0439ed24', '10_get_retailer_top')

Original: who are the 10 retailers selling more 
Normalized Key: ('da6fa66d4c05a94c79031eca0439ed24', '10_get_retailer_top')

Original: 10 retailers in top list 
Normalized Key: ('da6fa66d4c05a94c79031eca0439ed24', '10_get_retailer_top')

Original: ten retailers performing well 
Normalized Key: ('dcc1d7d8a4e2cedb43d11b51447369bf', '10_retailer_top')

Original: best 5 sellers in the market 
Normalized Key: ('6ad3bd4943ed3bf4f02496b04d694cee', '5_market_seller_top')

Original: show top 20 products 
Normalized Key: ('7e32b3d15c33c850202a3e46176e0eba', '20_get_product_top')



In [10]:
import spacy
import re
import hashlib
from nltk.corpus import stopwords

# Load NLP model
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))

# Synonym mapping (extendable)
synonym_map = {
    "show": "give",
    "list": "give",
    "who are": "give",
    "selling more": "top",
    "performing well": "top",
    "ten": "10",
    "best": "top",
}

def normalize_query(user_query):
    """Normalize user queries while preserving order for LLM SQL generation."""
    
    # Convert to lowercase
    query = user_query.lower()

    # Apply synonym mapping
    for key, value in synonym_map.items():
        query = query.replace(key, value)

    # Extract numbers separately
    numbers = re.findall(r'\d+', query)  # Finds all numeric values

    # Tokenize and lemmatize
    doc = nlp(query)
    keywords = [
        token.lemma_ for token in doc 
        if token.text not in stop_words or token.text in ["give", "top"]  # Keep essential words
    ]

    # Merge back numbers in the same position
    final_query = []
    num_index = 0

    for word in keywords:
        if word.isdigit():
            final_query.append(numbers[num_index])  # Keep original numeric position
            num_index += 1
        else:
            final_query.append(word)

    # Join words in proper order
    normalized = " ".join(final_query)

    # Generate a unique cache key
    cache_key = hashlib.md5(normalized.encode()).hexdigest()

    return cache_key, normalized

# Example Queries
queries = [
    "Give me top 10 retailers",
    "show me top 10 retailers",
    "list top 10 retailers",
    "who are the 10 retailers selling more",
    "10 retailers in top list",
    "ten retailers performing well",
    "best 5 sellers in the market",
    "show top 20 products",
]

# Normalize all queries
for q in queries:
    print(f"Original: {q} \nNormalized Query: {normalize_query(q)[1]}\n")


Original: Give me top 10 retailers 
Normalized Query: give top 10 retailer

Original: show me top 10 retailers 
Normalized Query: give top 10 retailer

Original: list top 10 retailers 
Normalized Query: give top 10 retailer

Original: who are the 10 retailers selling more 
Normalized Query: give 10 retailer top

Original: 10 retailers in top list 
Normalized Query: 10 retailer top give

Original: ten retailers performing well 
Normalized Query: 10 retailer top

Original: best 5 sellers in the market 
Normalized Query: top 5 seller market

Original: show top 20 products 
Normalized Query: give top 20 product

