In [1]:
!pip install  sentence_transformers elasticsearch==9.0.0 emoji



In [1]:
import os
from datetime import datetime

import pandas as pd
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Initialize Elasticsearch
# The default host is an ngrok tunnel address (presumably temporary -- verify it is
# still live). Set the ES_URL environment variable to point the notebook at a
# different cluster without editing this cell.
es = Elasticsearch(
    os.environ.get(
        "ES_URL",
        "https://faca-2402-800-6371-866a-b9d8-23af-c1d6-dde9.ngrok-free.app",
    )
)
# Sentence-embedding model used by encode_query to vectorize search text.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [15]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import json
# Name of the Elasticsearch index that the search helpers in this notebook query.
index_name = "games"

# --- Vector Encoder ---
def encode_query(text):
    """Embed ``text`` with the module-level ``model`` and return it as a plain list."""
    embedding = model.encode(text)
    return embedding.tolist()

# --- Search Functions ---
def search_bm25(query_text, size=100, filtered_ids=None):
    """Run a full-text ``match`` query against ``detailed_description``.

    Args:
        query_text: Text to match.
        size: Maximum number of hits requested.
        filtered_ids: Optional iterable of document IDs; when non-empty the
            query is restricted to those documents through an ``ids`` filter.

    Returns:
        dict mapping each hit's ``_id`` to its ``_score``.
    """
    bool_clause = {"must": [{"match": {"detailed_description": query_text}}]}
    if filtered_ids:
        bool_clause["filter"] = {"ids": {"values": list(filtered_ids)}}

    request = {"size": size, "query": {"bool": bool_clause}}
    response = es.search(index=index_name, body=request)

    scores = {}
    for hit in response["hits"]["hits"]:
        scores[hit["_id"]] = hit["_score"]
    return scores

def search_vector(query_vector, k=100, candidates=200, filtered_ids=None):
    """Run an approximate kNN search on the ``description_vector`` field.

    Args:
        query_vector: Query embedding as a list of floats.
        k: Number of nearest neighbours to return when no ID filter is given.
        candidates: ``num_candidates`` considered per shard.
        filtered_ids: Optional iterable of document IDs. When non-empty, the
            search is restricted to these documents with the kNN ``filter``
            option, and up to ``min(len(filtered_ids), candidates)`` neighbours
            are requested (``k`` is not used in this mode, as before).

    Returns:
        dict mapping each hit's ``_id`` to its ``_score``.
    """
    knn_clause = {
        "field": "description_vector",
        "query_vector": query_vector,
        "k": k,
        "num_candidates": candidates,
    }

    if filtered_ids:
        allowed_ids = list(filtered_ids)
        # Restrict the neighbour search itself instead of post-filtering a
        # global top-N, which drops allowed documents that are not among the
        # globally closest vectors.
        knn_clause["filter"] = {"ids": {"values": allowed_ids}}
        # k may not exceed num_candidates.
        knn_clause["k"] = min(len(allowed_ids), candidates)

    # Without an explicit size the search response is capped at the default
    # page size (10 hits), regardless of k.
    vector_query = {"knn": knn_clause, "size": knn_clause["k"]}

    res = es.search(index=index_name, body=vector_query)
    return {hit["_id"]: hit["_score"] for hit in res["hits"]["hits"]}
# --- Normalize Scores ---
def normalize(scores_dict):
    """Rescale the values of ``scores_dict`` with ``MinMaxScaler``.

    Returns a new dict with the same keys in the same order; an empty or
    missing input yields an empty dict.
    """
    if not scores_dict:
        return {}

    doc_ids, raw_scores = zip(*scores_dict.items())
    # MinMaxScaler expects a 2-D column, hence the reshape.
    column = np.array(list(raw_scores)).reshape(-1, 1)
    scaled = MinMaxScaler().fit_transform(column).flatten()
    return {doc_id: value for doc_id, value in zip(doc_ids, scaled)}

def combine_scores(bm25_scores, vec_scores, weight_bm25=0.7, weight_vec=0.3):
    """Blend normalized BM25 and vector scores into one weighted score per ID.

    Both inputs are passed through ``normalize`` first. An ID missing from one
    side contributes 0 for that side. Returns an empty dict when neither side
    has any scores.
    """
    lexical = normalize(bm25_scores)
    semantic = normalize(vec_scores)

    return {
        doc_id: weight_bm25 * lexical.get(doc_id, 0) + weight_vec * semantic.get(doc_id, 0)
        for doc_id in set(lexical).union(semantic)
    }

# --- Final Hybrid Search ---
import json

def _doc_id_sort_key(doc_id):
    """Tie-break key: numeric IDs in numeric order, any non-numeric IDs after them."""
    try:
        return (0, int(doc_id), "")
    except (TypeError, ValueError):
        return (1, 0, str(doc_id))


def hybrid_search(query_text, top_k=10, filtered_ids=None):
    """Rank documents by the better of their BM25 and vector scores.

    Args:
        query_text: Free-text description to search for. Empty input returns
            an empty JSON list.
        top_k: Maximum number of results to return.
        filtered_ids: Optional iterable of document IDs to rank. ``None`` means
            "no filter": every document returned by either search is ranked.
            An empty iterable means the filter matched nothing, so the result
            is empty and no search is issued.

    Returns:
        JSON string holding a list of result objects. Each object contains
        ``rank``, ``score`` (rounded to 2 decimals) and the document source
        without the ``cluster`` and ``description_vector`` fields. Documents
        that cannot be fetched are reported on stdout and skipped.
    """
    if not query_text:
        return json.dumps([], ensure_ascii=False)

    print(f"\nSearching for: {query_text}")
    print(f"Filtered IDs: {filtered_ids}")

    if filtered_ids is not None:
        filtered_ids = list(filtered_ids)
        if not filtered_ids:
            # Nothing passed the filter; the search helpers would treat an
            # empty list as "unfiltered", so stop here instead.
            return json.dumps([], indent=2, ensure_ascii=False)

    query_vector = encode_query(query_text)
    bm25_scores = search_bm25(query_text, filtered_ids=filtered_ids)
    vec_scores = search_vector(query_vector, filtered_ids=filtered_ids)

    if filtered_ids is None:
        # No filter: rank everything either search returned (order-preserving union).
        candidate_ids = list(dict.fromkeys([*bm25_scores, *vec_scores]))
    else:
        candidate_ids = filtered_ids

    # Lenient combination: a document qualifies if either search scored it,
    # and it keeps the better of the two scores.
    combined = {}
    for _id in candidate_ids:
        bm25_score = bm25_scores.get(_id, 0)
        vec_score = vec_scores.get(_id, 0)
        if bm25_score > 0 or vec_score > 0:
            combined[_id] = max(bm25_score, vec_score)

    # Not enough scored results to fill the page: pad with the remaining
    # candidates at score zero.
    if len(combined) < top_k:
        for _id in candidate_ids:
            combined.setdefault(_id, 0.0)

    # Sort by score (descending), then by ID (ascending).
    top_results = sorted(
        combined.items(),
        key=lambda item: (-item[1], _doc_id_sort_key(item[0])),
    )[:top_k]

    print("\nTop Results:")
    for _id, score in top_results:
        print(f"ID: {_id}, Final Score: {score}")

    results = []
    for rank, (_id, score) in enumerate(top_results, start=1):
        try:
            doc = es.get(index=index_name, id=_id)
            source = doc["_source"]

            filtered_source = {
                k: v for k, v in source.items()
                if k not in ["cluster", "description_vector"]
            }

            results.append({
                "rank": rank,
                "score": round(score, 2),
                **filtered_source
            })
        except Exception as e:
            # Best effort: one missing document should not discard the whole page.
            print(f"Error retrieving document {_id}: {str(e)}")
            continue

    return json.dumps(results, indent=2, ensure_ascii=False)
# --- Direct Search by Name ---
def direct_elastic(name):
    """Look up games by name with a fuzzy ``match`` query.

    Args:
        name: Game name to search for. Empty or ``None`` returns an empty JSON
            list without contacting Elasticsearch.

    Returns:
        JSON string holding a list of up to 10 document sources (without the
        ``cluster`` and ``description_vector`` fields), each extended with a
        ``relevance_score`` rounded to 2 decimals.
    """
    if not name:
        # A match query without query text is rejected by the server.
        return json.dumps([], ensure_ascii=False)

    # Using match query with fuzziness for more flexible name matching
    query = {
        "query": {
            "match": {
                "name": {
                    "query": name,
                    "fuzziness": "AUTO",  # Automatically adjust fuzziness based on term length
                    "prefix_length": 2,    # First 2 characters must match exactly
                    "operator": "or"       # Match any of the terms
                }
            }
        },
        "size": 10  # Limit results to top 10 matches
    }

    res = es.search(index=index_name, body=query)

    results = []
    for hit in res["hits"]["hits"]:
        filtered_source = {
            k: v
            for k, v in hit["_source"].items()
            if k not in ["cluster", "description_vector"]
        }
        # Add relevance score to the result
        filtered_source["relevance_score"] = round(hit["_score"], 2)
        results.append(filtered_source)

    return json.dumps(results, ensure_ascii=False)


# --- Optional Filter Function ---
def _fuzzy_match_clause(field, text):
    """Build a fuzzy ``match`` clause (AUTO fuzziness, 2-char exact prefix, OR operator)."""
    return {
        "match": {
            field: {
                "query": text,
                "fuzziness": "AUTO",
                "prefix_length": 2,
                "operator": "or"
            }
        }
    }


def _to_float(value):
    """Return ``float(value)``, or ``None`` when the value cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _parse_year_range(year_range):
    """Parse ``"YYYY-YYYY"`` or a single year into ``(start, end)``; ``None`` if unparseable."""
    if isinstance(year_range, str) and "-" in year_range:
        try:
            # Raises ValueError unless there are exactly two integer parts.
            start_year, end_year = (int(part) for part in year_range.split("-"))
            return min(start_year, end_year), max(start_year, end_year)
        except ValueError:
            pass  # fall through and try the value as a single year
    try:
        year = int(year_range)
    except (TypeError, ValueError):
        return None
    return year, year


def filter_elastic(filters):
    """Translate structured filters into a bool query and return matching IDs.

    Recognised keys in ``filters`` (each ignored when missing or falsy):
    ``age_limit``, ``genre``, ``category``, ``year_range``, ``developer``,
    ``publisher``, ``platform``, ``currency``, ``price_limit``, ``language``.
    Values that cannot be parsed (non-numeric age/price, malformed year range)
    are reported on stdout and skipped rather than aborting the search.

    Args:
        filters: Mapping of filter names to values.

    Returns:
        List of ``_id`` strings for up to 1000 matching documents.
    """
    must = []

    # Age Limit Filter
    # NOTE(review): "limit" is applied as a lower bound (gte) -- confirm intent.
    if filters.get("age_limit"):
        age_limit = _to_float(filters["age_limit"])
        if age_limit is None:
            print(f"Ignoring unparseable age_limit: {filters['age_limit']!r}")
        else:
            must.append({"range": {"required_age": {"gte": age_limit}}})

    # Genre / Category Filters - fuzzy match
    if filters.get("genre"):
        must.append(_fuzzy_match_clause("genres", filters["genre"]))
    if filters.get("category"):
        must.append(_fuzzy_match_clause("categories", filters["category"]))

    # Year Range Filter
    if filters.get("year_range"):
        years = _parse_year_range(filters["year_range"])
        if years is None:
            print(f"Ignoring unparseable year_range: {filters['year_range']!r}")
        else:
            start_year, end_year = years
            must.append({
                "range": {
                    "release_date": {
                        "gte": f"{start_year}-01-01",
                        "lte": f"{end_year}-12-31"
                    }
                }
            })

    # Developer / Publisher Filters - exact match on the .keyword subfield
    if filters.get("developer"):
        must.append({"term": {"developers.keyword": filters["developer"]}})
    if filters.get("publisher"):
        must.append({"term": {"publishers.keyword": filters["publisher"]}})

    # Platform Filter - one boolean field per platform, e.g. platforms_windows
    if filters.get("platform"):
        platform_field = f"platforms_{filters['platform'].lower()}"
        must.append({"term": {platform_field: True}})

    # Currency Filter - exact match
    if filters.get("currency"):
        must.append({"term": {"price_currency.keyword": filters["currency"]}})

    # Price Limit
    # NOTE(review): applied as a lower bound (gte) like age_limit -- confirm intent.
    if filters.get("price_limit"):
        price = _to_float(filters["price_limit"])
        if price is None:
            print(f"Ignoring unparseable price_limit: {filters['price_limit']!r}")
        else:
            must.append({"range": {"price_final": {"gte": price}}})

    # Language Filter - comma-separated list, exact match on any listed language
    if filters.get("language"):
        languages = [lang.strip() for lang in filters["language"].split(",")]
        languages = [lang for lang in languages if lang]  # drop blanks from stray commas
        field_name = "supported_languages.keyword"  # Always target keyword subfield

        if len(languages) == 1:
            must.append({"term": {field_name: languages[0]}})
        elif languages:
            must.append({"terms": {field_name: languages}})

    query = {"query": {"bool": {"must": must}}, "size": 1000}
    print(json.dumps(query, indent=2))
    res = es.search(index=index_name, body=query)

    return [hit["_id"] for hit in res["hits"]["hits"]]


def get_games(results):
    """Dispatch a parsed function-call dict to the matching search routine.

    ``results["function_name"]`` selects the path:

    * ``"direct_search"`` -- fuzzy name lookup on ``results["game_name"]``.
    * ``"filter_search"`` -- structured filtering; when ``game_description``
      is present the filtered IDs are ranked with ``hybrid_search``,
      otherwise every filtered document is fetched and returned as JSON.
    * anything else -- ``results.get("response")`` is returned unchanged.
    """
    func_name = results.get("function_name")

    if func_name == "direct_search":
        return direct_elastic(results.get("game_name"))

    if func_name != "filter_search":
        return results.get("response")

    matching_ids = filter_elastic(results)
    print(matching_ids)

    description = results.get("game_description")
    if description:
        # A description is available: rank the filtered candidates against it.
        return hybrid_search(description, filtered_ids=matching_ids)

    # No description: return every filtered document, minus internal fields.
    hidden_fields = ["cluster", "description_vector"]
    documents = []
    for doc_id in matching_ids:
        source = es.get(index=index_name, id=doc_id)["_source"]
        documents.append(
            {field: value for field, value in source.items() if field not in hidden_fields}
        )

    return json.dumps(documents, indent=2, ensure_ascii=False)

In [4]:
!pip install dotenv google



In [3]:
from LLM_Response import LLMResponse
from Hybrid_Search import HybridSearch
from google import genai
import os
from dotenv import load_dotenv
# Load variables from the local env file so the Gemini key is not written in the notebook.
load_dotenv('api.env')
api_key = os.getenv('gemini_api_key')

# The same key is handed to the genai client and to the LLMResponse wrapper.
genai_client = genai.Client(api_key=api_key)
llm = LLMResponse(api_key)

In [17]:
# Ask the LLM wrapper to turn a free-text request into a function-call dict.
query_text = "i want to play a racing game with combat and skills age above 13"
result = llm.get_function_call(query_text)
print("Filter Result:", result)

Function to call: filter_search
Arguments: {'age_limit': '13', 'genre': 'Racing', 'game_description': 'combat and skills'}
Filter Result: {'function_name': 'filter_search', 'game_description': 'combat and skills', 'developer': None, 'publisher': None, 'year_range': None, 'price_limit': None, 'genre': 'Racing', 'category': None, 'language': None, 'age_limit': '13', 'platform': None, 'currency': None}


In [18]:
# Run the search selected by the function-call dict and show the JSON it returns.
print(result)
game = get_games(result)
print(game)

{'function_name': 'filter_search', 'game_description': 'combat and skills', 'developer': None, 'publisher': None, 'year_range': None, 'price_limit': None, 'genre': 'Racing', 'category': None, 'language': None, 'age_limit': '13', 'platform': None, 'currency': None}
{
  "query": {
    "bool": {
      "must": [
        {
          "range": {
            "required_age": {
              "gte": 13.0
            }
          }
        },
        {
          "match": {
            "genres": {
              "query": "Racing",
              "fuzziness": "AUTO",
              "prefix_length": 2,
              "operator": "or"
            }
          }
        }
      ]
    }
  },
  "size": 1000
}
['216', '5123']

Searching for: combat and skills
Filtered IDs: ['216', '5123']

Top Results:
ID: 5123, Final Score: 0.17939782
ID: 216, Final Score: 0.16563582
[
  {
    "rank": 1,
    "score": 0.18,
    "app_id": 3240220,
    "name": "Grand Theft Auto V Enhanced",
    "type": "game",
    "required_age":