<a href="https://colab.research.google.com/github/SIDDHESH-8213/U-ASK-Winter-25/blob/main/U_ASK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from collections import defaultdict
from math import sqrt

import numpy as np
import pandas as pd
from rtree import index


In [None]:
def load_data_from_folder(folder_path):
    """Load spatio-textual objects from every ``.txt`` file under a folder.

    Each non-blank line is split on whitespace and read as
    ``<id> <lat> <lon> <n> <kw_1> <w_1> ... <kw_n> <w_n>``. Only the keyword
    tokens are kept; the token following each keyword (its weight) is skipped.

    Args:
        folder_path: Root directory. Sub-directories are searched recursively.

    Returns:
        A list of ``(obj_id, lat, lon, keywords)`` tuples, where ``obj_id`` is
        an int, ``lat``/``lon`` are floats and ``keywords`` is a list of
        strings. The list is empty when no ``.txt`` file is found.

    Raises:
        ValueError: If a non-blank line cannot be parsed. The message names
            the offending file and line number.
    """
    dataset = []

    for root, dirs, files in os.walk(folder_path):
        # Sorting `dirs` in place makes os.walk descend in a reproducible
        # order, so the dataset order does not depend on the filesystem.
        dirs.sort()

        for file in sorted(files):
            if not file.endswith(".txt"):  # Only .txt files hold records
                continue
            file_path = os.path.join(root, file)

            # Explicit encoding: do not depend on the platform's locale default.
            with open(file_path, "r", encoding="utf-8") as f:
                for line_no, line in enumerate(f, start=1):
                    parts = line.split()
                    if not parts:
                        # Blank / whitespace-only line (e.g. trailing newline).
                        continue

                    try:
                        obj_id = int(parts[0])
                        lat, lon = float(parts[1]), float(parts[2])
                        keyword_count = int(parts[3])
                        # Keywords sit at every other position from index 4;
                        # the positions in between are their weights.
                        keywords = [
                            parts[i] for i in range(4, 4 + keyword_count * 2, 2)
                        ]
                    except (IndexError, ValueError) as exc:
                        raise ValueError(
                            f"{file_path}:{line_no}: malformed record: {line.strip()!r}"
                        ) from exc

                    dataset.append((obj_id, lat, lon, keywords))

    return dataset

# Folder holding the uploaded dataset in Colab; adjust to your actual path.
dataset_folder = "/content/dataset_folder"

# Parse every record found under that folder.
data = load_data_from_folder(dataset_folder)

# Tabular view of the parsed records, used here only for inspection.
df = pd.DataFrame(
    data=data,
    columns=["Object ID", "Latitude", "Longitude", "Keywords"],
)

# Show the first five rows.
print(df.head(5))


Sample Data: [(100000, 43.03428148, -76.01928945, ['red', '2', 'gourmet', 'burgers']), (100001, 28.74179078, -81.33426307, ['damn']), (100002, 38.52931226, -121.43124996, ['didn', 'score', 'give', 'put']), (100003, -6.19467, 106.82103, ['pinky']), (100004, -30.0997213, -51.1593094, ['neon', 'night', 'wanna', 'party', 'lights'])]


In [None]:
# Build the R-tree: every object is inserted under its ID as the 4-tuple
# (lon, lat, lon, lat), i.e. a box collapsed to a single point.
spatial_idx = index.Index()
for obj_id, lat, lon, keywords in data:
    point_box = (lon, lat, lon, lat)
    spatial_idx.insert(obj_id, point_box)

# Smoke test: ask the index for the single nearest entry to lon=-118.25, lat=34.0.
probe_box = (-118.25, 34.0, -118.25, 34.0)
nearest = list(spatial_idx.nearest(probe_box, 1))
print("Nearest location ID now:", nearest)

# Reports the number of loaded records (it is not read back from the index).
print(f"Total locations indexed: {len(data)}")


Nearest location ID now: [136400]
Total locations indexed: 100000


In [None]:
# Inverted index: keyword -> IDs of the objects whose keyword list contains it.
keyword_index = defaultdict(list)

for record in data:
    obj_id, lat, lon, keywords = record
    for keyword in keywords:
        postings = keyword_index[keyword]
        postings.append(obj_id)

# Quick manual check of a single keyword's posting list.
burger_ids = keyword_index["burgers"]
print("Places that serve good burgers:", burger_ids)


Places that serve good burgers: [100000, 107529, 123991, 125641, 130720, 133192, 137851, 141867, 151424, 152299, 155560, 156863, 176072, 176077, 195706]


In [None]:
def distance(lat1, lon1, lat2, lon2):
    """Return the straight-line (Euclidean) distance between two points.

    The coordinates are used as plain planar values, so the result is in the
    same units as the inputs.
    """
    d_lat = lat1 - lat2
    d_lon = lon1 - lon2
    squared_sum = d_lat ** 2 + d_lon ** 2
    return sqrt(squared_sum)


In [None]:
def tkqn_query(lat, lon, positive_keywords, negative_keywords, k=3,
               candidate_factor=5, verbose=True):
    """Return up to ``k`` nearby objects matching a keyword filter.

    The ``k * candidate_factor`` spatially nearest objects are fetched from the
    module-level ``spatial_idx``. A candidate is kept when it has at least one
    of ``positive_keywords`` and none of ``negative_keywords``.

    Only that fixed candidate window is examined, so fewer than ``k`` results
    (possibly none) are returned when the matches lie farther away. Raise
    ``candidate_factor`` to widen the search.

    Args:
        lat: Query latitude.
        lon: Query longitude.
        positive_keywords: Keywords of which a result must contain at least one.
        negative_keywords: Keywords a result must not contain.
        k: Maximum number of results.
        candidate_factor: Multiplier applied to ``k`` to size the candidate
            window (defaults to the previously hard-coded 5).
        verbose: When true (default, the historical behaviour) print a debug
            line per candidate; pass ``False`` to run silently.

    Returns:
        A list of ``(obj_id, distance)`` tuples sorted by ascending distance,
        at most ``k`` long.

    Raises:
        KeyError: If the spatial index returns an ID that is not present in
            ``data`` (index and dataset are out of sync).
    """
    if k <= 0 or candidate_factor <= 0:
        return []

    nearby_ids = list(
        spatial_idx.nearest((lon, lat, lon, lat), k * candidate_factor)
    )

    # Resolve all candidate records in ONE pass over `data` instead of one
    # linear scan per candidate. The first record for an ID wins, matching the
    # previous `next(...)` lookup.
    wanted = set(nearby_ids)
    records = {}
    for record in data:
        rec_id = record[0]
        if rec_id in wanted and rec_id not in records:
            records[rec_id] = record
            if len(records) == len(wanted):
                break

    positive = set(positive_keywords)
    negative = set(negative_keywords)

    results = []
    for obj_id in nearby_ids:
        entry = records.get(obj_id)
        if entry is None:
            raise KeyError(
                f"Object {obj_id} is in the spatial index but not in `data`; "
                "rebuild the index after reloading the dataset."
            )
        obj_lat, obj_lon, obj_keywords = entry[1], entry[2], entry[3]

        if verbose:
            print(f"DEBUG: Checking {obj_id} - Keywords: {obj_keywords}")

        # Reject objects carrying any negative keyword.
        if not negative.isdisjoint(obj_keywords):
            if verbose:
                print(f"❌ Excluded {obj_id} (Contains {negative_keywords})")
            continue

        # Require at least one positive keyword.
        if positive.isdisjoint(obj_keywords):
            if verbose:
                print(f"⚠️ Skipping {obj_id} (Doesn't contain {positive_keywords})")
            continue

        results.append((obj_id, distance(lat, lon, obj_lat, obj_lon)))

    results.sort(key=lambda x: x[1])  # Nearest first
    return results[:k]


In [None]:
# Up to three objects near (34.05, -118.25) tagged "sunset" but not "rooftop".
top_places = tkqn_query(
    lat=34.05,
    lon=-118.25,
    positive_keywords=["sunset"],
    negative_keywords=["rooftop"],
    k=3,
)
print("Top places:", top_places)


DEBUG: Checking 124596 - Keywords: ['rocks', 'tasting', 'downtown', 'wine']
⚠️ Skipping 124596 (Doesn't contain ['sunset'])
DEBUG: Checking 172197 - Keywords: ['releasing', 'heart']
⚠️ Skipping 172197 (Doesn't contain ['sunset'])
DEBUG: Checking 155698 - Keywords: ['running', 'deadly', 'pull', 'cars', 'pedestrians', 'hitting', 'people', 'weapons', 'lights']
⚠️ Skipping 155698 (Doesn't contain ['sunset'])
DEBUG: Checking 141718 - Keywords: ['minds', 'simple', 'people', 'thinking']
⚠️ Skipping 141718 (Doesn't contain ['sunset'])
DEBUG: Checking 191746 - Keywords: ['trait', 'determination', 'resourcefulness']
⚠️ Skipping 191746 (Doesn't contain ['sunset'])
DEBUG: Checking 119664 - Keywords: ['reports', 'meets', 'challenges', 'year', 'spina', 'inspirational', 'girl', 'long']
⚠️ Skipping 119664 (Doesn't contain ['sunset'])
DEBUG: Checking 194619 - Keywords: ['awards', 'event']
⚠️ Skipping 194619 (Doesn't contain ['sunset'])
DEBUG: Checking 195455 - Keywords: ['concert']
⚠️ Skipping 195455 (

In [None]:
def batch_queries(queries, candidate_factor=5):
    """Answer several keyword queries, sharing spatial lookups per location.

    Queries with the same ``(lat, lon)`` are grouped. One nearest-neighbour
    lookup is issued per group against the module-level ``spatial_idx``, sized
    ``max(k) * candidate_factor``. Each query then keeps the candidates that
    have at least one of its positive keywords and none of its negative ones.

    Only that candidate window is examined, so a query may get fewer than
    ``k`` results when its matches lie farther away.

    Args:
        queries: Iterable of ``(lat, lon, pos_keywords, neg_keywords, k)``.
        candidate_factor: Multiplier applied to the largest ``k`` of a group to
            size its candidate window (defaults to the previously hard-coded 5).

    Returns:
        A dict keyed by ``(lat, lon, tuple(pos_keywords), tuple(neg_keywords), k)``
        whose values are lists of ``(obj_id, distance)`` sorted by ascending
        distance, at most ``k`` long. Keys are ordered by location group, not
        by input position; identical queries collapse onto one key.

    Raises:
        KeyError: If the spatial index returns an ID that is not present in
            ``data`` (index and dataset are out of sync).
    """
    # Group queries by location, keeping first-seen order of locations.
    location_groups = {}
    for lat, lon, pos_keywords, neg_keywords, k in queries:
        location_groups.setdefault((lat, lon), []).append(
            (pos_keywords, neg_keywords, k)
        )

    # One spatial lookup per distinct location, sized for its largest k.
    nearby_by_location = {}
    for (lat, lon), grouped in location_groups.items():
        num_candidates = max(k for _, _, k in grouped) * candidate_factor
        if num_candidates > 0:
            nearby_by_location[(lat, lon)] = list(
                spatial_idx.nearest((lon, lat, lon, lat), num_candidates)
            )
        else:
            nearby_by_location[(lat, lon)] = []

    # Resolve every needed record in ONE pass over `data` rather than a linear
    # scan per candidate per query. The first record for an ID wins, matching
    # the previous `next(...)` lookup.
    wanted = set().union(*nearby_by_location.values())
    records = {}
    if wanted:
        for record in data:
            rec_id = record[0]
            if rec_id in wanted and rec_id not in records:
                records[rec_id] = record
                if len(records) == len(wanted):
                    break

    results = {}
    for (lat, lon), grouped in location_groups.items():
        # Distances depend only on the location, so compute and sort them once
        # per group and reuse them for every query in it.
        candidates = []
        for obj_id in nearby_by_location[(lat, lon)]:
            entry = records.get(obj_id)
            if entry is None:
                raise KeyError(
                    f"Object {obj_id} is in the spatial index but not in `data`; "
                    "rebuild the index after reloading the dataset."
                )
            candidates.append(
                (obj_id, entry[3], distance(lat, lon, entry[1], entry[2]))
            )
        candidates.sort(key=lambda c: c[2])

        for pos_keywords, neg_keywords, k in grouped:
            positive = set(pos_keywords)
            negative = set(neg_keywords)
            matches = [
                (obj_id, dist)
                for obj_id, obj_keywords, dist in candidates
                if negative.isdisjoint(obj_keywords)
                and not positive.isdisjoint(obj_keywords)
            ]
            key = (lat, lon, tuple(pos_keywords), tuple(neg_keywords), k)
            # max(k, 0): a non-positive k yields no results instead of slicing
            # from the end of the list.
            results[key] = matches[:max(k, 0)]

    return results


# Demo workload: the first and third queries share a location, so they are
# served from a single spatial lookup.
query_list = [
    (34.05, -118.25, ["sunset"], ["rooftop"], 3),
    (33.8, -118.3, ["downtown"], ["wine"], 3),
    (34.05, -118.25, ["people"], ["thinking"], 3),
]

batch_results = batch_queries(query_list)

# One "Query / Results" pair per answered query, separated by a blank line.
for query in batch_results:
    result = batch_results[query]
    print(f"\nQuery: {query}\nResults: {result}")



Query: (34.05, -118.25, ('sunset',), ('rooftop',), 3)
Results: [(120945, 0.0063323459633270445)]

Query: (34.05, -118.25, ('people',), ('thinking',), 3)
Results: [(155698, 0.0033140921305820623)]

Query: (33.8, -118.3, ('downtown',), ('wine',), 3)
Results: []
