### Yelp Dataset Creation:

In [None]:
#!python -m venv venv

In [1]:
import json
from pathlib import Path

BUSINESS_JSON_PATH = Path("yelp_academic_dataset_business.json")
REVIEW_JSON_PATH = Path("yelp_academic_dataset_review.json")

# Upper bound on the number of business rows scanned for the city distribution.
CITY_SAMPLE_ROWS = 500_000

# --- 1. Check that files exist and show their sizes ---
# Only call .stat() on files that exist; otherwise it raises FileNotFoundError
# before the "exists" report can be printed.
for label, path in (
    ("Business file exists:", BUSINESS_JSON_PATH),
    ("Review file exists:  ", REVIEW_JSON_PATH),
):
    if path.exists():
        print(label, True, "size:", path.stat().st_size, "bytes")
    else:
        print(label, False, "size: n/a (file not found)")

# --- 2. Read and print the first 3 business records ---
print("\nFirst 3 business records:")
with BUSINESS_JSON_PATH.open("r", encoding="utf-8") as f:
    # zip with range() stops after 3 lines, or earlier if the file is shorter
    for i, line in zip(range(3), f):
        data = json.loads(line)
        print(f"\nRecord {i+1}:")
        print("  business_id:", data.get("business_id"))
        print("  name       :", data.get("name"))
        print("  city       :", data.get("city"))
        print("  state      :", data.get("state"))
        print("  stars      :", data.get("stars"))
        print("  review_cnt :", data.get("review_count"))
        print("  categories :", data.get("categories"))

# --- 3. Quickly scan up to CITY_SAMPLE_ROWS businesses to see city distribution ---
from collections import Counter

city_counter = Counter()
with BUSINESS_JSON_PATH.open("r", encoding="utf-8") as f:
    for _, line in zip(range(CITY_SAMPLE_ROWS), f):
        if not line.strip():
            continue  # tolerate blank lines in the JSON-lines file
        data = json.loads(line)
        city = data.get("city")
        if city:
            city_counter[city] += 1

print(f"\nTop 15 cities in first {CITY_SAMPLE_ROWS:,} rows:")
for city, cnt in city_counter.most_common(15):
    print(f"{city:20s} {cnt}")


Business file exists: True size: 118863795 bytes
Review file exists:   True size: 5341868833 bytes

First 3 business records:

Record 1:
  business_id: Pns2l4eNsfO8kk83dixA6A
  name       : Abby Rappoport, LAC, CMQ
  city       : Santa Barbara
  state      : CA
  stars      : 5.0
  review_cnt : 7
  categories : Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists

Record 2:
  business_id: mpf3x-BjTdTEA3yCZrAYPw
  name       : The UPS Store
  city       : Affton
  state      : MO
  stars      : 3.0
  review_cnt : 15
  categories : Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services

Record 3:
  business_id: tUFrWirKiKi_TAnsVWINQQ
  name       : Target
  city       : Tucson
  state      : AZ
  stars      : 3.5
  review_cnt : 22
  categories : Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores

Top 15 cities in first 500,000 rows:
Philadelphia         14569
Tucson              

In [2]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd

# -----------------------------------------------------------
# Config: update these paths to where you've stored the JSONs
# -----------------------------------------------------------
BUSINESS_JSON_PATH = Path("yelp_academic_dataset_business.json")
REVIEW_JSON_PATH = Path("yelp_academic_dataset_review.json")

# Earlier candidate city list, disabled and kept for reference only.
# (Previously stored as a stray triple-quoted string expression.)
# CITIES = {
#     "New York",
#     "Denver",
#     "Miami",
#     "Washington",      # how Yelp often stores DC
#     "Washington, DC",  # just in case
#     "Los Angeles",
# }

# We'll only keep businesses in these cities
CITIES = {
    "New Orleans",
    "Philadelphia",
    "Tucson",
    "Tampa",
    "Boise",
}

# How many businesses per city to keep at most
MAX_BUSINESSES_PER_CITY = 150   # tweak this if you want smaller/bigger

# Minimum quality filters
MIN_STARS = 4.0
MIN_REVIEW_COUNT = 50


def parse_categories(raw_categories):
    """
    Normalize a Yelp ``categories`` value into a list of category names.

    A string is split on commas; any other truthy value is iterated as-is.
    Every item is stringified, stripped of surrounding whitespace and
    lowercased, and items that are empty after stripping are dropped.
    Falsy input (None, "", empty list) yields an empty list.
    """
    if not raw_categories:
        return []

    if isinstance(raw_categories, str):
        pieces = raw_categories.split(",")
    else:
        # Already a list / array
        pieces = raw_categories

    trimmed = (str(piece).strip() for piece in pieces)
    return [name.lower() for name in trimmed if name]


def build_tags5(categories):
    """
    Score a business on five tags: nightlife, adventure, shopping, food, urban.

    Each category adds one point to every tag for which at least one keyword
    occurs in the lowercased category string. If nothing matches, the business
    falls back to a pure "urban" profile. Scores are then divided by their
    total so the returned dict sums to 1.0.

    NOTE(review): matching is by substring (``keyword in category``), so short
    keywords also hit unrelated categories, e.g. "ski" matches "skin care".
    """
    keyword_table = {
        "nightlife": (
            "bars", "nightlife", "pub", "pubs", "lounges", "cocktail",
            "beer bar", "beer garden", "wine bar", "sports bar", "club", "dance club",
        ),
        "adventure": (
            "hiking", "climbing", "outdoor", "outdoors", "rafting", "surf", "ski",
            "snowboard", "biking", "bike", "bikes", "zipline", "rock climbing",
            "water sports",
        ),
        "shopping": (
            "shopping", "fashion", "department store", "mall", "boutiques",
            "thrift", "vintage", "jewelry", "bookstores", "shopping centers",
        ),
        "food": (
            "restaurants", "food", "coffee", "tea", "cafe", "cafes", "bakeries",
            "desserts", "ice cream", "frozen yogurt", "pizza", "sushi", "steakhouse",
            "burgers", "sandwiches", "breakfast & brunch",
        ),
        "urban": (
            "arts & entertainment", "museum", "museums", "landmarks & historical buildings",
            "tours", "local flavor", "parks", "park", "theater", "theatre", "stadium",
            "music venue", "cinema", "art gallery", "galleries", "cultural center",
        ),
    }

    # Insertion order of keyword_table fixes the key order of the result.
    hits = {tag: 0.0 for tag in keyword_table}

    for category in categories:
        lowered = category.lower()
        for tag, keywords in keyword_table.items():
            if any(keyword in lowered for keyword in keywords):
                hits[tag] += 1.0

    total = sum(hits.values())
    if total == 0:
        # fallback: treat as general urban exploration
        hits["urban"] = 1.0
        total = 1.0

    # normalize to sum to 1.0
    return {tag: count / total for tag, count in hits.items()}


def extract_price_level(attributes):
    """
    Extract the price level from a business's ``attributes`` dict.

    Prefers the "RestaurantsPriceRange2" key and falls back to "Price Range"
    when the first is missing or falsy. The value may be an int, a float, or a
    string (optionally wrapped in quotes); integer-valued floats such as 2.0 or
    "2.0" are accepted as well.

    Returns:
        The price level as an int, or None when ``attributes`` is not a dict,
        neither key holds a value, or the value is not an integer number
        (e.g. the string "None").
    """
    if not isinstance(attributes, dict):
        return None

    raw = attributes.get("RestaurantsPriceRange2") or attributes.get("Price Range")
    if raw is None:
        return None

    # Drop whitespace and any surrounding quote characters before parsing.
    cleaned = str(raw).strip().strip("'\"")
    try:
        return int(cleaned)
    except ValueError:
        pass

    # Fall back to float parsing so values like 2.0 / "2.0" are not discarded.
    try:
        as_float = float(cleaned)
    except ValueError:
        return None
    # is_integer() is False for nan/inf, so int() below cannot overflow or fail.
    return int(as_float) if as_float.is_integer() else None


def load_businesses():
    """
    Stream the business JSON and build a compact table for the selected cities.

    A business is kept when its city is in CITIES, it is not marked closed
    (``is_open == 0``), it meets MIN_STARS and MIN_REVIEW_COUNT, it has at
    least one category, and its city has not yet reached
    MAX_BUSINESSES_PER_CITY. Businesses are taken in file order, so the cap
    keeps the first qualifying rows per city rather than the best ones.

    Returns:
        pandas.DataFrame with one row per kept business (empty if none match).
    """
    selected = []
    per_city_counts = defaultdict(int)

    with BUSINESS_JSON_PATH.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)

            city = data.get("city")
            if city not in CITIES:
                continue

            # Limit per city so dataset doesn't explode
            if per_city_counts[city] >= MAX_BUSINESSES_PER_CITY:
                continue

            # Keep only open businesses (optional but sensible)
            if data.get("is_open", 1) == 0:
                continue

            # `or` also covers explicit JSON nulls, which would otherwise make
            # the comparisons below raise TypeError.
            stars = data.get("stars") or 0.0
            review_count = data.get("review_count") or 0

            # High quality only
            if stars < MIN_STARS or review_count < MIN_REVIEW_COUNT:
                continue

            categories = parse_categories(data.get("categories"))
            if not categories:
                continue

            record = {
                "business_id": data.get("business_id"),
                "name": data.get("name"),
                "city": city,
                "state": data.get("state"),
                "latitude": data.get("latitude"),
                "longitude": data.get("longitude"),
                "stars": stars,
                "review_count": review_count,
                "categories_raw": data.get("categories"),
                "categories": categories,
                "price_level": extract_price_level(data.get("attributes") or {}),
                "tags5": build_tags5(categories),
            }

            selected.append(record)
            per_city_counts[city] += 1

            # Stop reading once every configured city has hit its cap; nothing
            # further in the file could be selected.
            if all(
                per_city_counts.get(name, 0) >= MAX_BUSINESSES_PER_CITY
                for name in CITIES
            ):
                break

    df = pd.DataFrame(selected)
    print(f"Selected {len(df)} businesses across cities:")
    if not df.empty:
        print(df["city"].value_counts())
    return df


def load_top_reviews_for_businesses(business_ids, top_n=3, review_path=None):
    """
    Stream the reviews JSON and keep the most engaging reviews per business.

    Engagement is the sum of a review's ``useful``, ``funny`` and ``cool``
    counts (missing or null counts are treated as 0). Reviews without text are
    ignored. On ties, the review seen earlier in the file is kept.

    Args:
        business_ids: Iterable of business ids to collect reviews for.
        top_n: Maximum number of reviews kept per business (default 3).
        review_path: Path of the JSON-lines review file; defaults to the
            module-level REVIEW_JSON_PATH.

    Returns:
        dict mapping business_id -> list of review texts, most engaging first.
        Businesses with no usable review are absent from the dict.
    """
    if top_n <= 0:
        return {}

    # Membership tests run once per review line, so make sure they are O(1)
    # (and test values rather than the index if a pandas Series is passed).
    if isinstance(business_ids, (set, frozenset)):
        wanted = business_ids
    else:
        wanted = set(business_ids)

    source = REVIEW_JSON_PATH if review_path is None else Path(review_path)

    top_reviews = defaultdict(list)  # business_id -> list of (engagement, text)

    with source.open("r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            bid = data.get("business_id")
            if bid not in wanted:
                continue

            # `or ""` guards against an explicit JSON null for "text".
            text = (data.get("text") or "").strip()
            if not text:
                continue

            engagement = sum((data.get(key) or 0) for key in ("useful", "funny", "cool"))

            current = top_reviews[bid]
            if len(current) < top_n:
                current.append((engagement, text))
            elif engagement > current[-1][0]:
                # strictly more engaging than the weakest kept review: replace it
                current[-1] = (engagement, text)
            else:
                continue
            # Stable sort keeps earlier reviews ahead of later ones on ties.
            current.sort(key=lambda pair: pair[0], reverse=True)

    # Lists are already ordered by engagement; strip the scores.
    return {bid: [text for _, text in reviews] for bid, reviews in top_reviews.items()}


def build_places_dataset():
    """
    Assemble the final places table.

    Loads the filtered businesses, attaches each business's top review texts
    as a ``top_reviews`` list column, and expands the ``tags5`` dict column
    into one ``tag_<name>`` column per tag. When no business passes the
    filters, a hint is printed and the empty frame is returned unchanged.
    """
    places = load_businesses()
    if places.empty:
        print("No businesses selected. Check your filters and file paths.")
        return places

    wanted_ids = set(places["business_id"])
    print(f"Collecting top reviews for {len(wanted_ids)} businesses...")
    reviews_by_business = load_top_reviews_for_businesses(wanted_ids)

    # Businesses without any collected review get an empty list
    places["top_reviews"] = places["business_id"].map(
        lambda business_id: reviews_by_business.get(business_id, [])
    )

    # One column per tag, named tag_<key>, in the dict's key order
    tag_columns = places["tags5"].apply(pd.Series).add_prefix("tag_")
    without_dict_column = places.drop(columns=["tags5"])
    final_df = pd.concat([without_dict_column, tag_columns], axis=1)

    print("Final dataset shape:", final_df.shape)
    return final_df

In [3]:

# Build the places dataset (streams both Yelp JSON files) and preview the first rows.
final_df = build_places_dataset()
final_df.head()


Selected 750 businesses across cities:
city
Philadelphia    150
New Orleans     150
Boise           150
Tucson          150
Tampa           150
Name: count, dtype: int64
Collecting top reviews for 750 businesses...
Final dataset shape: (750, 17)


Unnamed: 0,business_id,name,city,state,latitude,longitude,stars,review_count,categories_raw,categories,price_level,top_reviews,tag_nightlife,tag_adventure,tag_shopping,tag_food,tag_urban
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,PA,39.955505,-75.155564,4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","[restaurants, food, bubble tea, coffee & tea, ...",1.0,[I really enjoyed my brief visit to St.Honore ...,0.0,0.0,0.0,1.0,0.0
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,Philadelphia,PA,39.953949,-75.143226,4.0,245,"Sushi Bars, Restaurants, Japanese","[sushi bars, restaurants, japanese]",2.0,[Finally got a chance to check this place out ...,0.333333,0.0,0.0,0.666667,0.0
2,ROeacJQwBeh05Rqg7F6TCg,BAP,Philadelphia,PA,39.943223,-75.162568,4.5,205,"Korean, Restaurants","[korean, restaurants]",1.0,[I've been told I have nice BAPs. I have no id...,0.0,0.0,0.0,1.0,0.0
3,eMjnw_7wp-CscyNh6Lu0ZA,AM&PM Locksmith,Philadelphia,PA,40.07233,-75.048483,4.5,58,"Keys & Locksmiths, Home Services, Local Services","[keys & locksmiths, home services, local servi...",,"[On a recent frigid winter morning, I realized...",0.0,0.0,0.0,0.0,1.0
4,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,New Orleans,LA,29.950647,-90.074427,4.5,350,"Nightlife, Pubs, Event Planning & Services, Wi...","[nightlife, pubs, event planning & services, w...",2.0,[Friends and I had a blast for bottomless brun...,0.833333,0.0,0.0,0.166667,0.0


In [None]:
# Persist the places table; the MongoDB loading cell later reads activities.csv.
# NOTE(review): later cells add columns to final_df in place (e.g. price_proxy),
# so the exported columns depend on when this cell is run.
final_df.to_csv('activities.csv', index=False)

In [4]:
import numpy as np

tag_cols = ["tag_nightlife", "tag_adventure", "tag_shopping", "tag_food", "tag_urban"]

# Check that all tag columns exist, and fail loudly if any is missing
missing_tag_cols = [col for col in tag_cols if col not in final_df.columns]
print("Tag columns present:", not missing_tag_cols)
assert not missing_tag_cols, f"missing tag columns: {missing_tag_cols}"

# Sum of tags per row (should be ~1.0). Kept as a standalone Series instead of
# a new final_df column so this diagnostic never leaks into activities.csv.
tag_sum = final_df[tag_cols].sum(axis=1).rename("tag_sum")
print("Tag sum per row (should be close to 1.0):")
print(tag_sum.describe())
assert np.allclose(tag_sum, 1.0), "tag scores must sum to 1.0 for every business"

# Quick look at first few rows
final_df[["name", "city"] + tag_cols].assign(tag_sum=tag_sum).head()


Tag columns present: True
Tag sum per row (should be close to 1.0):
count    7.500000e+02
mean     1.000000e+00
std      1.147399e-17
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+00
Name: tag_sum, dtype: float64


Unnamed: 0,name,city,tag_nightlife,tag_adventure,tag_shopping,tag_food,tag_urban,tag_sum
0,St Honore Pastries,Philadelphia,0.0,0.0,0.0,1.0,0.0,1.0
1,Tuna Bar,Philadelphia,0.333333,0.0,0.0,0.666667,0.0,1.0
2,BAP,Philadelphia,0.0,0.0,0.0,1.0,0.0,1.0
3,AM&PM Locksmith,Philadelphia,0.0,0.0,0.0,0.0,1.0,1.0
4,Copper Vine,New Orleans,0.833333,0.0,0.0,0.166667,0.0,1.0


In [5]:
# Average weight of each tag over every selected business, largest first
overall_tag_means = final_df[tag_cols].mean().sort_values(ascending=False)
print("Overall mean tag scores (all cities):")
display(overall_tag_means)

# Same numbers expressed as a share of their total, shown in percent
overall_tag_pct = overall_tag_means.div(overall_tag_means.sum())
print("\nOverall tag share (%):")
display(overall_tag_pct.mul(100).round(1))


Overall mean tag scores (all cities):


tag_food         0.579804
tag_urban        0.205694
tag_nightlife    0.129932
tag_shopping     0.047638
tag_adventure    0.036933
dtype: float64


Overall tag share (%):


tag_food         58.0
tag_urban        20.6
tag_nightlife    13.0
tag_shopping      4.8
tag_adventure     3.7
dtype: float64

In [6]:
# Mean tag profile of the selected businesses in each city
city_tag_means = final_df.groupby("city")[tag_cols].mean()
print("Per-city mean tag scores:")
display(city_tag_means)

# Rescale every city's row so its five tags add up to 100 %
city_tag_pct = city_tag_means.div(city_tag_means.sum(axis=1), axis=0).mul(100)
print("\nPer-city tag share (%):")
display(city_tag_pct.round(1))


Per-city mean tag scores:


Unnamed: 0_level_0,tag_nightlife,tag_adventure,tag_shopping,tag_food,tag_urban
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boise,0.17523,0.035222,0.014111,0.660262,0.115175
New Orleans,0.130167,0.021111,0.040119,0.499865,0.308738
Philadelphia,0.104857,0.043333,0.06373,0.577802,0.210278
Tampa,0.132421,0.04,0.03937,0.63032,0.157889
Tucson,0.106984,0.045,0.080857,0.53077,0.236389



Per-city tag share (%):


Unnamed: 0_level_0,tag_nightlife,tag_adventure,tag_shopping,tag_food,tag_urban
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boise,17.5,3.5,1.4,66.0,11.5
New Orleans,13.0,2.1,4.0,50.0,30.9
Philadelphia,10.5,4.3,6.4,57.8,21.0
Tampa,13.2,4.0,3.9,63.0,15.8
Tucson,10.7,4.5,8.1,53.1,23.6


### Cities Dataset Creation:

In [7]:
# City records used downstream: a slug "id", the display "name" (later matched
# against the Yelp "city" values), and a five-tag "vibe_tags" profile.
# NOTE(review): the vibe_tags numbers appear to be hand-transcribed from the
# per-city mean tag scores shown above, rounded to two decimals - re-check them
# whenever the dataset is regenerated.
cities = [
  {
    "id": "boise",
    "name": "Boise",
    "vibe_tags": {
      "nightlife": 0.18,
      "adventure": 0.04,
      "shopping": 0.01,
      "food": 0.66,
      "urban": 0.12
    }
  },
  {
    "id": "new_orleans",
    "name": "New Orleans",
    "vibe_tags": {
      "nightlife": 0.13,
      "adventure": 0.02,
      "shopping": 0.04,
      "food": 0.50,
      "urban": 0.31
    }
  },
  {
    "id": "philadelphia",
    "name": "Philadelphia",
    "vibe_tags": {
      "nightlife": 0.10,
      "adventure": 0.04,
      "shopping": 0.06,
      "food": 0.58,
      "urban": 0.21
    }
  },
  {
    "id": "tampa",
    "name": "Tampa",
    "vibe_tags": {
      "nightlife": 0.13,
      "adventure": 0.04,
      "shopping": 0.04,
      "food": 0.63,
      "urban": 0.16
    }
  },
  {
    "id": "tucson",
    "name": "Tucson",
    "vibe_tags": {
      "nightlife": 0.11,
      "adventure": 0.05,
      "shopping": 0.08,
      "food": 0.53,
      "urban": 0.24
    }
  }
]


In [8]:
# Numeric cost proxy for each price level (illustrative values only).
price_map = {1: 15, 2: 30, 3: 50, 4: 80}
final_df["price_proxy"] = final_df["price_level"].map(price_map)

# Per-city price summary.
# NOTE(review): businesses without a price_level get a NaN proxy and are skipped
# by mean/median, but they still count in the denominator of pct_expensive
# (NaN >= 3 is False) - confirm that this is the intended definition.
price_aggregations = {
    "avg_price_proxy": ("price_proxy", "mean"),
    "median_price_proxy": ("price_proxy", "median"),
    "pct_expensive": ("price_level", lambda levels: levels.ge(3).mean()),
}
city_price_stats = final_df.groupby("city").agg(**price_aggregations).reset_index()

city_price_stats


Unnamed: 0,city,avg_price_proxy,median_price_proxy,pct_expensive
0,Boise,27.554745,30.0,0.053333
1,New Orleans,29.243697,30.0,0.08
2,Philadelphia,25.53719,30.0,0.04
3,Tampa,26.753731,30.0,0.04
4,Tucson,25.646552,30.0,0.046667


In [9]:
# Min-max scale the average price proxy across cities into a 0-1 cost index.
avg_price_by_city = city_price_stats["avg_price_proxy"]
min_cost = avg_price_by_city.min()
max_cost = avg_price_by_city.max()
# The 1e-9 term keeps the division defined when all cities share one average.
city_price_stats["cost_index_0_1"] = avg_price_by_city.sub(min_cost).div(
    max_cost - min_cost + 1e-9
)


In [10]:
def cost_level(x):
    """
    Bucket a 0-1 cost index into "low", "medium" or "high".

    Values below 0.33 are "low", values below 0.66 are "medium", and everything
    else is "high" (this includes NaN, which compares False against both bounds).
    """
    for upper_bound, label in ((0.33, "low"), (0.66, "medium")):
        if x < upper_bound:
            return label
    return "high"

# Label every city with the bucket its cost index falls into
city_price_stats["typical_cost_level"] = city_price_stats["cost_index_0_1"].map(cost_level)


In [11]:
# Index the price stats by city name once instead of filtering the frame per city.
stats_by_city = city_price_stats.set_index("city")

# Copy the cost fields onto each city record (the dicts are updated in place).
for c in cities:
    if c["name"] not in stats_by_city.index:
        # Fail with a message naming the city instead of an opaque IndexError.
        raise KeyError(
            f"No price stats found for city {c['name']!r}; "
            "its name must match a Yelp 'city' value in city_price_stats"
        )
    row = stats_by_city.loc[c["name"]]
    c["avg_price_proxy"] = float(row["avg_price_proxy"])
    c["cost_index"] = float(row["cost_index_0_1"])
    c["typical_cost_level"] = row["typical_cost_level"]


In [12]:
# Inspect the city records now that the cost fields have been attached
cities

[{'id': 'boise',
  'name': 'Boise',
  'vibe_tags': {'nightlife': 0.18,
   'adventure': 0.04,
   'shopping': 0.01,
   'food': 0.66,
   'urban': 0.12},
  'avg_price_proxy': 27.554744525547445,
  'cost_index': 0.5443276450350364,
  'typical_cost_level': 'medium'},
 {'id': 'new_orleans',
  'name': 'New Orleans',
  'vibe_tags': {'nightlife': 0.13,
   'adventure': 0.02,
   'shopping': 0.04,
   'food': 0.5,
   'urban': 0.31},
  'avg_price_proxy': 29.243697478991596,
  'cost_index': 0.9999999997302043,
  'typical_cost_level': 'high'},
 {'id': 'philadelphia',
  'name': 'Philadelphia',
  'vibe_tags': {'nightlife': 0.1,
   'adventure': 0.04,
   'shopping': 0.06,
   'food': 0.58,
   'urban': 0.21},
  'avg_price_proxy': 25.537190082644628,
  'cost_index': 0.0,
  'typical_cost_level': 'low'},
 {'id': 'tampa',
  'name': 'Tampa',
  'vibe_tags': {'nightlife': 0.13,
   'adventure': 0.04,
   'shopping': 0.04,
   'food': 0.63,
   'urban': 0.16},
  'avg_price_proxy': 26.753731343283583,
  'cost_index': 0.3

In [None]:
!pip install pymongo

In [None]:
import pandas as pd

# Flatten each nested city dict into one flat row for CSV export:
# id, name, one vibe_tags_<tag> column per tag, then the cost fields when present.
cities_data = []
for city in cities:
    vibe = city.get('vibe_tags', {})
    row = {'id': city.get('id'), 'name': city.get('name')}
    for tag in ('nightlife', 'adventure', 'shopping', 'food', 'urban'):
        row[f'vibe_tags_{tag}'] = vibe.get(tag)
    # Add optional fields if they exist
    for optional_key in ('avg_price_proxy', 'cost_index', 'typical_cost_level'):
        if optional_key in city:
            row[optional_key] = city.get(optional_key)
    cities_data.append(row)

# Create DataFrame and save to CSV
cities_df = pd.DataFrame(cities_data)
cities_df.to_csv('cities.csv', index=False)
print(f"CSV file 'cities.csv' created successfully with {len(cities_df)} rows.")
print("\nFirst few rows:")
cities_df.head()


In [None]:
!pip install pandas
!pip install pymongo    
!pip install pathlib
!pip install ast
!pip install numpy

In [None]:
#import pymongo
from pymongo import MongoClient
import pandas as pd
from pathlib import Path

import os

# MongoDB connection string.
# Set the MONGODB_URI environment variable so credentials are not stored in the
# notebook; the placeholder below is used only when the variable is unset.
# Format: mongodb+srv://username:password@cluster.mongodb.net/
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://username:password@cluster.mongodb.net/",  # UPDATE THIS
)

# Database and collection names
DB_NAME = "users"
COLLECTION_NAME = "users"

# Connect to MongoDB
try:
    client = MongoClient(MONGODB_URI)
    # MongoClient connects lazily; a ping forces a round trip to the server so
    # that an unreachable cluster or bad credentials fail here, not mid-insert.
    client.admin.command("ping")
    db = client[DB_NAME]
    collection = db[COLLECTION_NAME]
    print("Connected to MongoDB successfully!")
    print(f"Database: {DB_NAME}, Collection: {COLLECTION_NAME}")
except Exception as e:
    print(f"Error connecting to MongoDB: {e}")
    raise

# The two CSV exports written by earlier cells of this notebook
activities_path = Path("activities.csv")
cities_path = Path("cities.csv")

print("\n" + "="*50)
print("Reading CSV files...")

# Read activities CSV (activities_df is None when the file is missing)
if not activities_path.exists():
    print(f"✗ Error: {activities_path} not found!")
    activities_df = None
else:
    activities_df = pd.read_csv(activities_path)
    print(f"✓ Loaded {len(activities_df)} activities from activities.csv")
    # Strip stray whitespace from the header names
    activities_df.columns = activities_df.columns.str.strip()
    print(f"Available columns: {list(activities_df.columns)}")

    # Warn about tag columns the insertion step will have to default
    required_tag_columns = ["tag_nightlife", "tag_adventure", "tag_shopping", "tag_food", "tag_urban"]
    missing_tags = [name for name in required_tag_columns if name not in activities_df.columns]
    if missing_tags:
        print(f"⚠ Warning: Missing tag columns in CSV: {missing_tags}")
        print("  These will be set to 0.0 in the database.")
        print("  Consider regenerating activities.csv by running the dataset creation cells.")

# Read cities CSV (cities_df is None when the file is missing)
if not cities_path.exists():
    print(f"✗ Error: {cities_path} not found!")
    cities_df = None
else:
    cities_df = pd.read_csv(cities_path)
    print(f"✓ Loaded {len(cities_df)} cities from cities.csv")

# Insert cities data: one {"type": "city", ...} document per CSV row.
# NOTE(review): insert_many appends, so re-running this cell adds the same
# cities again.
if cities_df is None or cities_df.empty:
    print("\n✗ Skipping cities insertion - no data available")
else:
    print("\n" + "="*50)
    print("Inserting cities data...")
    cities_documents = []
    for _, row in cities_df.iterrows():
        # Missing or NaN vibe values default to 0.0
        vibe_tags = {}
        for vibe in ("nightlife", "adventure", "shopping", "food", "urban"):
            cell = row.get(f"vibe_tags_{vibe}")
            vibe_tags[vibe] = float(cell) if pd.notna(cell) else 0.0

        doc = {
            "type": "city",
            "id": str(row.get("id", "")),
            "name": str(row.get("name", "")),
            "vibe_tags": vibe_tags,
        }

        # Add optional fields if they exist and hold a value
        optional_fields = (
            ("avg_price_proxy", float),
            ("cost_index", float),
            ("typical_cost_level", str),
        )
        for field, convert in optional_fields:
            if field in cities_df.columns and pd.notna(row.get(field)):
                doc[field] = convert(row.get(field))

        cities_documents.append(doc)

    if not cities_documents:
        print("✗ No city documents to insert")
    else:
        result = collection.insert_many(cities_documents)
        print(f"✓ Inserted {len(result.inserted_ids)} city documents")

# Insert activities data
import ast  # stdlib; imported once here instead of inside the per-row loop


def _parse_list_cell(value):
    """Turn a CSV cell holding a list (or its repr string) back into a list.

    Lists pass through unchanged. Strings are parsed with ast.literal_eval; if
    that fails or does not yield a list/tuple, the raw string is wrapped in a
    one-element list (an empty string gives []). Anything else, such as the NaN
    pandas uses for an empty cell, yields [].
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: keep the raw text rather than dropping it.
        return [value] if value else []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [value] if value else []


def _cast_or_none(value, cast):
    """Return cast(value), or None when the value is missing/NaN."""
    return cast(value) if pd.notna(value) else None


# NOTE(review): insert_many appends, so re-running this cell inserts the same
# activities again.
if activities_df is not None and not activities_df.empty:
    print("\n" + "="*50)
    print("Inserting activities data...")
    activities_documents = []
    available_columns = set(activities_df.columns)
    tag_columns = ["tag_nightlife", "tag_adventure", "tag_shopping", "tag_food", "tag_urban"]

    for _, row in activities_df.iterrows():
        doc = {
            "type": "activity",
            "business_id": _cast_or_none(row.get("business_id"), str),
            "name": _cast_or_none(row.get("name"), str),
            "city": _cast_or_none(row.get("city"), str),
            "state": _cast_or_none(row.get("state"), str),
            "latitude": _cast_or_none(row.get("latitude"), float),
            "longitude": _cast_or_none(row.get("longitude"), float),
            "stars": _cast_or_none(row.get("stars"), float),
            "review_count": _cast_or_none(row.get("review_count"), int),
        }

        # Handle categories (row.get returns None for an absent column -> [])
        if "categories_raw" in available_columns and pd.notna(row.get("categories_raw")):
            doc["categories_raw"] = str(row.get("categories_raw"))
        doc["categories"] = _parse_list_cell(row.get("categories"))

        # Handle price_level
        if "price_level" in available_columns and pd.notna(row.get("price_level")):
            doc["price_level"] = float(row.get("price_level"))

        # Handle top_reviews
        doc["top_reviews"] = _parse_list_cell(row.get("top_reviews"))

        # Handle tag columns - absent columns and NaN cells default to 0.0
        for tag_col in tag_columns:
            value = row.get(tag_col)
            doc[tag_col] = float(value) if pd.notna(value) else 0.0

        # Handle price_proxy if it exists
        if "price_proxy" in available_columns and pd.notna(row.get("price_proxy")):
            doc["price_proxy"] = float(row.get("price_proxy"))

        activities_documents.append(doc)

    if activities_documents:
        # Insert in batches to avoid memory issues with large datasets
        batch_size = 1000
        total_inserted = 0
        for i in range(0, len(activities_documents), batch_size):
            batch = activities_documents[i:i + batch_size]
            result = collection.insert_many(batch)
            total_inserted += len(result.inserted_ids)
            print(f"✓ Inserted batch {i//batch_size + 1}: {len(result.inserted_ids)} documents")
        print(f"✓ Total inserted {total_inserted} activity documents")
    else:
        print("✗ No activity documents to insert")
else:
    print("\n✗ Skipping activities insertion - no data available")

# Summarize what the collection holds now, then release the connection
total_documents = collection.count_documents({})
city_documents = collection.count_documents({'type': 'city'})
activity_documents = collection.count_documents({'type': 'activity'})

divider = "="*50
print("\n" + divider)
print("Insertion Summary:")
print(f"Total documents in collection: {total_documents}")
print(f"City documents: {city_documents}")
print(f"Activity documents: {activity_documents}")
print(divider)

# Close connection
client.close()
print("\n✓ MongoDB connection closed.")
