# 03_attribute_taxonomy.ipynb
## Data Preprocessing 2: Analyze and Simplify "categories" and "attributes" (dataset_business)

Extract all unique categories from pa_filtered_dining_businesses.json

In [13]:
import json
import os

# Set base directory and file paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_businesses", "pa_filtered_dining_businesses.json")
output_path = os.path.join(BASE_DIR, "output_categories", "pa_unique_categories.json")

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Load filtered dining businesses
with open(input_path, 'r', encoding='utf-8') as f:
    businesses = json.load(f)

# Extract unique categories
unique_categories = set()

for business in businesses:
    categories = business.get("categories", "")
    if categories:
        unique_categories.update(map(str.strip, categories.split(',')))

# Convert to sorted list
unique_categories = sorted(unique_categories)

# Save unique categories to file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(unique_categories, f, indent=2, ensure_ascii=False)

# Print results
print(f"Total unique categories extracted: {len(unique_categories)}")
print(f"Extracted categories saved to: {output_path}")

Total unique categories extracted: 484
Extracted categories saved to: d:\Programming\LLM_RS\output_categories\pa_unique_categories.json


Since there are 484 categories, using a one-hot encoding or multi-hot vector approach will result in very high-dimensional sparse vectors, making computations inefficient.

1. Remove: irrelevent categories & overly broad terms
2. Merge: Synonyms & Redundant Entries (e.g., "American (New)" and "American (Traditional)" → "American").

In [18]:
import json
import os

# Set base directory and file paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_categories", "pa_unique_categories.json")
output_path = os.path.join(BASE_DIR, "output_categories", "pa_cleaned_categories.json")

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Load unique categories
with open(input_path, 'r', encoding='utf-8') as f:
    unique_categories = json.load(f)

initial_count = len(unique_categories)
print(f"Initial unique categories: {initial_count}")

# Define categories to remove (Non-food-related)
irrelevant_categories = {
    "Accessories", "Accountants", "Active Life", "Acupuncture", "Adult", "Adult Education", "Adult Entertainment",
    "Airlines", "Airport Lounges", "Airports", "Amusement Parks", "Antiques", "Apartments", "Appliances",
    "Appliances & Repair", "Arcades", "Art Classes", "Art Galleries", "Art Museums", "Art Schools", "Arts & Crafts",
    "Arts & Entertainment", "Attraction Farms", "Auto Glass Services", "Auto Repair", "Automotive", "Axe Throwing",
    "Baby Gear & Furniture", "Banks & Credit Unions", "Barbers", "Barre Classes", "Bartenders", "Bartending Schools",
    "Beaches", "Beauty & Spas", "Bed & Breakfast", "Bike Rentals", "Bike Repair/Maintenance", "Bike Tours", "Bikes",
    "Boat Charters", "Boat Tours", "Boating", "Body Contouring", "Books", "Bookstores", "Botanical Gardens",
    "Bowling", "Bridal", "Business Consulting", "Candle Stores", "Cannabis Clinics", "Cannabis Dispensaries",
    "Cardio Classes", "Cards & Stationery", "Casinos", "Check Cashing/Pay-day Loans", "Children's Clothing",
    "Christmas Trees", "Cinema", "Colleges & Universities", "Colonics", "Comedy Clubs", "Comic Books",
    "Community Service/Non-Profit", "Convenience Stores", "Cooking Classes", "Cooking Schools",
    "Cosmetics & Beauty Supply", "Costumes", "Country Clubs", "Couriers & Delivery Services", "Cultural Center",
    "Dance Clubs", "Day Camps", "Day Spas", "Dentists", "Department Stores", "Dinner Theater", "Discount Store",
    "Distilleries", "Doctors", "Drugstores", "Dry Cleaning", "Dry Cleaning & Laundry", "Education",
    "Educational Services", "Electronics", "Event Planning & Services", "Eyewear & Opticians", "Farmers Market",
    "Fashion", "Festivals", "Financial Advising", "Financial Services", "Fireplace Services", "Fitness & Instruction",
    "Fitness/Exercise Equipment", "Flea Markets", "Floral Designers", "Florists", "Flowers & Gifts", "Furniture Stores",
    "Gas Stations", "General Dentistry", "Gift Shops", "Golf", "Golf Lessons", "Gyms", "Hair Salons", "Hair Stylists",
    "Head Shops", "Health & Medical", "Health Markets", "Herbal Shops", "Herbs & Spices", "Hiking",
    "Historical Tours", "Hobby Shops", "Home & Garden", "Home Decor", "Home Services", "Hotels", "Hotels & Travel",
    "Indoor Playcentre", "Internet Cafes", "Jewelry", "Kids Activities", "Kitchen & Bath", "Kitchen Supplies",
    "LAN Centers", "Landmarks & Historical Buildings", "Laundromat", "Laundry Services", "Lawyers", "Libraries",
    "Local Flavor", "Local Services", "Macarons", "Mags", "Marketing", "Mass Media", "Massage", "Massage Therapy",
    "Medical Spas", "Men's Clothing", "Mini Golf", "Mountain Biking", "Museums", "Music & DVDs", "Music & Video",
    "Music Venues", "Nail Salons", "Nail Technicians", "Naturopathic/Holistic", "Newspapers & Magazines",
    "Nightlife", "Nurseries & Gardening", "Nutritionists", "Office Equipment", "Olive Oil", "Organic Stores",
    "Outdoor Furniture Stores", "Outlet Stores", "Paint & Sip", "Parenting Classes", "Parks",
    "Party & Event Planning", "Party Equipment Rentals", "Party Supplies", "Performing Arts", "Personal Chefs",
    "Personal Shopping", "Pet Adoption", "Pet Groomers", "Pet Services", "Pet Stores", "Pet Training", "Pets",
    "Photography Stores & Services", "Physical Therapy", "Pilates", "Playgrounds", "Pool & Billiards", "Pool Halls",
    "Print Media", "Private Tutors", "Professional Services", "Public Markets", "Public Services & Government",
    "Real Estate", "Recreation Centers", "Recycling Center", "Reflexology", "Religious Organizations", "Resorts",
    "Rest Stops", "Retail", "Salon", "Saunas", "Shopping", "Shopping Centers", "Shoe Stores", "Skating Rinks",
    "Skin Care", "Social Clubs", "Souvenir Shops", "Special Education", "Specialty Schools", "Spiritual Shop",
    "Sporting Goods", "Sports Clubs", "Sports Wear", "Strip Clubs", "Summer Camps", "Swimming Pools",
    "Tabletop Games", "Tableware", "Tasting Classes", "Tea Rooms", "Team Building Activities", "Tennis",
    "Thrift Stores", "Ticket Sales", "Tiki Bars", "Tires", "Tobacco Shops", "Tours", "Toy Stores",
    "Traditional Chinese Medicine", "Trainers", "Transportation", "Travel Services", "Tutoring Centers",
    "Used", "Vape Shops", "Venues & Event Spaces", "Veterinarians", "Videos & Video Game Rental",
    "Vintage & Consignment", "Vinyl Records", "Visitor Centers", "Vitamins & Supplements", "Walking Tours",
    "Watches", "Wedding Planning", "Weight Loss Centers", "Wheel & Rim Repair", "Wholesale Stores", "Wholesalers",
    "Wigs", "Women's Clothing", "Yoga", "Zoos"
}

# Define overly broad food categories to remove
overly_broad_categories = {
    "Food", "Restaurants", "Grocery", "Specialty Food", "Ethnic Food"
}

# Define category merges (synonyms and redundant labels)
category_merges = {
    "American (New)": "American",
    "American (Traditional)": "American",
    "Bubble Tea": "Tea-Based Drinks",
    "Tea Rooms": "Tea-Based Drinks",
    "Steakhouses": "Steak",
    "BBQ": "Barbecue",
    "Fast Food": "Quick Service",
    "Food Trucks": "Quick Service",
    "Bakeries": "Desserts",
    "Ice Cream & Frozen Yogurt": "Desserts",
    "Poke": "Hawaiian",
    "Tex-Mex": "Mexican",
    "Gastropubs": "Bars",
    "Brewpubs": "Bars",
    "Cocktail Bars": "Bars",
    "Wine Bars": "Bars",
    "Juice Bars & Smoothies": "Healthy Drinks",
    "Health Markets": "Healthy Drinks",
    "Vegetarian": "Vegan",
    "Soup": "Comfort Food",
    "Salad": "Healthy Food",
    "Pizza": "Italian",
    "Dim Sum": "Chinese",
    "Noodles": "Chinese",
    "Shanghainese": "Chinese",
    "Szechuan": "Chinese",
    "Cantonese": "Chinese",
    "Tapas Bars": "Spanish",
    "Tapas/Small Plates": "Spanish"
}

# Process categories
processed_categories = {
    category_merges.get(c, c) for c in unique_categories 
    if c not in irrelevant_categories and c not in overly_broad_categories
}

# Save cleaned categories
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(sorted(processed_categories), f, indent=2, ensure_ascii=False)

# Print final stats
final_count = len(processed_categories)
removed_count = initial_count - final_count
print(f"Initial unique categories: {initial_count}")
print(f"Number of removed categories: {removed_count}")
print(f"Final cleaned categories count: {final_count}")
print(f"Extracted categories saved to: {output_path}")


Initial unique categories: 484
Initial unique categories: 484
Number of removed categories: 276
Final cleaned categories count: 208
Extracted categories saved to: d:\Programming\LLM_RS\output_categories\pa_cleaned_categories.json


Remain some catregories to be removed

1. Automatically match keywords and merge similar categories (e.g., "Bars", "Gay Bars", "Beer Bar" → "Bars").
2. Remove remaining irrelevant categories.

In [19]:
import json
import os

# Set base directory and file paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_categories", "pa_cleaned_categories.json")
output_path = os.path.join(BASE_DIR, "output_categories", "pa_refined_categories.json")

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Load cleaned categories
with open(input_path, 'r', encoding='utf-8') as f:
    cleaned_categories = json.load(f)

initial_count = len(cleaned_categories)

# Categories to remove (still irrelevant)
irrelevant_categories = {
    "Unofficial Yelp Events", "Yelp Events", "Pumpkin Patches", "Guest Houses", "Preschools",
    "Restaurant Supplies", "Pharmacy", "Hospitals", "Hostels", "Pick Your Own Farms"
}

# Auto-merging rules: Keywords -> Merged category
keyword_merges = {
    "bar": "Bars",
    "beer": "Bars",
    "wine": "Wine & Spirits",
    "coffee": "Coffee",
    "tea": "Tea-Based Drinks",
    "juice": "Healthy Drinks",
    "smoothie": "Healthy Drinks",
    "cocktail": "Bars",
    "pub": "Bars",
    "brew": "Bars",
    "poke": "Hawaiian",
    "sushi": "Japanese",
    "ramen": "Japanese",
    "noodle": "Asian Fusion",
    "pasta": "Italian",
    "pizza": "Italian",
    "steak": "Steak",
    "bbq": "Barbecue",
    "dim sum": "Chinese",
    "tapas": "Spanish",
    "sandwich": "Delis",
    "donut": "Desserts",
    "bakery": "Desserts",
    "gelato": "Desserts",
    "ice cream": "Desserts",
    "fast food": "Quick Service",
    "food truck": "Quick Service",
    "vegan": "Vegan & Vegetarian",
    "vegetarian": "Vegan & Vegetarian",
}

# Process categories
processed_categories = set()

for category in cleaned_categories:
    if category in irrelevant_categories:
        continue
    
    # Apply keyword merging
    merged_category = None
    for keyword, target in keyword_merges.items():
        if keyword in category.lower():
            merged_category = target
            break
    
    processed_categories.add(merged_category if merged_category else category)

# Convert to sorted list
final_categories = sorted(processed_categories)

# Save final cleaned categories
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(final_categories, f, indent=2, ensure_ascii=False)

# Print final stats
final_count = len(final_categories)
removed_count = initial_count - final_count

print(f"Initial categories count: {initial_count}")
print(f"Number of removed categories: {removed_count}")
print(f"Final categories count: {final_count}")
print(f"Final cleaned categories saved to: {output_path}")


Initial categories count: 208
Number of removed categories: 41
Final categories count: 167
Final cleaned categories saved to: d:\Programming\LLM_RS\output_categories\pa_refined_categories.json


1. Maps them into structured groups (Cuisine, Food Type, Service Type, Dietary, etc.).
2. Removes irrelevant categories.
3. Outputs a cleaned category list in JSON format

In [17]:
import json
import os

# Set base directory and file paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_categories", "pa_cleaned_categories.json")
output_path = os.path.join(BASE_DIR, "output_categories", "pa_structured_categories.json")

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Load cleaned categories
with open(input_path, 'r', encoding='utf-8') as f:
    cleaned_categories = json.load(f)

# Define structured category groups
cuisine_types = {
    "Chinese", "Japanese", "Mexican", "Italian", "Thai", "Indian", "Mediterranean", "French",
    "Vietnamese", "Greek", "Korean", "Brazilian", "Turkish", "Lebanese", "Ethiopian", "Spanish",
    "German", "American", "African", "Filipino", "Caribbean", "Middle Eastern", "Persian/Iranian"
}

food_types = {
    "Pizza", "Burgers", "Sushi", "Hot Dogs", "Poke", "Barbecue", "Steak", "Tacos", "Seafood",
    "Quick Service", "Dim Sum", "Breakfast & Brunch", "Desserts", "Soup",
    "Salad", "Sandwiches", "Cafes", "Bakeries", "Healthy Drinks", "Donuts"
}

service_types = {
    "Quick Service", "Buffets", "Fine Dining", "Food Trucks", "Cafes", "Casual Dining",
    "Bars", "Lounges", "Wine Bars", "Cocktail Bars", "Coffee & Tea", "Tea-Based Drinks"
}

dietary_preferences = {
    "Vegan", "Vegetarian", "Gluten-Free", "Halal", "Kosher"
}

# Initialize structured categories
structured_categories = {
    "Cuisine Types": [],
    "Food Types": [],
    "Service Types": [],
    "Dietary Preferences": [],
    "Other": []
}

# Categorize each cleaned category
for category in cleaned_categories:
    if category in cuisine_types:
        structured_categories["Cuisine Types"].append(category)
    elif category in food_types:
        structured_categories["Food Types"].append(category)
    elif category in service_types:
        structured_categories["Service Types"].append(category)
    elif category in dietary_preferences:
        structured_categories["Dietary Preferences"].append(category)
    else:
        structured_categories["Other"].append(category)  # Unclassified categories

# Save structured categories to file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(structured_categories, f, indent=2, ensure_ascii=False)

# Print results
print(f"Structured categories saved to: {output_path}")
print(f"Total categorized items: {sum(len(v) for v in structured_categories.values())}")

# Show counts for each category type
for category_type, items in structured_categories.items():
    print(f"{category_type}: {len(items)} categories")


Structured categories saved to: d:\Programming\LLM_RS\output_categories\pa_structured_categories.json
Total categorized items: 446
Cuisine Types: 23 categories
Food Types: 12 categories
Service Types: 5 categories
Dietary Preferences: 4 categories
Other: 402 categories


method 2: word embeddings

In [14]:
import json
import os
from sentence_transformers import SentenceTransformer
import numpy as np

# Set base directory and file paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_categories", "pa_unique_categories.json")
output_path = os.path.join(BASE_DIR, "output_categories", "pa_category_embeddings.json")

# Ensure the output directory exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Load unique categories
with open(input_path, 'r', encoding='utf-8') as f:
    unique_categories = json.load(f)

# Load Sentence-BERT model (MiniLM)
model = SentenceTransformer('all-MiniLM-L6-v2')  # Outputs 384-dimensional vectors

# Generate embeddings for each category
category_embeddings = {category: model.encode(category).tolist() for category in unique_categories}

# Save embeddings to file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(category_embeddings, f, indent=2, ensure_ascii=False)

# Print summary
print(f"Total categories encoded: {len(category_embeddings)}")
print(f"Category embeddings saved to: {output_path}")


Total categories encoded: 484
Category embeddings saved to: d:\Programming\LLM_RS\output_categories\pa_category_embeddings.json


In [9]:
import json
import os
from collections import defaultdict, Counter

# Set base directory and file paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_businesses", "pa_filtered_dining_businesses.json")
output_path = os.path.join(BASE_DIR, "output_attributes", "attribute_analysis.json")  # Changed path

# Load businesses
with open(input_path, 'r', encoding='utf-8') as f:
    businesses = json.load(f)

# Extract attributes
attribute_counts = Counter()
attribute_values = defaultdict(set)

for business in businesses:
    attributes = business.get('attributes', {})  # Ensure attributes is a dictionary
    if not isinstance(attributes, dict):  # Handle cases where attributes is None or another type
        continue

    for attr, value in attributes.items():
        attribute_counts[attr] += 1  # Count occurrences
        attribute_values[attr].add(value)  # Track unique values

# Sort attributes by frequency
sorted_attributes = sorted(attribute_counts.items(), key=lambda x: x[1], reverse=True)

# Display top attributes
print("Top Business Attributes by Frequency:")
for attr, count in sorted_attributes[:20]:  # Show top 20 attributes
    print(f"{attr}: {count} businesses")

# Save the extracted attributes and values for deeper analysis
with open(output_path, "w", encoding="utf-8") as f:
    json.dump({"counts": attribute_counts, "values": {k: list(v) for k, v in attribute_values.items()}}, f, indent=2)

print(f"\nExtracted attributes saved to: {output_path}")


Top Business Attributes by Frequency:
RestaurantsTakeOut: 6734 businesses
BusinessAcceptsCreditCards: 6682 businesses
BusinessParking: 6678 businesses
RestaurantsDelivery: 6348 businesses
RestaurantsPriceRange2: 6113 businesses
BikeParking: 5752 businesses
OutdoorSeating: 5536 businesses
HasTV: 5307 businesses
WiFi: 5288 businesses
Ambience: 5261 businesses
RestaurantsReservations: 5209 businesses
Alcohol: 5123 businesses
Caters: 5087 businesses
RestaurantsGoodForGroups: 4905 businesses
GoodForKids: 4782 businesses
NoiseLevel: 4544 businesses
RestaurantsAttire: 4455 businesses
GoodForMeal: 4062 businesses
RestaurantsTableService: 2781 businesses
WheelchairAccessible: 2378 businesses

Extracted attributes saved to: d:\Programming\LLM_RS\output_attributes\attribute_analysis.json


Reduce dimensions:

Remove low-occurrence attributes (appear in <5% of businesses).

Group categories (e.g., some might be similar).

Normalize numerical values (e.g., review_count → log scale transformation).

Keep only decision-making features.

In [11]:
import json
import os
import ast
from collections import defaultdict

# Set base directory and file paths
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_businesses", "pa_filtered_dining_businesses.json")
output_path = os.path.join(BASE_DIR, "output_attributes", "flattened_attribute.json")

# Load businesses
with open(input_path, 'r', encoding='utf-8') as f:
    businesses = json.load(f)

# Function to parse and normalize attribute values
def parse_value(value):
    """Convert string representations of dicts to actual dicts & normalize other values."""
    if isinstance(value, str):
        try:
            parsed_value = ast.literal_eval(value)  # Convert string to dictionary
            if isinstance(parsed_value, dict):
                return parsed_value
        except (ValueError, SyntaxError):
            pass  # Not a dictionary, keep as is
    return value  # Return raw value if not a dictionary

# Flattened attributes storage
flattened_attributes = defaultdict(set)

for business in businesses:
    attributes = business.get("attributes", {})
    if not isinstance(attributes, dict):  # Skip invalid attributes
        continue

    for attr, value in attributes.items():
        parsed_value = parse_value(value)  # Try to parse the value
        
        if isinstance(parsed_value, dict):  # Handle nested dictionaries
            for sub_attr, sub_value in parsed_value.items():
                if sub_value is None:
                    sub_value = "Unknown"  # Replace None with 'Unknown'
                elif isinstance(sub_value, bool):
                    sub_value = "True" if sub_value else "False"
                flattened_attributes[f"{attr}_{sub_attr}"].add(str(sub_value))
        else:
            # Normalize boolean and categorical values
            if parsed_value is None or parsed_value == "None":
                parsed_value = "Unknown"
            elif isinstance(parsed_value, bool):
                parsed_value = "True" if parsed_value else "False"
            elif isinstance(parsed_value, str):
                parsed_value = parsed_value.strip("u'")  # Remove Unicode markers
            
            flattened_attributes[attr].add(str(parsed_value))

# Convert sets to lists for JSON saving
cleaned_attributes = {k: list(v) for k, v in flattened_attributes.items() if v != {"Unknown"}}  # Remove all-"Unknown" attributes

# Save cleaned and structured attributes
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(cleaned_attributes, f, indent=2, ensure_ascii=False)

print(f"\nFinal cleaned attributes saved to: {output_path}")
print(f"Total unique attributes after final cleaning: {len(cleaned_attributes)}")



Final cleaned attributes saved to: d:\Programming\LLM_RS\output_attributes\flattened_attribute.json
Total unique attributes after final cleaning: 73
