In [480]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from scipy.sparse import hstack  # Import hstack

fake = Faker()
desired_count_per_item = 10
food_categories = {
    'Produce': ['Fruits', 'Vegetables', 'Herbs'],
    'Dairy & Eggs': ['Milk', 'Cheese', 'Yogurt', 'Eggs'],
    'Meat & Seafood': ['Beef', 'Chicken', 'Pork', 'Fish', 'Shellfish', 'Mutton'],
    'Pantry': ['Grains', 'Pasta', 'Canned Goods', 'Baking', 'Spices', 'Condiments', 'Snacks', 'Oils & Vinegars'],
    'Bakery': ['Bread', 'Pastries', 'Desserts'],
    'Beverages': ['Juice', 'Soda', 'Water', 'Coffee', 'Tea'],
    'Frozen': ['Frozen Meals', 'Frozen Vegetables', 'Frozen Fruits', 'Ice Cream'],
    'Prepared Foods': ['Deli Meats', 'Salads', 'Soups', 'Ready-to-Eat Meals'],
    'Fast Food': ['Burgers', 'Fries', 'Pizza', 'Tacos', 'Sandwiches', 'Fried Chicken', 'Other Fast Food'],
    'Indian': ['Curries', 'Rotis', 'Rice Dishes', 'Snacks']
}

produce_items = {
    'Fruits': ['Apple', 'Banana', 'Orange', 'Grapes', 'Strawberry', 'Blueberry', 'Mango', 'Pineapple', 'Watermelon', 'Peach'],
    'Vegetables': ['Carrot', 'Broccoli', 'Spinach', 'Tomato', 'Potato', 'Onion', 'Lettuce', 'Cucumber', 'Pepper', 'Zucchini'],
    'Herbs': ['Basil', 'Cilantro', 'Parsley', 'Mint', 'Rosemary', 'Thyme', 'Oregano', 'Dill']
}

dairy_eggs_items = {
    'Milk': ['Whole Milk', '2% Milk', 'Skim Milk', 'Almond Milk', 'Soy Milk', 'Oat Milk'],
    'Cheese': ['Cheddar Cheese', 'Mozzarella Cheese', 'Parmesan Cheese', 'Swiss Cheese', 'Feta Cheese', 'Brie Cheese'],
    'Yogurt': ['Greek Yogurt', 'Plain Yogurt', 'Fruit Yogurt', 'Vanilla Yogurt'],
    'Eggs': ['Eggs', "Duck Eggs", 'Brown Eggs', 'White Eggs']
}

meat_seafood_items = {
    'Beef': ['Ground Beef', 'Steak', 'Roast Beef', 'Beef Ribs'],
    'Chicken': ['Chicken Breast', 'Chicken Thighs', 'Chicken Wings', 'Whole Chicken', 'Chicken Curry'],
    'Pork': ['Pork Chops', 'Bacon', 'Ham', 'Pork Sausage'],
    'Fish': ['Salmon', 'Tuna', 'Cod', 'Tilapia', 'Trout'],
    'Shellfish': ['Shrimp', 'Crab', 'Lobster', 'Clams', 'Oysters'],
    'Mutton': ['Mutton Chops', 'Mutton Curry', 'Mutton Roast', 'Ground Mutton']
}

pantry_items = {
    'Grains': ['Rice', 'Quinoa', 'Oats', 'Barley', 'Cornmeal'],
    'Pasta': ['Spaghetti', 'Penne Pasta', 'Fusilli', 'Macaroni', 'Lasagna', 'White Sauce Pasta', 'Red Sauce Pasta'],
    'Canned Goods': ['Canned Tomatoes', 'Canned Beans', 'Canned Tuna', 'Canned Soup'],
    'Baking': ['Flour', 'Sugar', 'Baking Soda', 'Baking Powder', 'Yeast'],
    'Spices': ['Salt', 'Pepper', 'Garlic Powder', 'Onion Powder', 'Cumin', 'Paprika'],
    'Condiments': ['Ketchup', 'Mustard', 'Mayonnaise', 'Soy Sauce', 'Hot Sauce'],
    'Snacks': ['Chips', 'Crackers', 'Popcorn', 'Pretzels', 'Nuts'],
    'Oils & Vinegars': ['Olive Oil', 'Vegetable Oil', 'Vinegar', 'Balsamic Vinegar']
}

bakery_items = {
    'Bread': ['White Bread', 'Wheat Bread', 'Sourdough Bread', 'Baguette', 'Rolls'],
    'Pastries': ['Croissant', 'Muffin', 'Danish', 'Donut', 'Scone'],
    'Desserts': ['Cake', 'Pie', 'Cookies', 'Brownies', 'Cheesecake']
}

beverages_items = {
    'Juice': ['Orange Juice', 'Apple Juice', 'Grape Juice', 'Cranberry Juice'],
    'Soda': ['Cola', 'Lemon-Lime Soda', 'Orange Soda', 'Ginger Ale'],
    'Water': ['Bottled Water', 'Sparkling Water', 'Flavored Water'],
    'Coffee': ['Ground Coffee', 'Coffee Beans', 'Instant Coffee'],
    'Tea': ['Black Tea', 'Green Tea', 'Herbal Tea', 'Iced Tea']
}

frozen_items = {
    'Frozen Meals': ['Frozen Pizza', 'Frozen Lasagna', 'Frozen Burritos', 'Frozen Dinners'],
    'Frozen Vegetables': ['Frozen Peas', 'Frozen Corn', 'Frozen Broccoli', 'Frozen Spinach'],
    'Frozen Fruits': ['Frozen Strawberries', 'Frozen Blueberries', 'Frozen Mango', 'Frozen Pineapple'],
    'Ice Cream': ['Vanilla Ice Cream', 'Chocolate Ice Cream', 'Strawberry Ice Cream', 'Ice Cream Sandwiches']
}

prepared_foods_items = {
   'Deli Meats': ['Ham', 'Turkey', 'Roast Beef', 'Salami', "Bologna"],
    'Salads': ['Potato Salad', 'Pasta Salad', 'Coleslaw', 'Green Salad'],
    'Soups': ['Chicken Noodle Soup', 'Tomato Soup', 'Clam Chowder', 'Vegetable Soup'],
    'Ready-to-Eat Meals': ['Rotisserie Chicken', 'Sushi', 'Sandwiches', 'Burritos']
}

fast_food_items = {
    'Burgers': ['Cheeseburger', 'Hamburger', 'Bacon Burger', 'Veggie Burger'],
    'Fries': ['French Fries', 'Curly Fries', 'Waffle Fries', 'Sweet Potato Fries'],
    'Pizza': ['Pepperoni Pizza', 'Cheese Pizza', 'Veggie Pizza', 'Margherita Pizza'],
    'Tacos': ['Beef Tacos', 'Chicken Tacos', 'Fish Tacos', 'Veggie Tacos'],
    'Sandwiches': ['Chicken Sandwich', 'Sub Sandwich', 'Grilled Cheese Sandwich'],
    'Fried Chicken': ['Fried Chicken Bucket', 'Chicken Nuggets', 'Chicken Tenders', 'Fried Chicken'],
    'Other Fast Food': ['Hot Dog', 'Corn Dog', 'Onion Rings', 'Milkshake']
}
indian_food_items = {
    'Curries': [
        'Paneer Tikka Masala', 'Chana Masala','Paneer Butter Masala',
        'Dal Makhani', 'Palak Paneer', 'Vegetable Korma', 'Lamb Rogan Josh', 'Fish Curry', 'Aloo Gobi', 'Kadhai Paneer'
    ],
    'Rotis': ['Naan', 'Garlic Naan', 'Butter Naan', 'Tandoori Roti', 'Butter Roti', 'Lachha Paratha', 'Egg Paratha',
              'Roti', 'Paratha', 'Aloo Paratha', 'Paneer Paratha'],
    'Rice Dishes': ['Biryani', 'Chicken Biryani', 'Vegetable Biryani', 'Mutton Biryani','Jeera Rice', 'Plain Rice'],
    'Snacks': ['Samosa', 'Pakora', 'Vada Pav', 'Dosa', 'Idli', 'Pani Puri']
}
all_food_items = {
    'Produce': produce_items,
    'Dairy & Eggs': dairy_eggs_items,
    'Meat & Seafood': meat_seafood_items,
    'Pantry': pantry_items,
    'Bakery': bakery_items,
    'Beverages': beverages_items,
    'Frozen': frozen_items,
    'Prepared Foods': prepared_foods_items,
    'Fast Food': fast_food_items,
    'Indian': indian_food_items
}

data_food = []

for category, subcategories in all_food_items.items():
    for subcategory, items in subcategories.items():
        for item in items:
            prices = np.round(np.random.gamma(2, 1.5, desired_count_per_item) * 2, 2)
            if category == 'Fast Food':
                prices *= 0.7
            elif category == 'Meat & Seafood':
                prices *= 1.2
            elif category == 'Indian':
                prices *= 0.9
            temp_df = pd.DataFrame({
                'item': [item] * desired_count_per_item,  # Renamed to 'item'
                'price': prices,
                'category': [category] * desired_count_per_item,  # Consistent name
                'main_category': ['Food'] * desired_count_per_item #Added main category
            })
            data_food.append(temp_df)

df_food = pd.concat(data_food, ignore_index=True)
df_food.dropna(inplace=True)
df_food = df_food[df_food['price'] > 0]
df_food = df_food[df_food['item'].str.len() > 2]
desired_count_per_item_electronics = 10 
desired_count_per_item_clothing = 10     
desired_count_per_item_others =10
shopping_categories = {
    'Electronics': ['Smartphones', 'Laptops', 'TVs', 'Headphones', 'Cameras', 'Smartwatches', 'Tablets', 'Gaming Consoles', "Smart Home Devices", "Audio Equipment"], # New sub-categories
    'Clothing': ['Men\'s Apparel', 'Women\'s Apparel', 'Shoes', 'Accessories', "Kids' Clothing", "Ethnic Wear"],
    'Home & Kitchen': ['Cookware', 'Furniture', 'Appliances', 'Decor', "Kitchen Tools", "Bedding", "Bath"],
    'Beauty & Personal Care': ['Skincare', 'Makeup', 'Haircare', 'Fragrances', "Men's Grooming", "Personal Care Appliances"],
    'Books & Stationery': ['Books', 'Notebooks', 'Pens', 'Office Supplies', "Art Supplies", "School Supplies"],
    'Sports & Outdoors': ['Exercise Equipment', 'Camping Gear', 'Sports Apparel', 'Outdoor Recreation', "Cycling", "Water Sports"],
    'Toys & Games': ['Board Games', 'Action Figures', 'Dolls', 'Puzzles', "Educational Toys", "Outdoor Play"],
    'Automotive': ['Car Parts', 'Car Accessories', 'Tools', 'Tires', "Motorcycle Accessories", "Car Care"],
    'Health & Wellness': ['Vitamins', 'First Aid', 'Personal Care', 'Fitness Trackers', "Supplements", "Medical Supplies"]
}

electronics_brands = {
    'Smartphones': ['iphone', 'Samsung', 'Google', 'OnePlus', 'Xiaomi', 'Oppo', 'Vivo', 'Realme', 'Motorola', 'Nokia', 'Infinix'], 
    'Laptops': ['MAC', 'Dell', 'HP', 'Lenovo', 'ASUS', 'Acer', 'Microsoft Surface', 'MSI', 'Razer', 'LG Gram'], 
    'TVs': ['Samsung', 'LG', 'Sony', 'TCL', 'Hisense', 'OnePlus', 'Xiaomi', 'Vu', 'Panasonic', 'Philips'], 
    'Headphones': ['Sony', 'Bose', 'Airpods', 'Sennheiser', 'JBL', 'Boat', 'Skullcandy', 'Jabra', 'Audio-Technica', 'Beyerdynamic', 'OnePlus Buds'],
    'Cameras': ['Canon', 'Sony', 'Nikon', 'Fujifilm', 'Panasonic', 'GoPro', 'Olympus', 'DJI', 'Insta360', 'Ricoh'],
    'Smartwatches': ['Apple Watch', 'Samsung', 'Google', 'Garmin', 'Fitbit', 'Amazfit', 'Noise', 'Fossil', 'TicWatch', 'Huawei'],
    'Tablets': ['ipad', 'Samsung', 'Microsoft', 'Lenovo', 'Amazon', 'Huawei', 'Realme', 'Alcatel', 'Xiaomi Pad'],
    'Gaming Consoles': ['Sony', 'Microsoft', 'Nintendo', 'Valve', 'ASUS', 'Nvidia Shield', 'Logitech G Cloud'],
    "Smart Home Devices": ['Amazon Echo', 'Google Nest', 'Apple HomePod', 'Philips Hue', 'Xiaomi Smart Home', 'TP-Link Kasa'], 
    "Audio Equipment": ['Bose', 'Sonos', 'JBL', 'Yamaha', 'Sony', 'Denon', 'Marantz', 'Marshall'] 
}

clothing_brands = {
    'Men\'s Apparel': ['Nike', 'Adidas', 'Under Armour', 'Levi\'s', 'US Polo', 'Park Avenue',
                      'Allen Solly', 'Peter England', 'Louis Vuitton', 'Van Heussen', 'Raymond', 'Manyavar', 'FabIndia', 'Mufti', 'Indian Terrain'],
    'Women\'s Apparel': ['Zara', 'H&M', 'Forever 21', 'ASOS', 'Mango', 'Biba', 'W for Woman', 'Global Desi', 'Pantaloons', 'Aurelia', 'Libas', 'Soch'],
    'Shoes': ['Nike', 'Adidas', 'Puma', 'Reebok', 'New Balance','Crocs', 'Sketches', 'RedTape', 'Bata', 'Liberty', 'Paragon', 'Woodland', 'Metro Shoes', 'Mochi'],  
    'Accessories': ['Ray-Ban', 'Fossil', 'Michael Kors', 'Gucci', 'Prada', 'Titan', 'Fastrack', 'Lavie', 'Caprese', 'Hidesign'],
    "Kids' Clothing": ['Gini & Jony', 'Lilliput', 'Pantaloons Junior', 'Max Kids', 'Allen Solly Junior', 'Mothercare'],
    "Ethnic Wear": ['Manyavar', 'FabIndia', 'Biba', 'W for Woman', 'Global Desi', 'Soch', 'Meena Bazaar', 'Nalli Silks']
}
home_kitchen_brands = {
    'Cookware': ['Prestige', 'Hawkins', 'Pigeon', 'Cuisinart', 'Calphalon', 'Vinod', 'Borosil', 'Wonderchef'],  
    'Furniture': ['Godrej Interio', 'Nilkamal', 'IKEA', 'Ashley', 'Pepperfry', 'Urban Ladder', 'Durian', 'Wakefit'],  
    'Appliances': ['Samsung', 'LG', 'Whirlpool', 'Godrej', 'Bajaj', 'IFB', 'Voltas', 'Haier', 'Blue Star'],
    'Decor': ['Home Centre', 'FabIndia', 'IKEA', 'Target', 'Chumbak', '@Home', 'D\'Decor', 'Bombay Dyeing'],  
    "Kitchen Tools": ['Pigeon', 'Prestige', 'Tupperware', 'Cello', 'Wonderchef'],  # New category
    "Bedding": ['Bombay Dyeing', 'Spaces', 'Portico New York', 'Raymond Home', 'D\'Decor'],
    "Bath": ['Hindware', 'Jaquar', 'Cera', 'Parryware', 'Kohler']
}

beauty_personal_care_brands = {
    'Skincare': ['Himalaya', 'Biotique', 'Patanjali', 'CeraVe', 'The Ordinary', 'Forest Essentials', 'Kama Ayurveda', 'VLCC'],  
    'Makeup': ['Lakmé', 'Maybelline', 'L\'Oréal', 'MAC', 'Colorbar', 'Sugar Cosmetics', 'Faces Canada', 'Nykaa Cosmetics'], 
    'Haircare': ['Pantene', 'Head & Shoulders', 'L\'Oréal', 'Dove', 'Tresemmé', 'Indulekha', 'Biotique', 'Sunsilk'], 
    'Fragrances': ['Fogg', 'Axe', 'Park Avenue', 'Chanel', 'Dior', 'Engage', 'Yardley', 'Titan Skinn'], 
    "Men's Grooming": ['Beardo', 'The Man Company', 'Ustraa', 'Bombay Shaving Company', 'Nivea Men', 'Gillette'],
    "Personal Care Appliances": ['Philips', 'Braun', 'Panasonic', 'Syska', 'Vega']
}

books_stationery_brands = {
    'Books' : ['Rupa Publications', 'Penguin India', 'HarperCollins India', 'Westland', 'Bloomsbury India', 'Jaico Publishing House', 'Srishti Publishers'],  # Indian publishers
    'Notebooks': ['Classmate', 'Navneet', 'Moleskine', 'Leuchtturm1917', 'Rhodia', 'Sundaram', 'Bilt Matrix'], 
    'Pens': ['Cello', 'Reynolds', 'Pilot', 'Parker', 'Linc', 'Flair', 'Montex', 'Camlin'], 
    'Office Supplies': ['3M', 'Post-it', 'Camlin', 'Faber-Castell', 'Kores', 'Kangaro', 'Nataraj'], 
    "Art Supplies": ['Camlin', 'Faber-Castell', 'Brustro', 'DOMS'],
    "School Supplies": ['Camlin', 'Classmate', 'Navneet', 'Faber-Castell']
}
sports_outdoors_brands = {
    'Exercise Equipment': ['Cosco',  'Nivia',  'NordicTrack', 'Peloton', 'Vector X', 'Kobo', 'Proline Fitness', 'Afton'], 
    'Camping Gear': ['Wildcraft', 'The North Face', 'Patagonia', 'Coleman', 'Quechua', 'Decathlon', 'Columbia'], 
    'Sports Apparel': ['Nike', 'Adidas', 'Under Armour', 'Puma', 'Reebok', 'Shiv Naresh', 'Nivia', 'Decathlon', 'HRX by Hrithik Roshan'], 
    'Outdoor Recreation': ['YETI', 'Osprey', 'Black Diamond', 'Garmin', 'GoPro', 'Decathlon', 'Wildcraft'],
    "Cycling": ['Hero Cycles', 'Firefox', 'Giant', 'Trek', 'BSA', 'Hercules'],
    "Water Sports": ['Speedo', 'Arena', 'TYR', 'Decathlon', 'Cosco']
}

toys_games_brands = {
	'Board Games': ['Funskool', 'Hasbro', 'Mattel', 'Asmodee', 'Zephyr Toys', 'Skillmatics', 'Ratna\'s'],  
    'Action Figures': ['Hasbro', 'Mattel', 'NECA', 'Funko', 'McFarlane Toys'],
    'Dolls': ['Mattel', 'Hasbro', 'MGA Entertainment', 'Barbie', 'American Girl'],
    'Puzzles': ['Ravensburger', 'Frank', 'Buffalo Games', 'Melissa & Doug', 'Ceaco'], # Added Frank
    "Educational Toys": ['Funskool', 'Skillmatics', 'Zephyr Toys', 'Smartivity', 'Einstein Box'],
    "Outdoor Play": ['Funskool', 'Intex', 'Bestway', 'Little Tikes']
}


automotive_brands = {'Car Parts': ['Bosch', 'MRF', 'TVS', 'ACDelco', 'Motherson Sumi', 'Exide', 'Amara Raja', 'Endurance Technologies'], 
                     'Car Accessories': ['WeatherTech', 'Covercraft', '3M', 'JBL', 'Sony', 'Pioneer', 'Blaupunkt', 'Garmin'], 
                     'Tools': ['Stanley', 'DeWalt', 'Taparia', 'Eastman', 'JK Files', 'Bosch', 'Black+Decker'], 
                     'Tires': ['MRF', 'Apollo', 'CEAT', 'Michelin', 'Goodyear', 'JK Tyre', 'Bridgestone', 'Continental'], 
                     'Motorcycle Accessories': ['Studds', 'Vega', 'Steelbird', 'Royal Enfield Accessories', 'Axor'], 
                     'Car Care': ['3M Car Care', 'Meguiar\'s', 'Formula 1', 'Turtle Wax', 'Wavex'], 
                     'Car Brands': ['Tata Motors', 'Mahindra & Mahindra', 'Maruti Suzuki', 'Ashok Leyland', 'Force Motors', 'Hindustan Motors', 'Hyundai', 'Toyota', 
                                    'Honda', 'Kia', 'MG Motor', 'Renault', 'Nissan', 'Volkswagen', 'Skoda', 'Ford', 'Jeep', 'Citroen', 'Mercedes-Benz', 'BMW', 'Audi', 
                                    'Jaguar', 'Land Rover', 'Volvo', 'Lexus', 'Porsche'],
                     'Two Wheeler Brands': ['Hero MotoCorp', 'Bajaj Auto', 'TVS Motor Company', 'Royal Enfield', 'Honda Motorcycle & Scooter India', 'Yamaha Motor India', 
                                            'Suzuki Motorcycle India', 'KTM', 'Piaggio'], 
                     'Commercial Vehicle Brands': ['Tata Motors', 'Mahindra & Mahindra', 'Ashok Leyland', 'Eicher Motors', 'Force Motors', 'BharatBenz', 'Swaraj Mazda']}

health_wellness_brands = {
    'Vitamins': ['Himalaya', 'Patanjali', 'Nature Made', 'Centrum', 'Amway', 'HealthKart', 'MuscleBlaze'],  
    'First Aid': ['Johnson & Johnson', 'Dettol', 'Savlon', 'Band-Aid', 'Hansaplast', 'Crepe Bandage'],
    'Personal Care': ['Dove', 'Nivea', 'Patanjali', 'Himalaya', 'Dabur', 'Biotique', 'VLCC'],
    'Fitness Trackers': ['Fitbit', 'Garmin', 'Mi', 'Noise', 'GOQii', 'OnePlus Band', 'Realme Band'], 
    "Supplements": ['MuscleBlaze', 'Optimum Nutrition', 'Myprotein', 'GNC', 'Amway Nutrilite', 'HealthKart'],
    "Medical Supplies": ['Dr. Morepen', 'Hansaplast', 'Flamingo', 'Tynor']
}

all_shopping_items = {
    'Electronics': electronics_brands,
    'Clothing': clothing_brands,
    'Home & Kitchen': home_kitchen_brands,
    'Beauty & Personal Care': beauty_personal_care_brands,
    'Books & Stationery': books_stationery_brands,
    'Sports & Outdoors': sports_outdoors_brands,
    'Toys & Games': toys_games_brands,
    'Automotive': automotive_brands,
    'Health & Wellness': health_wellness_brands
}

data_shopping = []

for category, subcategories in all_shopping_items.items():
    for subcategory, brands in subcategories.items():
          for brand in brands:
            if category == 'Electronics':
                prices = np.round(np.random.gamma(5, 50, desired_count_per_item_electronics) * 2, 2)
                desired_count = desired_count_per_item_electronics
            elif category == 'Clothing':
                prices = np.round(np.random.gamma(3, 10, desired_count_per_item_clothing) * 2, 2)
                desired_count = desired_count_per_item_clothing
            elif category == 'Automotive':
                prices = np.round(np.random.gamma(4, 25, desired_count_per_item_others) * 2, 2)
                desired_count = desired_count_per_item_others
            elif category == "Books & Stationery":
                prices = np.round(np.random.gamma(2, 5, desired_count_per_item_others) * 2 ,2)
                desired_count = desired_count_per_item_others
            else:
                prices = np.round(np.random.gamma(3, 15, desired_count_per_item_others) * 2, 2)
                desired_count = desired_count_per_item_others


            temp_df = pd.DataFrame({
                'item': [brand] * desired_count, 
                'price': prices,
                'category': [category] * desired_count,  
                'main_category': ['Shopping'] * desired_count
            })
            data_shopping.append(temp_df)

df_shopping = pd.concat(data_shopping, ignore_index=True)
df_shopping.dropna(inplace=True)
df_shopping = df_shopping[df_shopping['price'] > 0]
df_shopping = df_shopping[df_shopping['item'].str.len() > 1]
transportation_categories = {
    'Public Transportation': ['Bus', 'Subway', 'Train', 'Tram', 'Ferry', 'Airplane'],
    'Shared Transportation': ['Rideshare', 'Car Sharing', 'Bike Sharing', 'Scooter Sharing']
}

transportation_options = {
    'Public Transportation': {
        'Bus': ['City Bus', 'Coach Bus', 'Express Bus', 'Double-Decker Bus', 'Minibus'],
        'Subway': ['Local Train', 'Express Train'],
        'Train': ['Commuter Rail', 'Intercity Train', 'High-Speed Train', 'Sleeper Train'],
        'Tram': ['Streetcar', 'Light Rail'],
        'Ferry': ['Passenger Ferry', 'Car Ferry'],
        'Airplane': ['Domestic Flight', 'International Flight', 'Budget Airline', 'Premium Economy', 'Business Class', 'First Class']
    },
    'Shared Transportation': {
        'Rideshare': ['Uber', 'Lyft', 'Taxi', 'Auto'],
        'Car Sharing': ['Zipcar', 'Turo', 'Getaround'],
        'Bike Sharing': ['Citi Bike', 'Lime', 'Jump', 'Divvy'],
        'Scooter Sharing': ['Lime', 'Bird', 'Spin', 'Tier']
    }
}

data_transportation = []

for category, subcategories in transportation_options.items():
    for subcategory, options in subcategories.items():
        for option in options:
            if category == 'Public Transportation':
                if subcategory == 'Airplane':
                    prices = np.round(np.random.gamma(3, 40, desired_count_per_item) * 3, 2)
                else:
                    prices = np.round(np.random.uniform(1, 10, desired_count_per_item), 2)
            elif category == 'Shared Transportation':
                if subcategory == 'Rideshare':
                    prices = np.round(np.random.gamma(2, 5, desired_count_per_item) * 1.5, 2)
                elif subcategory in ('Car Sharing', 'Bike Sharing', 'Scooter Sharing'):
                    prices = np.round(np.random.gamma(2, 3, desired_count_per_item), 2)
                else:
                    prices = np.round(np.random.uniform(1,5, desired_count_per_item),2)
            temp_df = pd.DataFrame({
                'item': [option] * desired_count_per_item,
                'price': prices,
                'category': [category] * desired_count_per_item,
                'main_category': ['Transportation'] * desired_count_per_item
            })
            data_transportation.append(temp_df)

df_transportation = pd.concat(data_transportation, ignore_index=True)
df_transportation.dropna(inplace=True)
df_transportation = df_transportation[df_transportation['price'] >= 0]
df_merged = pd.concat([df_food, df_shopping, df_transportation], ignore_index=True)
df_merged.dropna(inplace=True)
df_merged = df_merged[df_merged['price'] >= 0]
print(df_merged.groupby(['item', 'category', 'main_category']).size().reset_index(name='count'))

# Save to CSV
df_merged.to_csv('synthetic_merged_data.csv', index=False)
print(f"\nGenerated {len(df_merged)} rows of data.  File saved as 'synthetic_merged_data.csv'")

            item               category   main_category  count
0        2% Milk           Dairy & Eggs            Food     10
1             3M             Automotive        Shopping     10
2             3M     Books & Stationery        Shopping     10
3    3M Car Care             Automotive        Shopping     10
4          @Home         Home & Kitchen        Shopping     10
..           ...                    ...             ...    ...
684  Zephyr Toys           Toys & Games        Shopping     20
685       Zipcar  Shared Transportation  Transportation     10
686     Zucchini                Produce            Food     10
687         ipad            Electronics        Shopping     10
688       iphone            Electronics        Shopping     10

[689 rows x 4 columns]

Generated 7560 rows of data.  File saved as 'synthetic_merged_data.csv'


In [481]:
import sklearn 
from sklearn.preprocessing import LabelEncoder
import catboost
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [555]:
df_merged = pd.read_csv('synthetic_merged_data.csv')
df_merged.head()

Unnamed: 0,item,price,category,main_category
0,Apple,1.19,Produce,Food
1,Apple,7.02,Produce,Food
2,Apple,5.13,Produce,Food
3,Apple,8.37,Produce,Food
4,Apple,0.67,Produce,Food


In [556]:
#PREDICTING MAIN_CATEGORY

In [557]:
le = LabelEncoder()

In [558]:
x = df_merged[['item']]
y = df_merged['main_category']
x['item'] = x['item'].astype(str)

In [559]:
y = le.fit_transform(y)

In [560]:
cat_features = [0]
model_shop = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    verbose=10,
    cat_features = cat_features
)
model_shop.fit(x,y)

0:	learn: 0.9232751	total: 23.3ms	remaining: 2.3s
10:	learn: 0.2983214	total: 342ms	remaining: 2.77s
20:	learn: 0.1535253	total: 654ms	remaining: 2.46s
30:	learn: 0.1056985	total: 974ms	remaining: 2.17s
40:	learn: 0.0849907	total: 1.38s	remaining: 1.98s
50:	learn: 0.0779781	total: 1.73s	remaining: 1.66s
60:	learn: 0.0750743	total: 2.13s	remaining: 1.36s
70:	learn: 0.0734783	total: 2.52s	remaining: 1.03s
80:	learn: 0.0727290	total: 2.88s	remaining: 677ms
90:	learn: 0.0723322	total: 3.29s	remaining: 326ms
99:	learn: 0.0720842	total: 3.62s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x15713fda440>

In [561]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [562]:
test_pool = Pool(x_test, cat_features = [0])
y_pred = model_shop.predict(test_pool)
score = accuracy_score(y_test, y_pred)
score

0.9989417989417989

In [563]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [564]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
item_vectors = vectorizer.fit_transform(df_merged['item'])
nn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
nn_model.fit(item_vectors)

In [565]:
def get_similar_brand(item_name, n, threshold=0.3):
    if item_name in df_merged['item'].values:
        item_index = df_merged[df_merged['item'] == item_name].index[0]
        input_vector = item_vectors[item_index]
    else:
        try:
            input_vector = vectorizer.transform([item_name])
            if np.all(input_vector.toarray() == 0):
                return pd.DataFrame({'item': ['Others']})
        except ValueError:
            return pd.DataFrame({'item': ['Others']})

    distances, indices = nn_model.kneighbors(input_vector, n_neighbors=min(n, len(df_merged)))
    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if dist <= threshold:
            results.append(df_merged.iloc[idx]['item'])
    if not results:
        return pd.DataFrame({'item': ['Others']})

    return pd.DataFrame({'item': results}).drop_duplicates()


In [568]:
similar_item = get_similar_brand('volvo', 1, threshold=0.3).values
if similar_item == 'Others':
    result = 'Others'
else:
    pred_pool = Pool(similar_item.reshape(1, -1), cat_features=cat_features)
    y_pred = model_shop.predict(pred_pool)
    result = le.inverse_transform(y_pred)

In [569]:
result

array(['Shopping'], dtype=object)

In [252]:
#PREDICTING CATEGORY

In [498]:
le1 = LabelEncoder()

In [499]:
x1 = df_merged[['item']]
y1 = df_merged['category']
x1['item'] = x1['item'].astype(str)

In [500]:
y1 = le1.fit_transform(y1)

In [501]:
cat_features = [0]
model_shop = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    verbose=10,
    cat_features = cat_features
)
model_shop.fit(x1,y1)

0:	learn: 2.4491843	total: 111ms	remaining: 33.1s
10:	learn: 1.1482699	total: 1.13s	remaining: 29.6s
20:	learn: 0.8071385	total: 2.18s	remaining: 29s
30:	learn: 0.6368930	total: 3.35s	remaining: 29.1s
40:	learn: 0.5436531	total: 4.41s	remaining: 27.9s
50:	learn: 0.4896938	total: 5.46s	remaining: 26.7s
60:	learn: 0.4509513	total: 6.55s	remaining: 25.7s
70:	learn: 0.4253922	total: 7.62s	remaining: 24.6s
80:	learn: 0.4086717	total: 8.63s	remaining: 23.3s
90:	learn: 0.3976587	total: 9.63s	remaining: 22.1s
100:	learn: 0.3864800	total: 10.7s	remaining: 21s
110:	learn: 0.3764807	total: 11.7s	remaining: 20s
120:	learn: 0.3696363	total: 12.8s	remaining: 18.9s
130:	learn: 0.3639872	total: 13.9s	remaining: 17.9s
140:	learn: 0.3589365	total: 15s	remaining: 16.9s
150:	learn: 0.3551688	total: 16s	remaining: 15.8s
160:	learn: 0.3520983	total: 17s	remaining: 14.7s
170:	learn: 0.3489604	total: 18s	remaining: 13.6s
180:	learn: 0.3452940	total: 19s	remaining: 12.5s
190:	learn: 0.3420093	total: 20.1s	rema

<catboost.core.CatBoostClassifier at 0x15714acd8a0>

In [502]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.25)

In [503]:
test_pool1 = Pool(x1_test, cat_features = [0])
y1_pred = model_shop.predict(test_pool1)
score1 = accuracy_score(y1_test, y1_pred)
score1

0.9523809523809523

In [504]:
vectorizer1 = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
item_vectors1 = vectorizer1.fit_transform(df_merged['item'])
nn_model1 = NearestNeighbors(n_neighbors=5, metric='cosine')
nn_model1.fit(item_vectors)

In [519]:
def get_similar_brand1(item_name1, n1, threshold=0.3):
    if item_name1 in df_merged['item'].values:
        item_index1 = df_merged[df_merged['item'] == item_name1].index[0]
        input_vector1 = item_vectors1[item_index1]
    else:
        try:
            input_vector1 = vectorizer1.transform([item_name1])
            if np.all(input_vector1.toarray() == 0):   
                return pd.DataFrame({'item': ['Others']})
        except ValueError:
            return pd.DataFrame({'item': ['Others']})

    distances1, indices1 = nn_model1.kneighbors(input_vector1, n_neighbors=min(n1, len(df_merged)))

    results1 = []
    for idx, dist in zip(indices1[0], distances1[0]):
        if dist <= threshold: 
            results1.append(df_merged.iloc[idx]['item'])

    if not results1:  # If no valid match found
        return pd.DataFrame({'item': ['Others']})

    return pd.DataFrame({'item': results1}).drop_duplicates()


In [520]:
similar_item1 = get_similar_brand1('chicken butter masala', 1).values
if similar_item1 == 'Others':
    result = 'Others'
else:
    pred_pool1 = Pool(similar_item1.reshape(1, -1), cat_features=cat_features)
    y1_pred = model_shop.predict(pred_pool1)
    result = le1.inverse_transform(y1_pred)

In [521]:
result

array(['Indian'], dtype=object)