#Import necessary libraries


In [None]:
import pandas as pd
import requests
import json
import time

# This script scrapes detailed restaurant and menu data from Swiggy using a list of restaurant IDs.
It enriches each menu item with an inferred spice level and a gluten-free tag, and records any allergen information listed for the restaurant.
All data is saved in a structured JSON format (rag_knowledge_base.json) for use in a RAG-based chatbot.

Key features:
- Extracts restaurant metadata, offers, cuisines, location
- Tags menu items as gluten-free and infers spice level from descriptions
- Adds allergen and vegan indicators
- Saves a clean, chatbot-ready knowledge base

In [None]:
# Swiggy restaurant IDs to scrape, kept as strings because they are
# interpolated directly into the request URL.
restaurant_ids = [
    "545333", "993611", "113744", "898178", "326440", "1018574", "210285",
    "1063422", "426597", "1023795", "426151", "253765", "200417", "215656",
    "999058", "547809", "108986", "396239", "1075083", "1019558", "390161",
    "770772", "1000505", "660597", "117228", "952036", "395878", "215232",
    "290381", "890738", "116534", "1079731", "1027017", "932681", "935777",
    "917285", "116439", "650969", "341805", "860970", "116525", "117155",
    "1024542", "564070", "485771", "25251", "244483", "1025133", "320649",
    "117150", "1019557", "216340", "903323", "392972", "635891", "245321",
]

# HTTP headers sent with every request.
headers = {"User-Agent": "Mozilla/5.0"}

# Pause between restaurant requests: toggle and duration in seconds.
use_delay = True
delay_secs = 1.2


# Collect the lower-cased names of dishes returned by Swiggy's search for
# "Gluten Free". This is best-effort: any failure is reported and leaves the
# set empty (or partially filled) so the rest of the notebook can still run.
gluten_dishes = set()
try:
    gluten_url = 'https://www.swiggy.com/dapi/restaurants/search/v3?lat=29.86370&lng=77.88350&str=Gluten%20Free'
    # A timeout keeps a stalled connection from blocking the notebook forever.
    gluten_response = requests.get(gluten_url, headers=headers, timeout=15)
    # Surface HTTP errors explicitly instead of failing later while parsing.
    gluten_response.raise_for_status()
    gluten_json = gluten_response.json()

    for card in (gluten_json.get('data') or {}).get('cards') or []:
        dish_cards = card.get('groupedCard', {}).get('cardGroupMap', {}).get('DISH', {}).get('cards', [])
        for item in dish_cards:
            info = item.get('card', {}).get('card', {}).get('info', {})
            if not info:
                continue
            # `name` may be missing or null; never add an empty name, since it
            # would later match every unnamed menu item.
            dish_name = (info.get('name') or '').lower()
            if dish_name:
                gluten_dishes.add(dish_name)
except Exception as e:
    print(f"[⚠] Failed to fetch gluten-free dishes: {e}")

# Scrape the menu page of every restaurant in `restaurant_ids` and build one
# knowledge-base record per restaurant. Uses `headers`, `use_delay`,
# `delay_secs` and `gluten_dishes` defined earlier in the notebook; the
# resulting records accumulate in `all_data`.

# Without a timeout a stalled connection would block the whole run.
REQUEST_TIMEOUT_SECS = 15

# Only an explicit "gluten free" claim counts. A bare "gluten" substring match
# would also tag text such as "contains gluten" as gluten-free.
GLUTEN_FREE_PHRASES = ("gluten free", "gluten-free", "glutenfree")


def _infer_spice_level(description):
    """Return "High", "Low" or "Medium" based on keywords in *description*."""
    text = description.lower()
    if "spicy" in text:
        return "High"
    if "mild" in text:
        return "Low"
    return "Medium"


def _is_gluten_free(name, description, known_gluten_free):
    """Return True when the item is a known gluten-free dish or says so itself.

    *known_gluten_free* is a set of lower-cased dish names.
    """
    lowered_name = name.lower()
    if lowered_name and lowered_name in known_gluten_free:
        return True
    text = f"{lowered_name} {description.lower()}"
    return any(phrase in text for phrase in GLUTEN_FREE_PHRASES)


def _build_menu_item(item_info, category, known_gluten_free):
    """Convert one raw item ``info`` dict into a menu entry for the knowledge base."""
    name = item_info.get('name') or ""
    description = item_info.get('description') or ""
    # `ratings` / `aggregatedRating` may be absent or null.
    rating_info = (item_info.get("ratings") or {}).get("aggregatedRating") or {}
    return {
        "name": name,
        "description": description,
        "category": category,
        # NOTE(review): the API value is divided by 100, presumably paise -> rupees; confirm.
        "price": (item_info.get('price') or item_info.get('defaultPrice') or 0) / 100,
        "rating": rating_info.get("rating"),
        "rating_count": rating_info.get("ratingCountV2", "0 ratings"),
        "spice_level": _infer_spice_level(description),
        "gluten_free": _is_gluten_free(name, description, known_gluten_free),
    }


def _build_restaurant_record(restro_id, data, known_gluten_free):
    """Build the restaurant record (metadata plus menu) from a menu API payload.

    Raises KeyError/IndexError/ValueError when the payload does not have the
    expected layout; the caller reports the failure and moves on.
    """
    # NOTE(review): the card positions (2 = restaurant info, 5 = menu) are
    # positional and will break if the API reorders its cards.
    info = data['data']['cards'][2]['card']['card']['info']
    menu_cards = data['data']['cards'][5]['groupedCard']['cardGroupMap']['REGULAR']['cards']

    labels = info.get('labels') or []
    cuisines = info.get("cuisines") or []
    # "lat,lng" string; fall back to the origin when missing or null.
    latitude, longitude = (info.get("latLong") or "0,0").split(',')[:2]

    # The last label that mentions allergens wins.
    allergen_info = None
    for label in labels:
        if "allergen" in (label.get('message') or '').lower():
            allergen_info = label.get('message')

    menu = []
    for card in menu_cards:
        card_info = card.get('card', {}).get('card', {})
        if 'itemCards' not in card_info:
            continue
        category = card_info.get('title', 'General')
        menu.extend(
            _build_menu_item(item['card']['info'], category, known_gluten_free)
            for item in card_info['itemCards']
        )

    return {
        "restaurant_id": restro_id,
        "restaurant_name": info.get("name"),
        "location": info.get("areaName"),
        "address": next((label['message'] for label in labels if label.get('title') == 'Address'), "Not Available"),
        "latitude": float(latitude),
        "longitude": float(longitude),
        "cuisines": cuisines,
        "offers": (info.get("aggregatedDiscountInfo") or {}).get("descriptionList", []),
        "pure_vegetarian": info.get("veg", False),
        "vegan": "Vegan" in cuisines,
        "allergen_info": allergen_info,
        "operating_hours": (info.get("availability") or {}).get("nextCloseTime", "Not Available"),
        "menu": menu,
    }


all_data = []
total_restaurants = len(restaurant_ids)

for idx, restro_id in enumerate(restaurant_ids, start=1):
    try:
        url = f'https://www.swiggy.com/mapi/menu/pl?page-type=REGULAR_MENU&complete-menu=true&lat=29.86370&lng=77.88350&restaurantId={restro_id}'
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECS)
        # Report HTTP errors directly instead of as an opaque parsing failure.
        res.raise_for_status()

        restaurant = _build_restaurant_record(restro_id, res.json(), gluten_dishes)
        all_data.append(restaurant)
        print(f"[✔] ({idx}/{total_restaurants}) Extracted: {restaurant['restaurant_name']}")
    except Exception as e:
        # Best-effort scrape: report the failure and continue with the next ID.
        print(f"[✘] ({idx}/{total_restaurants}) Failed for restaurant ID {restro_id}: {e}")

    # Throttle after every attempt, including failed ones, so a run of errors
    # does not turn into a burst of back-to-back requests.
    if use_delay:
        time.sleep(delay_secs)

# Write the scraped records to disk as pretty-printed UTF-8 JSON, keeping
# non-ASCII characters (e.g. the rupee sign) unescaped.
with open("rag_knowledge_base.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(all_data, indent=2, ensure_ascii=False))

print("\n✅ RAG knowledge base saved to 'rag_knowledge_base.json'")


[✔] (1/56) Extracted: Chinese Wok
[✔] (2/56) Extracted: Signature Cafe And Restaurant
[✔] (3/56) Extracted: Pluto's Restaurant
[✔] (4/56) Extracted: Pal Point
[✔] (5/56) Extracted: Punjabi Angithi (Vegorama Group)
[✔] (6/56) Extracted: Donne Biryani @99
[✔] (7/56) Extracted: Snacks Point
[✔] (8/56) Extracted: Le
[✔] (9/56) Extracted: The Cave- Hotel Siddharth
[✔] (10/56) Extracted: Ember
[✔] (11/56) Extracted: Motel Blue Sapphire
[✔] (12/56) Extracted: McDonald's
[✔] (13/56) Extracted: Da Vinci's
[✔] (14/56) Extracted: Dehradun Noodles Point
[✔] (15/56) Extracted: Punjabi Food Junction
[✔] (16/56) Extracted: Theobroma
[✔] (17/56) Extracted: NIC Ice Creams
[✔] (18/56) Extracted: NIC Ice Creams
[✔] (19/56) Extracted: Bittu Sweets
[✔] (20/56) Extracted: Pinch Of Taste
[✔] (21/56) Extracted: NH1 Bowls - Highway To North
[✔] (22/56) Extracted: Olio - The Wood Fired Pizzeria
[✔] (23/56) Extracted: Asian Kitchen
[✔] (24/56) Extracted: Five Star Janta
[✔] (25/56) Extracted: Tamarind Restaurant

In [None]:
# Load the saved knowledge base into a DataFrame and preview the first five rows.
df_new = pd.read_json(path_or_buf='rag_knowledge_base.json')
df_new.head(5)

Unnamed: 0,restaurant_id,restaurant_name,location,address,latitude,longitude,cuisines,offers,pure_vegetarian,vegan,allergen_info,operating_hours,menu
0,545333,Chinese Wok,Rajouri Garden,"First Floor, Block No- J2, Plot No- 21, Rajour...",28.646849,77.118479,"[Chinese, Asian]","[{'meta': '66% Off | Use SWIGGY6', 'discountTy...",False,False,,2025-04-23 01:00:00,[{'name': 'Veg Chilli Garlic Noodles - Half (5...
1,993611,Signature Cafe And Restaurant,IIT_Roorkee,"2 Civil Lines Roorkee ,Roorkee Town,Roorkee,Ha...",29.873412,77.894032,"[Snacks, Pizzas]",[{'meta': '50% off up to ₹100 | Use code TRYNE...,True,False,,2025-04-22 23:00:00,"[{'name': 'Cappuccino', 'description': '', 'ca..."
2,113744,Pluto's Restaurant,Kodambakkam,No 4 Khan Street Choolaimedu Hogh Road-600094,13.06545,80.23095,"[Indian, Chinese]",[{'meta': '30% off up to ₹75 | Use code TRYNEW...,False,False,,2025-04-22 23:59:00,"[{'name': 'Garlic Vegetable', 'description': '..."
3,898178,Pal Point,IIT_Roorkee,"Shop No : 0 , Floor : 0 , 3, Civil Lines, near...",29.874293,77.893944,"[Chinese, Pastas]","[{'meta': 'Flat ₹25 off| Above ₹299', 'discoun...",False,False,,2025-04-22 23:00:00,"[{'name': 'Kurkure momos', 'description': '',..."
4,326440,Punjabi Angithi (Vegorama Group),Paschim Vihar,"A4, 22 DDA market, Opp. Balaji Hospital, Pasch...",28.67274,77.109001,"[North Indian, Chinese]","[{'meta': 'Flat ₹166 Off | Use SWIGGY6', 'disc...",True,False,,2025-04-22 23:30:00,"[{'name': 'Punjabi Spicy Chaap Tikka', 'descri..."


In [None]:
# Inspect the menu list stored in the row labelled 2.
df_new.loc[2, 'menu']


[{'name': 'Garlic Vegetable',
  'description': '',
  'category': 'Recommended',
  'price': 185.0,
  'rating': None,
  'rating_count': '0 ratings',
  'spice_level': 'Medium',
  'gluten_free': False},
 {'name': 'Fish Finger',
  'description': "Crispy and flavorful, this starter is a seafood lover's delight.",
  'category': 'Recommended',
  'price': 295.0,
  'rating': '5.0',
  'rating_count': '19',
  'spice_level': 'Medium',
  'gluten_free': False},
 {'name': 'Chicken Lollipop',
  'description': 'Juicy, tender chicken coated in a delectable blend of flavors, perfect for those seeking a tantalizing non-veg starter.',
  'category': 'Recommended',
  'price': 285.0,
  'rating': '5.0',
  'rating_count': '27',
  'spice_level': 'Medium',
  'gluten_free': False},
 {'name': 'Dragon Chicken',
  'description': 'Savor the exotic flavors of this delectable non-vegetarian starter that will transport your taste buds on a flavorful journey.',
  'category': 'Recommended',
  'price': 285.0,
  'rating': '3.

In [None]:
# Count the menu items, across all restaurants, whose gluten_free flag is True.
count = sum(
    1
    for restaurant_menu in df_new['menu']
    for dish in restaurant_menu
    if dish['gluten_free'] == True  # noqa: E712 - keep the original equality test
)
print(count)





21


# Check for null values in the data

In [None]:
 # Print each column's dtype and non-null count to spot missing values.
 df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   restaurant_id    64 non-null     int64  
 1   restaurant_name  64 non-null     object 
 2   location         64 non-null     object 
 3   address          64 non-null     object 
 4   latitude         64 non-null     float64
 5   longitude        64 non-null     float64
 6   cuisines         64 non-null     object 
 7   offers           64 non-null     object 
 8   pure_vegetarian  64 non-null     bool   
 9   vegan            64 non-null     bool   
 10  allergen_info    0 non-null      float64
 11  operating_hours  64 non-null     object 
 12  menu             64 non-null     object 
dtypes: bool(2), float64(3), int64(1), object(7)
memory usage: 5.8+ KB


# Dropping 'allergen_info' (all values are NaN) and 'restaurant_id' (not useful for analysis)

In [None]:
# Remove the two columns in place; the DataFrame object itself is modified.
df_new.drop(['restaurant_id', 'allergen_info'], axis='columns', inplace=True)

# save the data

In [None]:
# Write the cleaned DataFrame as JSON to the file named 'data_json'.
df_new.to_json(path_or_buf='data_json')