In [45]:
import json
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [46]:
# Browser-like User-Agent header; get_info passes this dict to requests.get.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.97 Safari/537.36'
    ),
}


In [49]:
# Number of fields in the tuple returned by get_info. It must equal the number
# of columns get_restaurant_info passes to pd.DataFrame (currently 9).
_INFO_FIELD_COUNT = 9


def _find_business_jsonld(scripts):
    """Return the first JSON-LD object typed Restaurant/LocalBusiness, or None."""
    wanted_types = {'Restaurant', 'LocalBusiness'}

    for script in scripts:
        try:
            parsed = json.loads(script.string)
        except (TypeError, ValueError):
            # TypeError: script.string is None (empty tag or nested markup).
            # ValueError: the tag content is not valid JSON.
            continue

        # A JSON-LD script may hold a single object or a list of objects.
        candidates = parsed if isinstance(parsed, list) else [parsed]
        for node in candidates:
            if not isinstance(node, dict):
                continue
            node_type = node.get('@type')
            # '@type' may be a single string or a list of strings.
            type_names = node_type if isinstance(node_type, list) else [node_type]
            if any(isinstance(t, str) and t in wanted_types for t in type_names):
                return node

    return None


def get_info(url):
    """Get information about the restaurant from its page URL.

    Downloads the page, looks for a ``application/ld+json`` script describing
    a ``Restaurant`` or ``LocalBusiness`` and extracts the basic fields.

    Args:
        url: Address of the restaurant page to fetch.

    Returns:
        A 9-tuple ``(name, opening_hours, street, locality, region,
        postal_code, country, telephone, cuisine)``. Fields missing from the
        page are ``None``. When no matching structured data is found a warning
        is printed and a tuple of nine ``None`` values is returned, so every
        result has the same width as the DataFrame columns built from it.

    Raises:
        requests.RequestException: propagated unchanged from ``requests.get``
            (connection errors, the 3 second timeout, ...).
    """
    webpage = requests.get(url, headers=headers, timeout=3)
    html_text = BeautifulSoup(webpage.text, 'lxml')

    scripts = html_text.find_all('script', type='application/ld+json')
    info = _find_business_jsonld(scripts)

    if not info:
        print(f"[WARNING] Structured data not found for URL: {url}")
        # Same width as the success tuple; a different length would make
        # pd.DataFrame(rows, columns=...) fail for the whole batch.
        return (None,) * _INFO_FIELD_COUNT

    # 'address' is expected to be a nested object; fall back to an empty dict
    # when it is absent, null or a plain string so the lookups below are safe.
    address = info.get('address')
    if not isinstance(address, dict):
        address = {}

    return (
        info.get('name'),
        info.get('openingHours'),
        address.get('streetAddress'),
        address.get('addressLocality'),
        address.get('addressRegion'),
        address.get('postalCode'),
        address.get('addressCountry'),
        info.get('telephone'),
        info.get('servesCuisine'),
    )

def save_df(file_name, df):
    """Write ``df`` to ``file_name`` as CSV, omitting the index column."""
    df.to_csv(path_or_buf=file_name, index=False)
    

def get_restaurant_info(url_list, save=True, file_name="Restaurants.csv"):
    """Scrape every URL in ``url_list`` and collect the results in a DataFrame.

    Each URL is passed to ``get_info``; the returned rows become a DataFrame
    with one column per scraped field. When ``save`` is true the frame is also
    written to ``file_name`` through ``save_df``. The DataFrame is returned in
    either case.
    """
    column_names = [
        'Name', 'Opening_Hours',
        'Street', 'Locality', 'Region', 'PostalCode', 'Country', 'Phone',
        'Cuisine',
    ]

    # One scraped row per URL, in the order the URLs were given.
    rows = [get_info(link) for link in url_list]
    info_df = pd.DataFrame(rows, columns=column_names)

    if save:
        save_df(file_name, info_df)

    return info_df

In [51]:
if __name__ == "__main__":
    # Restaurant order pages to scrape.
    urls = [
        'https://www.zomato.com/roorkee/tamarind-restaurant-roorkee-locality/order',
        'https://www.zomato.com/roorkee/olive-multicuisine-restaurant-roorkee-locality/order',
        'https://www.zomato.com/roorkee/rustic-house-roorkee-locality/order',
        'https://www.zomato.com/roorkee/hangries-roorkee-locality/order',
        'https://www.zomato.com/roorkee/milk-bar-roorkee-locality/order',
        'https://www.zomato.com/roorkee/desi-tadka-2-roorkee-locality/order',
        'https://www.zomato.com/roorkee/prakash-restaurant-roorkee-locality/order',
        'https://www.zomato.com/roorkee/hitchki-roorkee-locality/order',
        'https://www.zomato.com/roorkee/tanishas-restaurant-royal-hyderabadi-biryani-roorkee-locality/order',
        'https://www.zomato.com/roorkee/18-down-town-pool-restaurant-roorkee-locality/order',
    ]

    # Default arguments: the scraped table is also saved as "Restaurants.csv".
    get_restaurant_info(urls)

In [71]:
import os
import pandas as pd
import re
import json
import random
from collections import defaultdict

class RestaurantKnowledgeBase:
    """Build a structured, indexed knowledge base from a restaurant CSV.

    Intended call order: ``preprocess_data()`` (cleans the loaded frame and
    adds derived columns), then ``build_knowledge_base()`` (fills
    ``knowledge_base`` and ``indexes``), then optionally
    ``save_knowledge_base(output_dir)``.

    Missing CSV cells (NaN) are tolerated throughout: they become ``None`` in
    the knowledge base so the saved file is valid JSON.
    """

    def __init__(self, csv_path):
        """Load the CSV and create the empty stores.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of the
                input file. The other methods read the columns Name,
                Opening_Hours, Street, Locality, Region, PostalCode, Country,
                Phone and Cuisine.
        """
        self.df = pd.read_csv(csv_path)
        # restaurant_id -> record dict, filled by build_knowledge_base().
        self.knowledge_base = defaultdict(dict)
        # Lookup tables: index name -> key -> list of restaurant ids.
        self.indexes = {
            'by_cuisine': defaultdict(list),
            'by_location': defaultdict(list),
            'by_opening_status': defaultdict(list),
            'by_price_range': defaultdict(list)
        }
        # Sample dishes per cuisine; _generate_menu copies matching entries.
        self.menu_templates = {
            'Chinese': [
                {"name": "Vegetable Spring Rolls", "price": 120, "category": "Starters"},
                {"name": "Chicken Manchurian", "price": 220, "category": "Main Course"},
                {"name": "Veg Fried Rice", "price": 180, "category": "Main Course"},
                {"name": "Chilli Chicken", "price": 250, "category": "Main Course"},
                {"name": "Schezwan Noodles", "price": 200, "category": "Main Course"}
            ],
            'North Indian': [
                {"name": "Paneer Tikka", "price": 220, "category": "Starters"},
                {"name": "Dal Makhani", "price": 180, "category": "Main Course"},
                {"name": "Butter Chicken", "price": 280, "category": "Main Course"},
                {"name": "Garlic Naan", "price": 50, "category": "Breads"},
                {"name": "Gulab Jamun", "price": 90, "category": "Desserts"}
            ],
            'Biryani': [
                {"name": "Vegetable Biryani", "price": 200, "category": "Main Course"},
                {"name": "Chicken Biryani", "price": 250, "category": "Main Course"},
                {"name": "Mutton Biryani", "price": 350, "category": "Main Course"},
                {"name": "Raita", "price": 60, "category": "Sides"},
                {"name": "Mirchi Ka Salan", "price": 80, "category": "Sides"}
            ],
            'Italian': [
                {"name": "Margherita Pizza", "price": 250, "category": "Main Course"},
                {"name": "Pasta Alfredo", "price": 220, "category": "Main Course"},
                {"name": "Garlic Bread", "price": 120, "category": "Starters"},
                {"name": "Tiramisu", "price": 150, "category": "Desserts"},
                {"name": "Minestrone Soup", "price": 160, "category": "Starters"}
            ],
            'Desserts': [
                {"name": "Chocolate Brownie", "price": 120, "category": "Desserts"},
                {"name": "Ice Cream Sundae", "price": 150, "category": "Desserts"},
                {"name": "Cheesecake", "price": 180, "category": "Desserts"},
                {"name": "Gajar Ka Halwa", "price": 100, "category": "Desserts"}
            ]
        }

    # ------------------------------------------------------------------
    # Small value helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _is_missing(value):
        """Return True for None/NaN-like scalars; containers are never missing."""
        if value is None:
            return True
        if isinstance(value, (list, tuple, dict, set)):
            return False
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # pd.isna returned something without a single truth value.
            return False

    @classmethod
    def _clean_scalar(cls, value):
        """Map missing values to None and integral floats to int."""
        if cls._is_missing(value):
            return None
        # A numeric column containing NaN is read as float, turning 247667
        # into 247667.0; undo that so codes and numbers print cleanly.
        if isinstance(value, float) and value.is_integer():
            return int(value)
        return value

    @classmethod
    def _clean_phone(cls, value):
        """Return the phone value as text without quote characters, or None."""
        value = cls._clean_scalar(value)
        if value is None:
            return None
        return str(value).replace('"', '').replace("'", "")

    @staticmethod
    def _lower_text(value):
        """Lower-case strings; leave every other value untouched."""
        return value.lower() if isinstance(value, str) else value

    @staticmethod
    def _coerce_cuisines(value):
        """Turn a raw Cuisine cell into a list of cuisine names.

        A comma-separated string is split and stripped, a list/tuple keeps
        its string members, and anything else (NaN, None, numbers) yields an
        empty list.
        """
        if isinstance(value, str):
            return [part.strip() for part in value.split(',') if part.strip()]
        if isinstance(value, (list, tuple)):
            return [item for item in value if isinstance(item, str)]
        return []

    @classmethod
    def _join_address(cls, parts):
        """Join the non-missing address parts with ', '."""
        cleaned = (cls._clean_scalar(part) for part in parts)
        return ', '.join(str(part) for part in cleaned if part is not None)

    # ------------------------------------------------------------------
    # Row-level transforms
    # ------------------------------------------------------------------

    def _generate_menu(self, cuisines):
        """Generate a menu based on restaurant cuisines.

        Args:
            cuisines: List of cuisine names (a comma-separated string or a
                missing value is also accepted).

        Returns:
            ``(menu, price_range)``: ``menu`` is a list of item dicts (keys
            ``name``, ``price``, ``category``) and ``price_range`` is a string
            of the form ``"₹<min>-₹<max>"`` over all menu items.
        """
        menu = []
        seen_items = set()

        for cuisine in self._coerce_cuisines(cuisines):
            cuisine_lower = cuisine.lower()
            # First template whose name occurs inside the cuisine label.
            base_cuisine = next(
                (c for c in self.menu_templates if c.lower() in cuisine_lower),
                None,
            )
            if base_cuisine is None:
                continue
            for item in self.menu_templates[base_cuisine]:
                if item['name'] not in seen_items:
                    # Copy so editing one restaurant's menu cannot alter the
                    # shared template or another restaurant's menu.
                    menu.append(dict(item))
                    seen_items.add(item['name'])

        beverages = [
            {"name": "Mineral Water", "price": 30, "category": "Beverages"},
            {"name": "Fresh Lime Soda", "price": 60, "category": "Beverages"},
            {"name": "Masala Chai", "price": 40, "category": "Beverages"},
            {"name": "Cold Coffee", "price": 90, "category": "Beverages"}
        ]

        # Two or three random beverages are always added. This uses the
        # module-level random state; call random.seed() first for repeatable
        # menus.
        menu.extend(random.sample(beverages, random.randint(2, 3)))

        # The menu is never empty at this point (beverages were just added),
        # so the range can always be computed from it; the earlier hard-coded
        # fallback range could never be reached and was dropped.
        prices = [item['price'] for item in menu]
        price_range = f"₹{min(prices)}-₹{max(prices)}"

        return menu, price_range

    def _parse_opening_status(self, opening_str):
        """Classify an opening-hours string.

        Returns ``'open_today'`` when it contains ``(Today)``,
        ``'opens_tomorrow'`` when it contains ``Opens tomorrow``, otherwise
        (including missing or non-string input) ``'unknown'``.
        """
        if not isinstance(opening_str, str):
            return 'unknown'
        if '(Today)' in opening_str:
            return 'open_today'
        if 'Opens tomorrow' in opening_str:
            return 'opens_tomorrow'
        return 'unknown'

    def _normalize_opening_hours(self, opening_str):
        """Standardize the opening hours format.

        Missing input gives ``None``. ``Opens tomorrow at <time><am|pm>`` is
        rewritten with a lower-case period. Strings containing ``(Today)``
        lose that marker, have ``12midnight``/``12noon`` spelled as times and
        en dashes replaced by hyphens. Anything else is returned unchanged.
        """
        if self._is_missing(opening_str):
            return None
        if not isinstance(opening_str, str):
            return opening_str

        tomorrow_match = re.match(r'Opens tomorrow at (\d+:\d+|\d+)(am|pm)', opening_str, re.IGNORECASE)
        if tomorrow_match:
            time_part = tomorrow_match.group(1)
            period = tomorrow_match.group(2).lower()
            return f"Opens tomorrow at {time_part}{period}"

        if '(Today)' in opening_str:
            cleaned = opening_str.replace('(Today)', '').strip()
            cleaned = cleaned.replace('12midnight', '12:00am')
            cleaned = cleaned.replace('12noon', '12:00pm')
            cleaned = cleaned.replace('–', '-')
            return cleaned

        return opening_str

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------

    def preprocess_data(self):
        """Clean and normalize the raw data in ``self.df`` (in place).

        Rewrites Phone, Cuisine, Region and Locality and adds the columns
        Opening_Status, Opening_Hours_Normalized and Full_Address. Per-value
        functions are used instead of the ``.str`` accessor so numeric or
        entirely empty columns do not raise, and running the step twice
        leaves the data unchanged.
        """
        df = self.df

        df['Phone'] = df['Phone'].apply(self._clean_phone)
        df['Cuisine'] = df['Cuisine'].apply(self._coerce_cuisines)
        df['Region'] = df['Region'].apply(self._lower_text)
        df['Locality'] = df['Locality'].apply(self._lower_text)
        df['Opening_Status'] = df['Opening_Hours'].apply(self._parse_opening_status)
        df['Opening_Hours_Normalized'] = df['Opening_Hours'].apply(self._normalize_opening_hours)

        # Built with a plain loop rather than df.apply(axis=1), which does not
        # return a Series for an empty frame.
        address_columns = ['Street', 'Locality', 'Region', 'PostalCode', 'Country']
        df['Full_Address'] = [
            self._join_address(parts)
            for parts in df[address_columns].itertuples(index=False, name=None)
        ]

    def build_knowledge_base(self):
        """Create the structured knowledge base and its indexes.

        Requires ``preprocess_data()`` to have run (it reads the derived
        columns). Each row becomes ``knowledge_base["rest_<row label>"]``.
        Existing entries and index lists are cleared first, so calling the
        method again rebuilds rather than duplicates. Rows without a locality
        are indexed under ``'unknown'``.
        """
        clean = self._clean_scalar

        self.knowledge_base.clear()
        for index in self.indexes.values():
            index.clear()

        for row_label, row in self.df.iterrows():
            restaurant_id = f"rest_{row_label}"
            cuisines = self._coerce_cuisines(row['Cuisine'])
            menu, price_range = self._generate_menu(cuisines)

            phone = row['Phone']
            if isinstance(phone, str):
                phones = phone.split(', ')
            elif self._is_missing(phone):
                phones = []
            else:
                phones = [clean(phone)]

            locality = clean(row['Locality'])

            self.knowledge_base[restaurant_id] = {
                'name': clean(row['Name']),
                'address': {
                    'street': clean(row['Street']),
                    'locality': locality,
                    'region': clean(row['Region']),
                    'postal_code': clean(row['PostalCode']),
                    'country': clean(row['Country']),
                    'full_address': row['Full_Address']
                },
                'contact': {
                    'phone': phones
                },
                'cuisine': cuisines,
                'opening_info': {
                    'raw': clean(row['Opening_Hours']),
                    'normalized': clean(row['Opening_Hours_Normalized']),
                    'status': row['Opening_Status']
                },
                'menu': menu,
                'price_range': price_range
            }

            for cuisine in cuisines:
                self.indexes['by_cuisine'][cuisine.lower()].append(restaurant_id)

            location_key = locality if locality is not None else 'unknown'
            self.indexes['by_location'][location_key].append(restaurant_id)
            self.indexes['by_opening_status'][row['Opening_Status']].append(restaurant_id)
            self.indexes['by_price_range'][price_range].append(restaurant_id)

    def save_knowledge_base(self, output_dir):
        """Save the knowledge base and indexes as one JSON file.

        Writes ``restaurants_knowledge_base.json`` inside ``output_dir``
        (created if needed).

        Returns:
            True on success. On any error the message is printed and False is
            returned instead of raising.
        """
        try:
            # Serialise before opening the file so a serialisation error
            # cannot leave a truncated file behind.
            payload = json.dumps({
                'restaurants': self.knowledge_base,
                'indexes': self.indexes
            }, indent=2, ensure_ascii=False)

            os.makedirs(output_dir, exist_ok=True)
            kb_path = os.path.join(output_dir, "restaurants_knowledge_base.json")
            with open(kb_path, 'w', encoding='utf-8') as f:
                f.write(payload)

            print(f"Knowledge base saved to {os.path.abspath(output_dir)}")
            return True
        except Exception as e:
            print(f"Error saving knowledge base: {e}")
            return False
    
    

if __name__ == "__main__":
    # Load the scraped CSV, clean it and build the in-memory knowledge base.
    # The result is left in `kb`; nothing is written to disk here.
    try:
        kb = RestaurantKnowledgeBase("Restaurants.csv")
        kb.preprocess_data()
        kb.build_knowledge_base()
    except FileNotFoundError:
        # Must stay ahead of the generic handler so it is not shadowed.
        print("Error: Input CSV file not found. Please check the file path.")
    except Exception as exc:
        print(f"An unexpected error occurred: {exc}")