In [10]:
import requests
from bs4 import BeautifulSoup
import re
import csv

def is_dish_name(text):
    """Heuristic: All uppercase or mostly uppercase words."""
    words = text.strip().split()
    if not words:
        return False
    upper_ratio = sum(1 for w in words if w.isupper()) / len(words)
    return upper_ratio > 0.6 or text.isupper()

def scrape_and_clean_paakashala(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    raw_text = soup.get_text(separator="\n")
    lines = [line.strip() for line in raw_text.splitlines() if line.strip()]

    cleaned_menu = []
    current_item = None

    for line in lines:
        if is_dish_name(line):
            # Save previous item if exists
            if current_item:
                cleaned_menu.append(current_item)
            # Start new item
            current_item = {"Item": line.strip(), "Description": ""}
        else:
            # Append to current description if exists
            if current_item:
                if current_item["Description"]:
                    current_item["Description"] += " " + line.strip()
                else:
                    current_item["Description"] = line.strip()

    # Add the last item
    if current_item:
        cleaned_menu.append(current_item)

    return cleaned_menu

# Run the scraper
url = "https://paakashala.com/paakashala-menu/"
menu_data = scrape_and_clean_paakashala(url)

# Save to CSV
csv_path = "paakashala_menu_cleaned.csv"
with open(csv_path, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=["Item", "Description"])
    writer.writeheader()
    writer.writerows(menu_data)

print(f"Scraped {len(menu_data)} menu items and saved to '{csv_path}'")


Scraped 201 menu items and saved to 'paakashala_menu_cleaned.csv'


In [11]:
import pandas as pd

# Load the CSV
df = pd.read_csv("paakashala_menu_cleaned.csv")

# Remove rows with empty or very short items
df = df[df["Item"].str.strip().str.len() > 2]

# Drop duplicates based on Item name
df = df.drop_duplicates(subset="Item", keep="first")

# Remove non-food entries by filtering out addresses or overly long descriptions
df = df[df["Description"].str.len() < 300]  # adjust if needed
df = df[~df["Item"].str.contains("ROAD|BENGALURU|KARNATAKA|INDIA", case=False)]

# Optional: reset index and sort
df = df.reset_index(drop=True)

# Save cleaned file
df.to_csv("paakashala_menu_final.csv", index=False)

print(f"Cleaned menu saved to 'paakashala_menu_final.csv' with {len(df)} unique items.")


Cleaned menu saved to 'paakashala_menu_final.csv' with 85 unique items.


In [13]:
import requests
from bs4 import BeautifulSoup
import csv

def extract_paakashala_info():
    url = "https://paakashala.com/restaurants-kanakapura-road/"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    # Initialize data dictionary
    data = {
        "Name": "Paakashala – Kanakapura Road",
        "Address": "",
        "Phone": "",
        "Email": "",
        "Hours": ""
    }

    # Extract address
    address_tag = soup.find(string=lambda text: "Kanakapura Rd" in text)
    if address_tag:
        data["Address"] = address_tag.strip()

    # Extract phone number
    phone_tag = soup.find(string=lambda text: "Phone" in text)
    if phone_tag:
        data["Phone"] = phone_tag.split(":")[-1].strip()

    # Extract email
    email_tag = soup.find(string=lambda text: "Email" in text)
    if email_tag:
        data["Email"] = email_tag.split(":")[-1].strip()

    # Extract operating hours
    timings_tag = soup.find(string=lambda text: "Timings" in text)
    if timings_tag:
        data["Hours"] = timings_tag.split(":")[-1].strip()

    return data

def save_to_csv(data, filename="paakashala_kanakapura_info.csv"):
    with open(filename, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Field", "Information"])
        for key, value in data.items():
            writer.writerow([key, value])

if __name__ == "__main__":
    info = extract_paakashala_info()
    save_to_csv(info)
    print("Paakashala – Kanakapura Road information saved to 'paakashala_kanakapura_info.csv'")


Paakashala – Kanakapura Road information saved to 'paakashala_kanakapura_info.csv'
