In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

## DataCollection:

In [29]:
#URL for Yellow Pages
base_url = "https://www.yellowpages.com"
search_url = "https://www.yellowpages.com/search?search_terms=Indian+Restaurants&geo_location_terms=INDIA"

# Send HTTP request to the search page
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(search_url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Extract data from the search results
businesses = []
results = soup.find_all("div", class_="result")

for result in results:
    # Business Name
    name_tag = result.find("a", class_="business-name")
    name = name_tag.text.strip() if name_tag else None

    # Phone Number
    phone_tag = result.find("div", class_="phones")
    phone = phone_tag.text.strip() if phone_tag else None

    # Category
    category_tag = result.find("div", class_="categories")
    category = category_tag.text.strip() if category_tag else None

    # Website Link
    website_tag = result.find("a", class_="track-visit-website")
    website = website_tag['href'] if website_tag else None

    # Details Page Link
    details_link = base_url + name_tag['href'] if name_tag and 'href' in name_tag.attrs else None

    # Add to list for further processing
    businesses.append({
        "Name": name,
        "Phone": phone,
        "Category": category,
        "Website": website,
        "Details_Link": details_link
    })

# Step 2: Visit each details page to extract address, hours, and places near
for business in businesses:
    if business["Details_Link"]:
        try:
            details_response = requests.get(business["Details_Link"], headers=headers)
            details_soup = BeautifulSoup(details_response.text, "html.parser")

            # Extract Address
            address_tag = details_soup.find("span", class_="address")
            address = address_tag.text.strip() if address_tag else None
            business["Address"] = address

            # Extract Hours
            hours_section = details_soup.find("section", id="aside-hours")
            if hours_section:
                hours_table = hours_section.find("table")
                hours = "\n".join([
                    f"{row.find('th').text.strip()}: {row.find('td').text.strip()}"
                    for row in hours_table.find_all("tr")
                ]) if hours_table else None
            else:
                hours = None
            business["Hours"] = hours

            # Extract Places Near
            places_near_section = details_soup.find("section", class_="cross-links")
            if places_near_section:
                places = places_near_section.find_all("li")
                places_near = ", ".join([
                    place.text.strip() for place in places
                ]) if places else None
            else:
                places_near = None
            business["Places_Near"] = places_near

            # Wait to prevent getting blocked
            time.sleep(1)

        except Exception as e:
            print(f"Error fetching details for {business['Name']}: {e}")
            business["Address"] = None
            business["Hours"] = None
            business["Places_Near"] = None
    else:
        business["Address"] = None
        business["Hours"] = None
        business["Places_Near"] = None

In [30]:
# Convert data to a DataFrame and save to CSV
df = pd.DataFrame(businesses)
df.to_csv("yellowpages_indian_restaurants_with_hours_places.csv", index=False)

print("Scraping complete. Data saved to 'yellowpages_indian_restaurants_with_hours_places.csv'")

Scraping complete. Data saved to 'yellowpages_indian_restaurants_with_hours_places.csv'


In [31]:
data = pd.read_csv("yellowpages_indian_restaurants_with_hours_places.csv")
data.head()

Unnamed: 0,Name,Phone,Category,Website,Details_Link,Address,Hours,Places_Near
0,Indian Kitchen,(760) 610-2407,Indian RestaurantsRestaurants,https://indiankitchenindianwells.com,https://www.yellowpages.com/indian-wells-ca/mi...,"74901 Us Highway 111Indian Wells, CA 92210",Tue:: 5:00 am - 10:00 pm\nWed:: 12:00 am - 10:...,"Palm Desert (1 miles), Rancho Mirage (5 miles)..."
1,Indian Wells Resort Dining & Entertainment,(760) 797-8700,Indian RestaurantsRestaurants,https://indianwellsresort.com/dining-entertain...,https://www.yellowpages.com/indian-wells-ca/mi...,"76661 Us Highway 111Indian Wells, CA 92210",Mon - Sun:: Open 24 Hours,"Palm Desert (2 miles), Rancho Mirage (6 miles)..."
2,Naan House,(760) 321-2266,Indian RestaurantsRestaurantsCaterers,,https://www.yellowpages.com/rancho-mirage-ca/m...,"72817 Dinah Shore Dr Ste 103Rancho Mirage, CA ...",Mon - Sun:: 11:00 am - 9:00 pm,"Thousand Palms (2 miles), Cathedral City (7 mi..."
3,India Oven,(760) 770-3918,Indian RestaurantsMiddle Eastern RestaurantsRe...,https://www.indiaovenps.com,https://www.yellowpages.com/cathedral-city-ca/...,"35875 Date Palm DrCathedral City, CA 92234",Mon - Wed:: 11:00 am - 9:00 pm\nFri - Sun:: 11...,"Rancho Mirage (7 miles), Thousand Palms (7 mil..."
4,Monsoon,(760) 325-2700,Indian RestaurantsAsian RestaurantsRestaurants,http://monsoonindiancuisine.com,https://www.yellowpages.com/palm-springs-ca/mi...,"555 S Sunrise Way Ste 107Palm Springs, CA 92264",Mon - Thu:: 11:30 am - 9:30 pm\nFri - Sat:: 11...,"Cathedral City (7 miles), North Palm Springs (..."


## Data Processing andCleaning:

In [32]:
print("Initial Data Overview:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())
print("\nDuplicate Rows:")
print(df.duplicated().sum())

Initial Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          9 non-null      object
 1   Phone         9 non-null      object
 2   Category      9 non-null      object
 3   Website       6 non-null      object
 4   Details_Link  9 non-null      object
 5   Address       9 non-null      object
 6   Hours         9 non-null      object
 7   Places_Near   9 non-null      object
dtypes: object(8)
memory usage: 708.0+ bytes
None

Missing Values:
Name            0
Phone           0
Category        0
Website         3
Details_Link    0
Address         0
Hours           0
Places_Near     0
dtype: int64

Duplicate Rows:
0


#### Handle missing

In [33]:
df.drop(columns=['Website','Details_Link'], inplace=True)

#### Handle duplicate

In [34]:
df.drop_duplicates(subset=['Name', 'Phone'], keep='first', inplace=True)

#### Fixing Inconsistencies

In [35]:
df['Phone'] = df['Phone'].str.replace(r"\s+", "", regex=True)
df['Phone'] = df['Phone'].apply(lambda x: '+1 ' + x if not x.startswith('+') else x)

# Strip leading/trailing spaces from Address and Category
df['Address'] = df['Address'].str.strip()
df['Category'] = df['Category'].str.strip()

#### Standardizing the Data

In [36]:
df['Category'] = df['Category'].str.title()
df['Places_Near'] = df['Places_Near'].str.title()

# Reformat 'Hours' for better readability (split each day as a separate entry if needed)
df['Hours'] = df['Hours'].str.replace(r"\\n", ", ", regex=True)  # Replace newline character with a comma

# Step 7: Save Cleaned Data
output_file = "yellowpages_indian_restaurants_cleaned.csv"
df.to_csv(output_file, index=False)

print(f"\nCleaned data saved to {output_file}")


Cleaned data saved to yellowpages_indian_restaurants_cleaned.csv


In [37]:
data = pd.read_csv("yellowpages_indian_restaurants_cleaned.csv")
data.head()

Unnamed: 0,Name,Phone,Category,Address,Hours,Places_Near
0,Indian Kitchen,+1 (760)610-2407,Indian Restaurantsrestaurants,"74901 Us Highway 111Indian Wells, CA 92210",Tue:: 5:00 am - 10:00 pm\nWed:: 12:00 am - 10:...,"Palm Desert (1 Miles), Rancho Mirage (5 Miles)..."
1,Indian Wells Resort Dining & Entertainment,+1 (760)797-8700,Indian Restaurantsrestaurants,"76661 Us Highway 111Indian Wells, CA 92210",Mon - Sun:: Open 24 Hours,"Palm Desert (2 Miles), Rancho Mirage (6 Miles)..."
2,Naan House,+1 (760)321-2266,Indian Restaurantsrestaurantscaterers,"72817 Dinah Shore Dr Ste 103Rancho Mirage, CA ...",Mon - Sun:: 11:00 am - 9:00 pm,"Thousand Palms (2 Miles), Cathedral City (7 Mi..."
3,India Oven,+1 (760)770-3918,Indian Restaurantsmiddle Eastern Restaurantsre...,"35875 Date Palm DrCathedral City, CA 92234",Mon - Wed:: 11:00 am - 9:00 pm\nFri - Sun:: 11...,"Rancho Mirage (7 Miles), Thousand Palms (7 Mil..."
4,Monsoon,+1 (760)325-2700,Indian Restaurantsasian Restaurantsrestaurants,"555 S Sunrise Way Ste 107Palm Springs, CA 92264",Mon - Thu:: 11:30 am - 9:30 pm\nFri - Sat:: 11...,"Cathedral City (7 Miles), North Palm Springs (..."
