In [100]:
import requests
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def build_overpass_query(place_name, bbox=None):
    """Build an Overpass QL query for elements whose ``name`` tag matches.

    The query searches nodes, ways and relations with a ``["name"~"..."]``
    regular-expression filter and requests JSON output.

    Args:
        place_name: Text inserted verbatim as the regular expression of the
            name filter. No escaping is applied, so it must not contain an
            unescaped double quote.
        bbox: Optional filter text appended after each name filter, such as
            ``"(37.4667, -79.3167, 40.3000, -74.7833)"``. When ``None`` or
            empty, no extra filter is added.

    Returns:
        The query text as a string.
    """
    # Every statement in the template is already terminated by ";", so a
    # missing bbox must contribute nothing (inserting ";" would emit ";;").
    bbox_filter = bbox if bbox else ""

    overpass_query = """
    [out:json][timeout:25];
    // gather results
    (
      node["name"~"{0}"]{1};
      way["name"~"{0}"]{1};
      relation["name"~"{0}"]{1};
    );
    out body;
    >;
    out skel qt;
    """.format(place_name, bbox_filter)

    return overpass_query

def run_overpass_query(query, timeout=60):
    """Send an Overpass QL query and collect the named point elements.

    Args:
        query: Overpass QL text, posted as the ``data`` form field.
        timeout: Seconds passed to ``requests.post`` as the client-side
            timeout, so a stalled connection cannot block forever.

    Returns:
        A list of dicts with the keys ``'ID'``, ``'Name'``, ``'Latitude'``
        and ``'Longitude'``, one per response element that has ``lat``,
        ``lon``, ``id`` and a ``name`` tag; other elements are skipped.
        Returns ``None`` when the request fails or the response cannot be
        processed (the error is printed rather than raised).
    """
    overpass_url = "https://overpass-api.de/api/interpreter"
    required_keys = ('lat', 'lon', 'id', 'tags')
    try:
        response = requests.post(overpass_url, data={'data': query}, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions

        result = response.json()

        # A response without an 'elements' key is treated as having no matches.
        elements = result.get('elements', [])

        # Keep only elements that carry their own coordinates, an id and a name.
        return [
            {
                'ID': element['id'],
                'Name': element['tags']['name'],
                'Latitude': element['lat'],
                'Longitude': element['lon'],
            }
            for element in elements
            if all(key in element for key in required_keys)
            and 'name' in element['tags']
        ]

    except requests.exceptions.RequestException as e:
        print(f"Request Error: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on None, not an
        # exception, when the payload is malformed.
        print(f"Error: {e}")
        return None

def get_combined_data(place_names, bbox=None):
    """Query Overpass once per place name and merge all results.

    Args:
        place_names: Iterable of name patterns; each one is turned into a
            query with ``build_overpass_query`` and run with
            ``run_overpass_query``.
        bbox: Optional filter text forwarded unchanged to
            ``build_overpass_query``.

    Returns:
        A single list holding the records of every query, in the order the
        names were given. Queries that return ``None`` or an empty list
        contribute nothing.
    """
    merged = []

    for name in place_names:
        rows = run_overpass_query(build_overpass_query(name, bbox))

        # Skip failed (None) and empty lookups alike.
        if not rows:
            continue
        merged += rows

    return merged

# Example usage:
place_names = ["McDonald's", "Starbucks", "Subway"]
bbox = "(37.4667, -79.3167, 40.3000, -74.7833)"  # Replace with the desired bounding box

combined_data = get_combined_data(place_names, bbox)

# With no records the DataFrame below would have no columns and the first
# column access would fail with an opaque KeyError, so stop with a clear message.
if not combined_data:
    raise RuntimeError("No places were returned by the Overpass queries; nothing to write.")

# Create a DataFrame with ID, Name, X, Y, and Vectorized Name
df = pd.DataFrame(combined_data)
df['X'] = df['Longitude']
df['Y'] = df['Latitude']

# Use LabelEncoder to convert 'Name' to integer values
label_encoder = LabelEncoder()
df['Vectorized Name'] = label_encoder.fit_transform(df['Name'])

# Save only 'X', 'Y', and 'Vectorized Name' columns to a text file without headers
df[['X', 'Y', 'Vectorized Name']].to_csv('query_file.txt', index=False, header=False, sep='\t')

# Count occurrences of each vectorized name
name_counts = df['Vectorized Name'].value_counts()

# Display all names, vectorized names, and counts
for vectorized_name, count in name_counts.items():
    # The encoded integer is the position of the original label in classes_,
    # so a direct lookup replaces a per-row inverse_transform call.
    real_name = label_encoder.classes_[vectorized_name]
    print(f"Vectorized Name: {vectorized_name}, Real Name: {real_name}, Count: {count}")

print("Choose which identifiers you want to keep for your clusters and add to keepIdentifiers array")


Vectorized Name: 7, Real Name: Subway, Count: 623
Vectorized Name: 4, Real Name: Starbucks, Count: 537
Vectorized Name: 1, Real Name: McDonald's, Count: 135
Vectorized Name: 2, Real Name: McDonald's (Edgewood Road), Count: 2
Vectorized Name: 5, Real Name: Starbucks Entrance, Count: 2
Vectorized Name: 0, Real Name: Barnes & Noble/Starbucks, Count: 1
Vectorized Name: 3, Real Name: McDonald's (Edgewood on Pulaski Highway/US 40), Count: 1
Vectorized Name: 6, Real Name: Starbucks Reserve, Count: 1
Vectorized Name: 8, Real Name: Town Center Blvd at McDonald's, Count: 1
Vectorized Name: 9, Real Name: Wave Pool Subway, Count: 1
Choose which identifiers you want to keep for your clusters and add to keepIdentifiers array
