In [3]:
import geopandas as gpd
from shapely.geometry import box
from shapely.ops import unary_union
import itertools
import threading
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import cProfile
import pstats
from io import StringIO
import time
import os
import csv
import requests
import geopandas as gpd
from shapely.geometry import box
import requests
from shapely.ops import unary_union
import pandas as pd
import json
import time
from geopy.distance import geodesic

In [4]:
def load_api_keys(file_path):
    """
    Read API keys from a text file, one key per line.

    Everything from the first '#' on a line is treated as a comment and
    dropped; lines that are empty after stripping are skipped. Problems while
    opening or reading the file are reported with ``print`` instead of being
    raised, and the keys collected so far are returned.

    :param file_path: Path to the API keys file.
    :return: A list of API key strings (empty if the file could not be opened).
    """
    collected = []
    try:
        with open(file_path, "r") as handle:
            for raw_line in handle:
                # Keep only the text in front of an inline '#' comment.
                candidate, _, _ = raw_line.partition('#')
                candidate = candidate.strip()
                if not candidate:
                    continue
                collected.append(candidate)
    except FileNotFoundError:
        print(f"Error: API keys file '{file_path}' not found.")
    except Exception as e:
        print(f"Error loading API keys: {e}")
    return collected

In [None]:
# Location of the text file that holds the API keys
api_keys_file = r"C:\Users\Hewan Shrestha\Desktop\google_poi_search\data\api_keys.txt"

# Read every key from that file
api_keys = load_api_keys(api_keys_file)

# Endless round-robin iterator over the loaded keys
api_key_iterator = itertools.cycle(api_keys)

# Per-key request counter, every key starting at zero
api_request_counts = dict.fromkeys(api_keys, 0)

# Lock shared by the worker threads for the key iterator, the counters and CSV writes
api_lock = threading.Lock()

# Bump the request tally of a single API key
def increment_api_request_count(api_key):
    """
    Add one to the request counter stored for ``api_key``.

    The update runs while ``api_lock`` is held. A key that is not present in
    ``api_request_counts`` raises ``KeyError``.

    :param api_key: The API key whose counter should be incremented.
    """
    api_lock.acquire()
    try:
        api_request_counts[api_key] = api_request_counts[api_key] + 1
    finally:
        api_lock.release()

# Function to get the next API key
def get_next_api_key():
    """
    Return the next API key from the shared round-robin iterator.

    The iterator is advanced while ``api_lock`` is held. Only a short suffix
    of the key is logged so the full secret does not end up in saved notebook
    output.

    :return: The next API key string.
    """
    with api_lock:
        api_key = next(api_key_iterator)
    # Logging happens after the lock is released so other threads are not
    # blocked on console output.
    masked_key = f"...{api_key[-4:]}" if len(api_key) > 4 else "****"
    print(f"************************* \nAPI Key used: {masked_key} \n*************************\n")  # Separator with (masked) API key used
    return api_key

In [None]:
output_dir = r"C:\Users\Hewan Shrestha\Desktop\detect-car-in-LR-satellite-images\Google_Places\new_data_collection_google_poi_to_outscraper\working_folder\csvs\saarbrucken_new"
# Make sure the output folder exists; otherwise creating the CSV below fails
os.makedirs(output_dir, exist_ok=True)
# File path to save POIs
output_file = os.path.join(output_dir, "multithreading_saarbrucken_results.csv")

# Create the CSV file and write the header (an existing file is left untouched
# so results from earlier runs are kept)
if not os.path.exists(output_file):
    with open(output_file, "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["place_id", "Type"])

In [None]:
# Function to save POIs to a CSV file
def save_pois_to_csv(pois, poi_type):
    """
    Append POI rows to the results CSV at ``output_file``.

    :param pois: Sized collection of ``(place_id, type)`` pairs; each pair is
        written as one CSV row.
    :param poi_type: POI type label, used only in the summary message.
    """
    with api_lock:  # Ensure thread-safe writes
        with open(output_file, "a", newline='', encoding="utf-8") as f:
            writer = csv.writer(f)
            # The loop variable is deliberately not called ``poi_type``: reusing
            # that name would overwrite the parameter reported below.
            for poi_id, row_type in pois:
                writer.writerow([poi_id, row_type])
        print(f"Saved {len(pois)} POIs for type '{poi_type}'.")



In [None]:
output_json_file = os.path.join(output_dir, "api_usage.json")
# Persist the per-key request counters as JSON
def save_api_usage_to_file(output_json_file):
    """
    Write ``api_request_counts`` to a JSON file, indented by four spaces.

    The counters are serialized and written while ``api_lock`` is held; an
    existing file at the target path is overwritten.

    :param output_json_file: Destination path of the JSON file.
    """
    with api_lock:
        with open(output_json_file, "w") as usage_file:
            usage_file.write(json.dumps(api_request_counts, indent=4))
    print(f"API usage data saved to {output_json_file}.")

In [None]:
def load_proxies(file_path, username, password):
    """
    Read proxy endpoints from a text file and attach credentials to each.

    Every non-blank line is stripped and turned into
    ``http://<username>:<password>@<line>``. Problems while opening or reading
    the file are reported with ``print`` instead of being raised.

    :param file_path: Path to the file listing one proxy endpoint per line.
    :param username: User name inserted into every proxy URL.
    :param password: Password inserted into every proxy URL.
    :return: A list of proxy URL strings (empty if the file could not be opened).
    """
    authenticated = []
    try:
        with open(file_path, "r") as handle:
            for raw_line in handle:
                endpoint = raw_line.strip()
                if not endpoint:
                    continue
                # Prefix the endpoint with the scheme and the credentials
                authenticated.append(f"http://{username}:{password}@{endpoint}")
    except FileNotFoundError:
        print(f"Error: Proxy file '{file_path}' not found.")
    except Exception as e:
        print(f"Error loading proxies: {e}")
    return authenticated

In [None]:
# Path to your proxies file
proxy_file = r"C:\Users\Hewan Shrestha\Desktop\google_poi_search\data\proxies.txt"

# Proxy credentials are read from the environment so that no secret is stored
# in the notebook. Set PROXY_USERNAME and PROXY_PASSWORD before running this
# cell. NOTE(review): credentials that were previously committed in this cell
# should be rotated.
username = os.environ.get("PROXY_USERNAME")
password = os.environ.get("PROXY_PASSWORD")
if not username or not password:
    raise RuntimeError(
        "Proxy credentials are missing: set the PROXY_USERNAME and "
        "PROXY_PASSWORD environment variables before running this cell."
    )

# Load proxies with authentication
proxy_list = load_proxies(proxy_file, username, password)


In [None]:

def search_pois(rectangle, api_key, poi_type, data, retries=5, delay=1, timeout=30):
    """
    Run a Places text search restricted to ``rectangle`` and return the ids found.

    The request is retried up to ``retries`` times with exponential backoff.
    On HTTP 429 the next API key is fetched via ``get_next_api_key`` and used
    for the following attempt; any other non-200 status stops the retries.
    Exceptions raised while sending or parsing are printed and retried.

    :param rectangle: Geometry whose ``bounds`` are ``(min_lng, min_lat, max_lng, max_lat)``.
    :param api_key: API key used for the first attempt.
    :param poi_type: POI type label paired with every returned place id.
    :param data: Base request payload; it is copied, not modified.
    :param retries: Maximum number of attempts.
    :param delay: Seconds to wait after the first failed attempt; doubled after each further failure.
    :param timeout: Seconds passed to ``requests.post`` as the request timeout.
    :return: List of ``(place_id, poi_type)`` tuples; empty when no attempt succeeded.
    """
    def _tail(secret):
        # Log only a short suffix of a secret so it does not leak into notebook output.
        return f"...{secret[-4:]}"

    print(f"\nSearching POIs for type '{poi_type}' using API key '{_tail(api_key)}'.\nBounding box: {rectangle.bounds}")
    min_lng, min_lat, max_lng, max_lat = rectangle.bounds

    # Build the payload on a copy so the caller's dict is left untouched
    payload = dict(data)
    payload["locationRestriction"] = {
        "rectangle": {
            "low": {"latitude": min_lat, "longitude": min_lng},
            "high": {"latitude": max_lat, "longitude": max_lng}
        }
    }
    url = "https://places.googleapis.com/v1/places:searchText"
    headers = {
        "Content-Type": "application/json",
        "X-Goog-Api-Key": api_key,
        "X-Goog-FieldMask": "places.id"
    }

    attempts_made = 0
    for attempt in range(retries):
        attempts_made = attempt + 1
        try:
            if proxy_list:
                # Rotate through the proxies so a retry does not reuse the proxy that just failed
                proxy = proxy_list[attempt % len(proxy_list)]
                proxies = {'https': proxy}
                # Strip the "user:password@" part before logging
                print(f"Attempt {attempt+1}, using proxy: {proxy.rsplit('@', 1)[-1]}")
            else:
                proxies = None
                print(f"Attempt {attempt+1}, no proxy configured")

            # Make the API request; the timeout keeps a stalled connection from blocking the worker forever
            response = requests.post(url, headers=headers, json=payload, proxies=proxies, timeout=timeout)

            # Increment API request count for the key that was actually sent
            increment_api_request_count(api_key)

            if response.status_code == 200:
                # Build the list in one go so a parsing error cannot leave
                # partial rows behind that a retry would then duplicate
                return [(place['id'], poi_type) for place in response.json().get('places', [])]

            if response.status_code == 429:  # Rate limit error
                print(f"Rate limit hit for API key: {_tail(api_key)}. Switching API key...")
                api_key = get_next_api_key()
                # The header must be refreshed, otherwise the old key keeps being sent
                headers["X-Goog-Api-Key"] = api_key
            else:
                print(f"Error occurred: {response.content}")
                break  # Break if the error is not recoverable

        except Exception as e:
            print(f"Error during request: {e}")

        # Apply exponential backoff, but do not wait after the final attempt
        if attempt < retries - 1:
            time.sleep(delay)
            delay *= 2  # Double the delay for the next attempt

    print(f"Failed to retrieve POIs for type '{poi_type}' after {attempts_made} attempts.")
    return []


In [None]:
def divide_box(boundary_box, depth, quadrant_number, use_overlap, overlap=0.0006):
    """
    Split a bounding box into four quadrants around its midpoint.

    :param boundary_box: Geometry whose ``bounds`` are ``(min_lng, min_lat, max_lng, max_lat)``.
    :param depth: Not used by the current implementation; kept for callers
        (it previously fed only a disabled debug export).
    :param quadrant_number: Not used by the current implementation; kept for callers.
    :param use_overlap: When truthy, each quadrant's inner edges extend past
        the midpoint lines by ``overlap``; otherwise the quadrants only touch.
    :param overlap: Extension of the inner edges, in the same units as the box coordinates.
    :return: List of four boxes ordered bottom-left, bottom-right, top-left, top-right.
    """
    west, south, east, north = boundary_box.bounds
    center_x = (west + east) / 2
    center_y = (south + north) / 2

    # A zero pad yields quadrants that meet exactly on the midpoint lines
    pad = overlap if use_overlap else 0

    return [
        box(west, south, center_x + pad, center_y + pad),          # Bottom-left
        box(center_x - pad, south, east, center_y + pad),          # Bottom-right
        box(west, center_y - pad, center_x + pad, north),          # Top-left
        box(center_x - pad, center_y - pad, east, north),          # Top-right
    ]

In [None]:
# Longest side of a bounding box, measured with geodesic distance
def get_longest_side(bounding_box):
    """
    Return the longer of a bounding box's bottom and left edges in whole meters.

    Both edges are measured with ``geodesic`` starting from the bottom-left
    corner, and each length is truncated to an integer before comparison.

    :param bounding_box: Geometry whose ``bounds`` are ``(min_lng, min_lat, max_lng, max_lat)``.
    :return: Length of the longest measured edge as an ``int`` (meters).
    """
    west, south, east, north = bounding_box.bounds

    # Corner points are passed to geodesic as (lat, lng) pairs
    origin = (south, west)

    edge_lengths = [
        int(geodesic(origin, (south, east)).meters),  # bottom edge (width)
        int(geodesic(origin, (north, west)).meters),  # left edge (height)
    ]
    longest = max(edge_lengths)

    # Debug output of the measured length
    print(f"Longest side length (in meters): {longest:.2f}")

    return int(longest)

In [None]:


# Function to recursively search for POIs using a quadtree approach
def quadtree_search(bounding_box, api_key, poi_type, data, threshold, quadrant_number=1, searched_areas=None, overlap=0.0007, depth=0, min_side_m=50, overlap_min_side_m=200):
    """
    Search a bounding box for POIs and recurse into its quadrants when the
    search returns more than ``threshold`` places.

    Every box that is searched has its results appended to the CSV via
    ``save_pois_to_csv``. A box is not divided further when its longest side
    is shorter than ``min_side_m`` or when the number of places found does not
    exceed ``threshold``.

    :param bounding_box: Geometry whose ``bounds`` are ``(min_lng, min_lat, max_lng, max_lat)``.
    :param api_key: API key handed to ``search_pois`` for every box of this search.
    :param poi_type: POI type label being searched.
    :param data: Base request payload handed to ``search_pois``.
    :param threshold: A box is divided only when more places than this are found.
    :param quadrant_number: Index of this box among its siblings (used for logging).
    :param searched_areas: Set of rounded bounds already searched; shared across the recursion.
    :param overlap: Overlap forwarded to ``divide_box`` when overlap is used.
    :param depth: Current recursion depth (used for logging).
    :param min_side_m: Boxes whose longest side is below this many meters are never divided.
    :param overlap_min_side_m: Boxes whose longest side is at least this many meters
        are divided with overlap; smaller ones without.
    :return: Set of ``(place_id, poi_type)`` tuples found in this box and its sub-boxes.
    """
    if searched_areas is None:
        searched_areas = set()

    results = set()

    # Normalize bounding box coordinates to prevent precision errors
    def normalize_coords(coords, precision=8):
        return tuple(round(c, precision) for c in coords)

    bounding_box_coords = normalize_coords(bounding_box.bounds)
    if bounding_box_coords in searched_areas:
        print(f"Bounding box {bounding_box_coords} has already been searched. Skipping...")
        return results  # Skip this bounding box but continue searching others

    # Get the longest side using geodesic library
    longest_side = get_longest_side(bounding_box)

    # Check for invalid bounding box
    if longest_side == 0:
        print(f"Bounding box {bounding_box_coords} is invalid (longest side = 0.0). Skipping...")
        return results

    # Add bounding box to searched areas
    searched_areas.add(bounding_box_coords)
    print(f"\nSearching depth {depth}, quadrant {quadrant_number}, bounding box {bounding_box_coords}")

    pois = search_pois(bounding_box, api_key, poi_type, data)
    print(f"Number of places found: {len(pois)}")

    # Save POIs to the CSV file
    save_pois_to_csv(pois, poi_type)

    # Update the results set with found POIs
    results.update(pois)

    # Boxes below the minimum side length are never divided, whatever the POI count
    if longest_side < min_side_m:
        print(f"Longest side {longest_side} meters is less than {min_side_m}. Adding results and not dividing further.")
        return results

    # Only a box returning more places than the threshold is worth dividing
    if len(pois) <= threshold:
        print(f"Number of POIs {len(pois)} does not exceed threshold {threshold}. Not dividing further.")
        return results

    # Small boxes are divided without overlap, larger ones with overlap
    use_overlap = longest_side >= overlap_min_side_m
    if use_overlap:
        print(f"Longest side {longest_side} meters is at least {overlap_min_side_m}, dividing with overlap.")
    else:
        print(f"Longest side {longest_side} meters is less than {overlap_min_side_m}, dividing without overlap.")

    quadrants = divide_box(bounding_box, depth=depth, quadrant_number=quadrant_number, use_overlap=use_overlap, overlap=overlap)
    print("*" * 50)
    print("\n" * 2)

    # Process quadrants sequentially at the current depth
    for i, quadrant in enumerate(quadrants):
        print(f"Searching depth {depth+1} at quadrant {i+1}")
        results.update(quadtree_search(quadrant, api_key, poi_type, data, threshold, i+1, searched_areas, overlap, depth+1, min_side_m, overlap_min_side_m))

    # Once all quadrants at the current depth are processed, return the results for this level.
    return results

In [None]:
def process_poi_type(poi_type, rectangle, threshold):
    """
    Run the full quadtree search for a single POI type.

    One API key is drawn with ``get_next_api_key`` and the text query payload
    is built from ``poi_type`` before ``quadtree_search`` is started on
    ``rectangle``.

    :param poi_type: POI type used both as the text query and as the result label.
    :param rectangle: Bounding box in which to search.
    :param threshold: Division threshold forwarded to ``quadtree_search``.
    :return: Whatever ``quadtree_search`` returns for this POI type.
    """
    print(f"\n************************* \nStarting processing for POI type: {poi_type} \n*************************\n")  # Banner: start of this POI type

    # The key is fetched first, then the search runs with a fresh query payload
    found = quadtree_search(
        rectangle,
        get_next_api_key(),
        poi_type,
        {"textQuery": poi_type},
        threshold,
    )

    print(f"\n************************* \nFinished processing for POI type: {poi_type}, found {len(found)} POIs \n*************************\n")  # Banner: end of this POI type
    return found

In [None]:
# Read the GeoJSON file
city_gdf = gpd.read_file(r"C:\Users\Hewan Shrestha\Desktop\google_poi_search\shapefiles\saarbrucken.geojson")
city_polygon = unary_union(city_gdf['geometry'])

# Create a bounding box for the city polygon
minx, miny, maxx, maxy = city_polygon.bounds
rectangle = box(minx, miny, maxx, maxy)

# Read the POI types, one per line. Blank lines are skipped so that no search
# is started with an empty text query.
with open(r'C:\Users\Hewan Shrestha\Desktop\google_poi_search\data\outscraper_filtered_subtypes.txt', 'r') as file:
    places_of_interest = [line.strip() for line in file if line.strip()]

# Define the threshold: a box is subdivided when a search returns more places than this
threshold = 5

In [None]:
# Start the profiler
# NOTE(review): the profiler is enabled in the notebook's main thread; work done
# inside the pool's worker threads probably does not show up in the report — confirm.
pr = cProfile.Profile()
pr.enable()

# Code block for multithreading and processing POIs
start_time_main = time.time()
all_results = set()

try:
    # Run the search for each POI type in parallel
    with ThreadPoolExecutor(max_workers=len(api_keys)) as executor:
        future_to_poi = {executor.submit(process_poi_type, poi_type, rectangle, threshold): poi_type for poi_type in places_of_interest}

        # Results are collected in submission order; each result() call blocks until that task is done
        for future in future_to_poi:
            try:
                results = future.result()
                all_results.update(results)
                print(f"\n************************* \nNumber of POIs found for {future_to_poi[future]}: {len(results)} \n*************************\n")  # Separator for individual POI results
                print("*"*50)
            except Exception as e:
                print(f"\nError processing POI type: {future_to_poi[future]} - {e}\n")

    # Final results count
    print(f"\n************************* \nTotal unique POIs after all searches: {len(all_results)} \n*************************\n")  # Separator for final count
finally:
    # Always report the elapsed time and stop profiling, even if the run is
    # interrupted, so the profiler is not left enabled in the kernel
    end_time_main = time.time()
    print(f"Total execution time for multithreading: {end_time_main - start_time_main:.2f} seconds")

    # End profiling
    pr.disable()

In [None]:
# Build the profiling report, sorted by cumulative time
s = StringIO()
ps = pstats.Stats(pr, stream=s).sort_stats('cumulative')
ps.print_stats()
profiling_report = s.getvalue()

# Save the profiling report next to the other outputs (optional); the path is
# derived from output_dir like the CSV and the API usage file
with open(os.path.join(output_dir, "profiling_report.txt"), 'w') as f:
    f.write(profiling_report)

print(profiling_report)


# Persist the per-key request counters collected during the run
save_api_usage_to_file(output_json_file)