# In [2]:
import pandas as pd
import requests
from difflib import SequenceMatcher

def fuzzy_match(address1, address2):
    """Return the case-insensitive similarity ratio of two address strings.

    Both inputs are lower-cased and compared with ``difflib.SequenceMatcher``;
    the result is its ``ratio()``, a float between 0.0 and 1.0.
    """
    first = address1.lower()
    second = address2.lower()
    matcher = SequenceMatcher(a=first, b=second)
    return matcher.ratio()

def _geocode_address(address, api_key, timeout):
    """Return the first Geocoding API result for *address*, or None if there is none."""
    response = requests.get(
        "https://maps.googleapis.com/maps/api/geocode/json",
        params={"address": address, "key": api_key},
        timeout=timeout,
    )
    if response.status_code != 200:
        return None
    results = response.json().get('results')
    return results[0] if results else None


def _search_nearby(location, api_key, initial_radius, max_radius, min_results, timeout):
    """Return Nearby Search results, widening the radius until enough are found.

    The loop always terminates: it stops on a non-200 response, once at least
    ``min_results`` places are returned, or once the radius has reached
    ``max_radius`` (or cannot grow because it is not positive).
    """
    found = []
    search_radius = initial_radius
    while True:
        response = requests.get(
            "https://maps.googleapis.com/maps/api/place/nearbysearch/json",
            params={
                "location": f"{location['lat']},{location['lng']}",
                "radius": search_radius,
                "key": api_key,
            },
            timeout=timeout,
        )
        if response.status_code != 200:
            # Keep whatever the previous (narrower) search returned instead of
            # re-issuing the same failing request forever.
            return found
        found = response.json().get('results') or []

        enough = len(found) >= min_results
        cannot_widen = search_radius >= max_radius or search_radius <= 0
        if enough or cannot_widen:
            return found
        # Double the radius, but never search beyond the configured maximum.
        search_radius = min(search_radius * 2, max_radius)


def get_most_relevant_business(address, api_key, initial_radius=20, max_radius=5000, min_results=5,
                               timeout=10, min_score=0.6):
    """Find the nearby place whose vicinity best matches a geocoded address.

    The address is geocoded, then the Nearby Search endpoint is queried around
    the resulting coordinates, doubling the radius (capped at ``max_radius``)
    until at least ``min_results`` places come back. Each place's lower-cased
    ``vicinity`` is compared to the geocoder's ``formatted_address`` with
    ``fuzzy_match``; the place with the highest score strictly above
    ``min_score`` wins. Places whose vicinity is exactly "chicago" are ignored.

    Args:
        address: Free-form address string to look up.
        api_key: Google Maps Platform API key sent with both requests.
        initial_radius: Radius passed to the first Nearby Search request.
        max_radius: Upper bound for the search radius.
        min_results: Number of results considered sufficient to stop widening.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
        min_score: Match score a candidate must exceed to be returned.

    Returns:
        A dict with keys ``name``, ``formatted_address``, ``types``,
        ``place_id`` and ``match_score``, or ``None`` when geocoding yields no
        result or no candidate scores above ``min_score``.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    geocoded = _geocode_address(address, api_key, timeout)
    if geocoded is None:
        return None

    input_address = geocoded['formatted_address']
    location = geocoded['geometry']['location']
    candidates = _search_nearby(
        location, api_key, initial_radius, max_radius, min_results, timeout
    )

    best_score = min_score
    most_relevant_business = None
    for result in candidates:
        vicinity = (result.get('vicinity') or '').lower()
        # A vicinity of just the city name carries no street information.
        if vicinity == "chicago":
            continue
        match_score = fuzzy_match(input_address, vicinity)
        if match_score > best_score:
            best_score = match_score
            most_relevant_business = {
                "name": result.get('name'),
                "formatted_address": vicinity,
                "types": result.get('types'),
                "place_id": result.get('place_id'),
                "match_score": match_score,
            }
    return most_relevant_business

def process_addresses(input_csv, output_csv, api_key, chunksize=100):
    """Annotate every address in a CSV with its best-matching nearby business.

    The input file is read in chunks of ``chunksize`` rows. Four columns are
    added to each chunk ('Name', 'Formatted Address', 'Types', 'Match Score'),
    initialised to empty strings and filled in for rows where
    ``get_most_relevant_business`` returns a match. All chunks are then
    concatenated and written to ``output_csv`` without the index.

    Args:
        input_csv: Path of the CSV to read; it must contain an 'Address' column.
        output_csv: Path of the CSV to write.
        api_key: API key forwarded to ``get_most_relevant_business``.
        chunksize: Number of rows per chunk passed to ``pandas.read_csv``.
    """
    processed_chunks = []

    for chunk in pd.read_csv(input_csv, chunksize=chunksize):
        # Add new columns for business information
        chunk['Name'] = ""
        chunk['Formatted Address'] = ""
        chunk['Types'] = ""
        chunk['Match Score'] = ""

        for index, address in chunk['Address'].items():
            # Missing or blank cells cannot be geocoded; leave the row empty
            # rather than sending a meaningless query.
            if pd.isna(address) or not str(address).strip():
                continue

            business = get_most_relevant_business(address, api_key)
            if not business:
                continue

            chunk.at[index, 'Name'] = business.get('name', '')
            chunk.at[index, 'Formatted Address'] = business.get('formatted_address', '')
            # 'types' may be present but None, so a .get() default is not enough.
            chunk.at[index, 'Types'] = ', '.join(business.get('types') or [])
            chunk.at[index, 'Match Score'] = round(business.get('match_score', 0), 2)

        processed_chunks.append(chunk)

    # Concatenate all processed chunks into a single DataFrame
    df = pd.concat(processed_chunks, ignore_index=True)

    # Save DataFrame with updated information to output CSV file
    df.to_csv(output_csv, index=False)

# Example usage
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so the
# credential is not stored in the source file.
import os

input_csv = "mcmftestv2.csv"
output_csv = "mcmfoutputv2.csv"
api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable before running.")
process_addresses(input_csv, output_csv, api_key, chunksize=100)


# Cell output: KeyboardInterrupt (execution of the cell was interrupted)