### Geocoding Company Addresses

In [1]:
# Import required libraries
import pandas as pd

## Read the company records from disk
companies_clean = pd.read_csv('deduplicated_companies.csv')

# Columns to remove; errors='ignore' skips any that are not present in the file
columns_to_drop = ['name_clean', 'address_clean', 'name_blocks']
companies_clean = companies_clean.drop(columns=columns_to_drop, errors='ignore')

# Preview the first rows as a sanity check
companies_clean.head()

Unnamed: 0,name,address,city,phone,source_url
0,Caroline Garments P/L t/a Caroline Safety,"26 Bon Accord Rd, Westondale, Bulawayo, Zimbabwe",Bulawayo,+263 29 22776452003,https://www.zimbabweyp.com/company/88429/Carol...
1,Pest Portal Zimbabwe (Pvt) Ltd,"Office 11, 3D Building Strathaven Building, Av...",Harare,07725933442007,https://www.zimbabweyp.com/company/104158/Pest...
2,Busy Bee Transcribing,"13 Ascot Road, Avondale West, Harare , Zimbabwe",Harare,0242-3089692012,https://www.zimbabweyp.com/company/106087/Busy...
3,Athol Evans Hospita Home,"P O Box Cr 70 Cranb Cranborne, Harare, Zimbabwe",Harare,570 8875,https://www.zimbabweyp.com/company/14492/Athol...
4,MEGA MARKET,"Target Area, Mutare, Zimbabwe",Mutare,263-20-61517,https://www.zimbabweyp.com/company/4595/MEGA_M...


### Setting up the geocoding pipeline

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from arcgis.gis import GIS
from arcgis.geocoding import geocode, reverse_geocode
from arcgis.geometry import Geometry
import folium
import re
import os

# Set up visualizations
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# Initialize ArcGIS
# The API key is a credential and must not be stored in the notebook: it is read
# from the ARCGIS_API_KEY environment variable, falling back to an interactive
# prompt so the value never ends up in the saved file.
# NOTE(review): a key that was previously committed here should be treated as
# compromised and rotated.
ARCGIS_API_KEY = os.environ.get('ARCGIS_API_KEY')
if not ARCGIS_API_KEY:
    import getpass
    ARCGIS_API_KEY = getpass.getpass('ArcGIS API key: ')
gis = GIS(api_key=ARCGIS_API_KEY)

def clean_address(address):
    """Clean and standardize an address string before geocoding.

    Args:
        address: Raw address value; may be a string, NaN or None.

    Returns:
        The cleaned address with ", Zimbabwe" appended when the country is not
        already mentioned, or an empty string when there is nothing left to
        geocode (missing value, blank string, or only stripped characters).
    """
    if pd.isna(address):
        return ""
    # Remove everything except word characters, whitespace, commas, periods and hyphens
    address = re.sub(r'[^\w\s,.-]', '', str(address))
    # Collapse whitespace afterwards so removed characters do not leave double spaces
    address = ' '.join(address.split())
    # Nothing usable left: do not turn it into the bogus query ", Zimbabwe"
    if not address:
        return ""
    # Add Zimbabwe if not present
    if 'zimbabwe' not in address.lower():
        address = f"{address}, Zimbabwe"
    return address

def geocode_address(address, max_retries=5):
    """Geocode a single address with ArcGIS, retrying on errors.

    Args:
        address: Address to look up. A missing or blank address is not sent to
            the service.
        max_retries: Maximum number of attempts when the lookup raises.

    Returns:
        A dict with 'latitude', 'longitude' and 'formatted_address' taken from
        the first candidate, or the same keys set to None when the address is
        blank, has no match, or every attempt failed.
    """
    not_found = {'latitude': None, 'longitude': None, 'formatted_address': None}

    # Nothing to look up: skip the service call (and its retries) entirely
    if address is None or (isinstance(address, str) and not address.strip()):
        return not_found

    for attempt in range(max_retries):
        try:
            candidates = geocode(address)
            if not candidates:
                # An empty candidate list is a definitive "no match"; retrying
                # would only repeat the same request and sleep for nothing.
                print(f"Failed to geocode {address}: no match found")
                break
            best_match = candidates[0]
            return {
                'latitude': best_match['location']['y'],
                'longitude': best_match['location']['x'],
                'formatted_address': best_match['address']
            }
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Failed to geocode {address}: {str(e)}")
            else:
                time.sleep(1)  # Wait before retry (no point sleeping after the last attempt)
    return not_found

def _reverse_geocode_point(latitude, longitude, max_retries=3):
    """Reverse geocode one coordinate pair; return its address label or None on failure."""
    for attempt in range(max_retries):
        try:
            # The point is passed as [x, y], i.e. [longitude, latitude]
            result = reverse_geocode([float(longitude), float(latitude)])
            # NOTE(review): assumes the response carries an 'address' dict with
            # 'LongLabel' / 'Match_addr' entries - confirm against the ArcGIS docs.
            address = (result or {}).get('address') or {}
            return address.get('LongLabel') or address.get('Match_addr')
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Failed to reverse geocode ({latitude}, {longitude}): {str(e)}")
            else:
                time.sleep(1)  # Wait before retry
    return None


def _save_checkpoint(checkpoint_file, companies_geocoded, processed_count):
    """Write the working frame and a progress counter to the checkpoint file."""
    import pickle

    checkpoint = {
        'last_processed': processed_count - 1,
        'data': companies_geocoded
    }
    with open(checkpoint_file, 'wb') as f:
        pickle.dump(checkpoint, f)


def reverse_geocode_with_checkpoint(companies_geocoded, batch_size=100, checkpoint_file='geocoding_results/checkpoint.pkl'):
    """Reverse geocode every geocoded row, with checkpointing and resume.

    Args:
        companies_geocoded: DataFrame with a boolean 'geocoded' column and
            'latitude' / 'longitude' columns. A 'reversed_address' column is
            added (or reset) on this frame when no checkpoint exists.
        batch_size: Number of processed rows between checkpoint writes.
        checkpoint_file: Path of the pickle file used to store progress.

    Returns:
        The DataFrame with 'reversed_address' filled in. When a checkpoint is
        found, the frame stored in the checkpoint is used and returned instead
        of the one passed in.

    Raises:
        Exception: Any error raised while processing is re-raised after the
            current progress has been written to the checkpoint file.
    """
    import os
    import pickle

    # Create checkpoint directory if needed; a bare file name has no directory part
    checkpoint_dir = os.path.dirname(checkpoint_file)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # A non-positive batch size would make the modulo below fail
    batch_size = max(1, int(batch_size))

    if os.path.exists(checkpoint_file):
        print("Found checkpoint. Resuming from last saved point...")
        # SECURITY: unpickling runs arbitrary code; only resume from a
        # checkpoint file that this notebook wrote itself.
        with open(checkpoint_file, 'rb') as f:
            checkpoint = pickle.load(f)
        companies_geocoded = checkpoint['data']
    else:
        companies_geocoded['reversed_address'] = None

    # Rows still to do are exactly the geocoded rows without a reversed address.
    # The stored 'last_processed' counter is not used for skipping: combining it
    # with this mask would skip rows that were never processed.
    mask = companies_geocoded['geocoded'] & companies_geocoded['reversed_address'].isna()
    to_process = companies_geocoded.index[mask]

    if len(to_process) == 0:
        print("No addresses left to process.")
    else:
        print(f"Starting reverse geocoding for {len(to_process)} addresses...")

        processed = 0
        try:
            for idx in tqdm(to_process, desc="Reverse Geocoding"):
                companies_geocoded.at[idx, 'reversed_address'] = _reverse_geocode_point(
                    companies_geocoded.at[idx, 'latitude'],
                    companies_geocoded.at[idx, 'longitude']
                )
                processed += 1

                # Save checkpoint every batch_size records
                if processed % batch_size == 0:
                    _save_checkpoint(checkpoint_file, companies_geocoded, processed)
        except (Exception, KeyboardInterrupt) as e:
            # Persist what has been done so far so the message below is true
            _save_checkpoint(checkpoint_file, companies_geocoded, processed)
            print(f"Error during reverse geocoding: {str(e)}")
            print("Progress has been saved. You can resume the process later.")
            raise

    # Everything is processed: a leftover checkpoint would only cause a stale resume
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)

    return companies_geocoded

def validate_company_name(original_name, reversed_address, threshold=60):
    """Check whether a company name appears in a reverse-geocoded address.

    Args:
        original_name: Company name from the source data.
        reversed_address: Address text returned by reverse geocoding.
        threshold: Fuzzy similarity score that must be exceeded for a
            match when the name is not literally contained in the address.

    Returns:
        True when the lower-cased name is a substring of the lower-cased
        address or the rapidfuzz token_set_ratio exceeds ``threshold``;
        False otherwise, including when either value is missing or blank.
    """
    if pd.isna(original_name) or not reversed_address or pd.isna(reversed_address):
        return False

    name = str(original_name).strip().lower()
    address = str(reversed_address).strip().lower()

    # A blank name is a substring of every address and must not count as a match
    if not name or not address:
        return False

    # Check for direct inclusion
    if name in address:
        return True

    # Imported lazily: only the fuzzy fallback needs rapidfuzz
    from rapidfuzz import fuzz

    similarity = fuzz.token_set_ratio(name, address)
    return similarity > threshold

def plot_geocoding_results(df):
    """Create visualizations of geocoding results.

    Writes two files into the 'geocoding_results' directory: a bar chart of
    the geocoding success rate and an HTML map with one marker per geocoded
    row.

    Args:
        df: DataFrame with a boolean 'geocoded' column plus 'latitude',
            'longitude', 'name' and 'address' columns.
    """
    from html import escape

    # Create output directory if it doesn't exist
    os.makedirs('geocoding_results', exist_ok=True)

    # Plot geocoding success rate; an empty frame would otherwise give NaN
    success_rate = df['geocoded'].mean() * 100 if len(df) else 0.0
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=['Success', 'Failed'], y=[success_rate, 100 - success_rate], ax=ax)
    ax.set_title(f'Geocoding Success Rate: {success_rate:.2f}%')
    ax.set_ylabel('Percentage')
    fig.tight_layout()
    fig.savefig('geocoding_results/geocoding_success_rate.png')
    plt.close(fig)

    # Plot points on a map
    m = folium.Map(location=[-19.0154, 29.1549], zoom_start=6)

    # Add points with popups
    for _, row in df[df['geocoded']].iterrows():
        # Name and address come from the input data and are placed into HTML,
        # so escape them to keep markup characters from breaking the popup.
        popup_html = f"<b>{escape(str(row['name']))}</b><br>{escape(str(row['address']))}"
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=3,
            color='blue',
            fill=True,
            fill_color='blue',
            popup=folium.Popup(popup_html, max_width=300)
        ).add_to(m)

    # Save the map
    m.save('geocoding_results/geocoded_companies_map.html')

def main(input_csv='cleaned_companies.csv'):
    """Run the full pipeline: load, clean, geocode, reverse geocode, validate, plot.

    Args:
        input_csv: Path of the CSV to geocode; it must provide 'name' and
            'address' columns. Defaults to the file name this function has
            always read.

    Returns:
        The DataFrame with geocoding, reverse geocoding and name-validation
        columns added, or None when the input file could not be loaded.
    """
    print("Starting geocoding process...")

    # Load the data
    print("Loading data...")
    try:
        companies_clean = pd.read_csv(input_csv)
        print(f"Loaded {len(companies_clean)} records")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return

    # Clean addresses
    print("Cleaning addresses...")
    companies_clean['address_cleaned'] = companies_clean['address'].apply(clean_address)

    # Geocode addresses with progress bar
    print("\nGeocoding addresses...")
    empty_result = {'latitude': None, 'longitude': None, 'formatted_address': None}
    tqdm.pandas(desc="Progress")
    # clean_address returns "" for missing addresses, so a notna() check alone
    # never skips them; blank strings are skipped explicitly instead of being
    # sent to the geocoder.
    geocoded = companies_clean['address_cleaned'].progress_apply(
        lambda x: geocode_address(x) if pd.notna(x) and str(x).strip() else dict(empty_result)
    )

    # Combine results; building the frame on the source index keeps rows
    # aligned, and naming the columns keeps them present for an empty input
    geocode_columns = pd.DataFrame(
        geocoded.tolist(),
        index=companies_clean.index,
        columns=list(empty_result)
    )
    companies_geocoded = pd.concat([companies_clean, geocode_columns], axis=1)

    # Add geocoding status
    companies_geocoded['geocoded'] = companies_geocoded['latitude'].notna()

    # Save intermediate results
    os.makedirs('geocoding_results', exist_ok=True)
    companies_geocoded.to_csv('geocoding_results/geocoded_companies.csv', index=False)
    print(f"\nGeocoding complete. Success rate: {companies_geocoded['geocoded'].mean()*100:.2f}%")

    # Reverse geocoding for validation. reverse_geocode_with_checkpoint takes
    # the whole frame (not one latitude/longitude pair) and returns it with
    # 'reversed_address' filled in.
    print("\nStarting reverse geocoding for validation...")
    companies_geocoded = reverse_geocode_with_checkpoint(companies_geocoded)

    # Validate company names
    print("\nValidating company names...")
    companies_geocoded['name_validation'] = companies_geocoded.apply(
        lambda x: validate_company_name(x['name'], x['reversed_address']) if pd.notna(x['reversed_address']) else False,
        axis=1
    )

    # Calculate accuracy metrics (share of all rows whose name was validated)
    validation_accuracy = companies_geocoded['name_validation'].mean() * 100

    # Save final results
    companies_geocoded.to_csv('geocoding_results/validated_geocoded_companies.csv', index=False)

    # Generate visualizations
    print("\nGenerating visualizations...")
    plot_geocoding_results(companies_geocoded)

    # Print summary statistics
    print("\n=== Geocoding Summary ===")
    print(f"Total companies: {len(companies_geocoded)}")
    print(f"Successfully geocoded: {companies_geocoded['geocoded'].sum()}")
    print(f"Geocoding success rate: {companies_geocoded['geocoded'].mean()*100:.2f}%")
    print(f"Validation success rate: {validation_accuracy:.2f}%")

    # Show sample of results
    print("\nSample of geocoded companies:")
    print(companies_geocoded[['name', 'address', 'latitude', 'longitude', 'name_validation']].head())

    return companies_geocoded

if __name__ == "__main__":
    # Install required packages if not already installed. Each package is
    # checked on its own so a missing rapidfuzz is caught even when arcgis is
    # present.
    # NOTE(review): the arcgis imports at the top of this cell run before this
    # guard, so a missing arcgis fails there first - consider moving the
    # install step to its own earlier cell.
    import importlib
    import importlib.util
    import subprocess
    import sys

    missing_packages = [
        package for package in ("arcgis", "rapidfuzz")
        if importlib.util.find_spec(package) is None
    ]
    if missing_packages:
        print("Installing required packages...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", *missing_packages])
        # Let the import system see the freshly installed packages
        importlib.invalidate_caches()

    # Run the main function
    print("Starting geocoding process with ArcGIS...")
    geocoded_companies = main()
    # main() returns None when the input could not be loaded
    if geocoded_companies is None:
        print("\nProcess did not complete. See the messages above.")
    else:
        print("\nProcess completed! Check the 'geocoding_results' folder for outputs.")

Starting geocoding process with ArcGIS...
Starting geocoding process...
Loading data...
Loaded 31374 records
Cleaning addresses...

Geocoding addresses...


Progress:   0%|          | 19/31374 [02:24<10:02:46,  1.15s/it]