# 1. Import libraries & define global variables

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text

# Path of the CSV file that is loaded with pd.read_csv in the import step.
CSV_FILE = "addresses.csv"
# Name of the SQL table the CSV rows are written to and queried from.
TABLE_NAME = "addresses"
# SQLAlchemy engine for the SQLite database file "addresses.db"
# (relative path); every cell below opens its connections from it.
ENGINE = create_engine('sqlite:///addresses.db')

# 2. Create helper classes & functions

In [2]:
def haversine_km(lat1, lon1, lat2, lon2, R=6371.0):
    """
    Calculate the great-circle distance between points on Earth (Haversine formula).

    The Haversine formula gives the shortest distance over the surface of a
    sphere, i.e. an 'as-the-crow-flies' distance that ignores terrain.

    All four coordinate arguments are handled the same way: each may be a
    plain Python number, a sequence, a NumPy array, or a pandas Series, and
    they are combined with normal NumPy broadcasting. Inputs that are pandas
    Series stay Series, so the result keeps their index.

    Parameters:
    -----------
    lat1 : float or array-like
        Latitude of the first point(s) in decimal degrees
    lon1 : float or array-like
        Longitude of the first point(s) in decimal degrees
    lat2 : float or array-like
        Latitude of the second point(s) in decimal degrees
    lon2 : float or array-like
        Longitude of the second point(s) in decimal degrees
    R : float, optional (default=6371.0)
        Radius of the sphere. The default is Earth's radius in kilometers;
        pass 3956.0 to get the result in miles instead.

    Returns:
    --------
    float, np.ndarray or pd.Series
        Distance(s) between the point(s), in the same unit as ``R``
        (kilometers with the default radius).

    Example:
    --------
    >>> round(float(haversine_km(0.0, 0.0, 0.0, 90.0)), 2)  # quarter of the equator
    10007.54
    >>> round(float(haversine_km(43.659016, -79.391191, 43.657028, -79.401253)), 2)
    0.84

    Notes:
    ------
    Formula: a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
             d = 2 ⋅ R ⋅ asin(√a)
    where φ is latitude, λ is longitude, R is the sphere's radius
    """
    def _to_radians(value):
        # Objects with ``astype`` (Series, arrays, NumPy scalars) are cast in
        # place so a Series stays a Series; plain Python numbers and
        # sequences have no ``astype`` and are converted through NumPy.
        if hasattr(value, "astype"):
            return np.radians(value.astype(float))
        return np.radians(np.asarray(value, dtype=float))

    # Convert from degrees to radians
    lat1, lon1, lat2, lon2 = map(_to_radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2

    # Floating-point rounding can push ``a`` marginally outside [0, 1], which
    # would make sqrt/arcsin return NaN; clamp it back into the valid range.
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    # Calculate distance
    return 2 * R * np.arcsin(np.sqrt(a))


# 3. Import data

In [3]:
# Load the raw CSV (CSV_FILE, i.e. "addresses.csv") and persist it as a SQL
# table, overwriting any table of the same name left over from an earlier run.
df = pd.read_csv(CSV_FILE)
df.to_sql(name=TABLE_NAME, con=ENGINE, if_exists="replace", index=False)

# Read the first five rows back from the database as a sanity check that the
# write succeeded.
preview_query = text(f"SELECT * FROM {TABLE_NAME} LIMIT 5;")
with ENGINE.connect() as conn:
    data_preview = pd.read_sql(preview_query, conn)

print(data_preview)

   id                   name  category                       address  \
0   1  University of Toronto       hub  700 Example Ave, Toronto, ON   
1   2            Customer_00  customer  100 Example Ave, Toronto, ON   
2   3            Customer_01  customer      10 Maple St, Toronto, ON   
3   4            Customer_02  customer      15 Maple St, Toronto, ON   
4   5            Customer_03  customer        20 Oak St, Toronto, ON   

    latitude  longitude  
0  43.659016 -79.391191  
1  43.657028 -79.401253  
2  43.660500 -79.395000  
3  43.655800 -79.392000  
4  43.661200 -79.399500  


# 4. Feature engineering: calculate distance from University of Toronto (first row of dataset)

In [4]:
# The reference point is whatever row the database returns first (the
# University of Toronto hub in this dataset); the full table is fetched over
# the same connection.
base_query = text(f"SELECT latitude, longitude FROM {TABLE_NAME} LIMIT 1;")
full_query = text(f"SELECT * FROM {TABLE_NAME};")
with ENGINE.connect() as conn:
    base = pd.read_sql(base_query, conn)
    df = pd.read_sql(full_query, conn)

# Distance from the reference point to every row, computed in one vectorised call.
base_lat = base["latitude"][0]
base_lon = base["longitude"][0]
df["distance_to_uoft_km"] = haversine_km(base_lat, base_lon, df["latitude"], df["longitude"])

# Write the enriched table back, replacing the previous version.
df.to_sql(name=TABLE_NAME, con=ENGINE, if_exists="replace", index=False)

preview_columns = ["latitude", "longitude", "distance_to_uoft_km"]
print(df[preview_columns].head())

    latitude  longitude  distance_to_uoft_km
0  43.659016 -79.391191             0.000000
1  43.657028 -79.401253             0.839095
2  43.660500 -79.395000             0.348020
3  43.655800 -79.392000             0.363477
4  43.661200 -79.399500             0.711157
