In [1]:
!pip install geopy




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, roc_curve, auc
import time

# Baseline Model
from geopy.distance import geodesic
from sklearn.linear_model import LogisticRegression

# Tree-Based Models
from sklearn.ensemble import RandomForestClassifier

# Gradient Boosting Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Scaling (for Logistic Regression)
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the dataset from 'data.csv' using read_csv's default parsing options.
data = pd.read_csv('data.csv')

# Display the DataFrame (the whole frame is passed to display, not only its head).
display(data)

Unnamed: 0,name,state,latitude,longitude,category
0,Grand Canyon National Park,Arizona,36.11,-112.11,National Park
1,Yosemite National Park,California,37.87,-119.54,National Park
2,Yellowstone National Park,Wyoming,44.43,-110.59,National Park
3,Zion National Park,Utah,37.3,-113.03,National Park
4,Mount Rushmore,South Dakota,43.88,-103.46,Monument
5,Great Smoky Mountains,Tennessee,35.65,-83.51,Natural Wonder
6,Statue of Liberty,New York,40.69,-74.04,Historical Landmark
7,Arches National Park,Utah,38.73,-109.59,National Park
8,Niagara Falls,New York,43.1,-79.04,Natural Wonder
9,Golden Gate Bridge,California,37.82,-122.48,Landmark


In [4]:
# Determining the size of the DataFrame
n_rows, n_cols = data.shape
print(f"The DataFrame has {n_rows} rows and {n_cols} columns.")

The DataFrame has 10 rows and 5 columns.


In [5]:
# Display informative summary of the DataFrame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       10 non-null     object 
 1   state      10 non-null     object 
 2   latitude   10 non-null     float64
 3   longitude  10 non-null     float64
 4   category   10 non-null     object 
dtypes: float64(2), object(3)
memory usage: 532.0+ bytes


In [6]:
# Display descriptive statistics of the DataFrame
display(data.describe())

Unnamed: 0,latitude,longitude
count,10.0,10.0
mean,39.558,-102.739
std,3.249933,17.415909
min,35.65,-122.48
25%,37.43,-112.8
50%,38.3,-110.09
75%,42.4975,-88.4975
max,44.43,-74.04


In [7]:
data.columns

Index(['name', 'state', 'latitude', 'longitude', 'category'], dtype='object')

In [8]:
# Keep only the place name and its coordinates; 'state' and 'category' are left out.
locations = data[['name', 'latitude', 'longitude']]

### Baseline Route

In [None]:
# Create a random route: shuffle every row (frac=1) with a fixed seed so the
# order is reproducible, then renumber the index 0..n-1 in the shuffled order.
random_route = locations.sample(frac=1, random_state=7).reset_index(drop=True)

In [10]:
# Compute total distance
def total_distance(route):
    """Return the total geodesic length of a route in kilometres.

    Parameters
    ----------
    route : pandas.DataFrame
        One row per stop, in visiting order, with 'latitude' and 'longitude'
        columns.

    Returns
    -------
    float or int
        Sum of the geodesic distances between consecutive rows; 0 when the
        route has fewer than two stops.
    """
    if len(route) < 2:
        return 0

    # Pair the stops by row position rather than by index label, so the result
    # does not depend on the frame having a fresh 0..n-1 index (a shuffled or
    # filtered frame would otherwise raise KeyError or pick the wrong rows).
    stops = list(zip(route['latitude'], route['longitude']))
    return sum(geodesic(start, end).km for start, end in zip(stops, stops[1:]))

baseline_distance = total_distance(random_route)
print(f"Baseline Distance(km): {baseline_distance:.0f} km")

Baseline Distance(km): 13005 km


Load the CSV and pull out the latitude/longitude columns as a simple list of (lat, lon) pairs.

In [None]:
import pandas as pd
import random                                    
from geopy.distance import geodesic

data = pd.read_csv('data.csv')

# Extract latitude & longitude into a list of tuples
locs = list(zip(data['latitude'], data['longitude']))

Pick a random order of the locations, then calculate and print the total distance of that random route.

In [None]:
# Fixed seed so the shuffled order (and therefore its length) is reproducible.
random.seed(1)
route_rand = list(locs)
random.shuffle(route_rand)

# Total length: add up the geodesic leg between each pair of consecutive stops.
rand_km = sum(
    geodesic(here, there).km
    for here, there in zip(route_rand, route_rand[1:])
)
print(f"Random route ≈ {rand_km:.0f} km")

Implement a simple nearest-neighbor tour: start at the first point, repeatedly go to the closest unvisited location, then sum and print the total distance of that route.

In [None]:
# Work on a copy so `locs` is left untouched; the tour starts at the first point.
unvisited = list(locs)
route_nn = [unvisited.pop(0)]

while unvisited:
    last_point = route_nn[-1]
    # Position of the closest remaining point; on ties the earliest one wins.
    idx = min(
        enumerate(unvisited),
        key=lambda entry: geodesic(last_point, entry[1]).km
    )[0]
    route_nn.append(unvisited.pop(idx))

# Total length: add up the geodesic leg between each pair of consecutive stops.
nn_km = sum(
    geodesic(here, there).km
    for here, there in zip(route_nn, route_nn[1:])
)
print(f"Nearest‐neighbor ≈ {nn_km:.0f} km")

### Optimized Route

This helper computes the total length of a route in kilometres by summing the geodesic distance between each pair of consecutive stops.

In [11]:
# Function to calculate the total distance of a route
def total_distance(route):
    """Return the summed geodesic distance (km) between consecutive rows.

    `route` is a DataFrame with 'latitude' and 'longitude' columns whose row
    order is the visiting order. Rows are taken by position, so the index
    labels do not matter. A route with fewer than two rows has length 0.
    """
    if len(route) < 2:
        return 0

    # Collect the (lat, lon) pair of every stop in row order.
    coords = [(row['latitude'], row['longitude']) for _, row in route.iterrows()]

    total_km = 0
    for here, there in zip(coords, coords[1:]):
        total_km += geodesic(here, there).km
    return total_km

In [12]:
# Function to calculate the optimized route using the nearest neighbor algorithm
def nearest_neighbor(locations):
    """Build a route with the greedy nearest-neighbor heuristic.

    The route starts at the first row of `locations` and repeatedly moves to
    the closest not-yet-visited row, measured by geodesic distance. When two
    candidates are equally close, the one that appears first in `locations`
    is chosen.

    Parameters
    ----------
    locations : pandas.DataFrame
        One row per place, with 'latitude' and 'longitude' columns. The frame
        is not modified.

    Returns
    -------
    tuple
        (route_df, distance, duration) where route_df holds the rows of
        `locations` in visiting order with a fresh 0..n-1 index, distance is
        the value of total_distance(route_df), and duration is the elapsed
        time of the search in seconds. An empty input yields an empty frame
        and a distance of 0.
    """
    # perf_counter is a monotonic, high-resolution clock meant for timing.
    start_time = time.perf_counter()

    # Nothing to visit: return an empty route instead of failing on row 0.
    if len(locations) == 0:
        return locations.reset_index(drop=True), 0, time.perf_counter() - start_time

    # Work on plain coordinate tuples and row positions so the DataFrame does
    # not have to be rebuilt after every visited stop.
    coords = list(zip(locations['latitude'], locations['longitude']))
    order = [0]
    remaining = list(range(1, len(coords)))

    while remaining:
        here = coords[order[-1]]
        # min() keeps the first of several equally close candidates.
        nearest = min(remaining, key=lambda pos: geodesic(here, coords[pos]).km)
        remaining.remove(nearest)
        order.append(nearest)

    # Renumber the rows so the index is the visiting order rather than a set
    # of leftover (possibly duplicated) positions.
    route_df = locations.iloc[order].reset_index(drop=True)
    duration = time.perf_counter() - start_time
    return route_df, total_distance(route_df), duration

# Build the route with the nearest-neighbor algorithm (a greedy heuristic, so
# the result is not guaranteed to be the shortest possible route). The call
# returns the ordered route, its total distance and the time the search took.
route, distance, duration = nearest_neighbor(locations)

print(f"Optimized Distance: {distance:.0f} km")
print(f"Time taken: {duration:.2f} seconds")


Optimized Distance: 8344 km
Time taken: 0.02 seconds


from geopy.distance import geodesic

def route_distance(route):
    """Return the total geodesic length, in kilometres, of `route`.

    `route` is an indexable sequence of points accepted by `geodesic`; the
    legs between consecutive points are added up. Fewer than two points
    gives 0.
    """
    total_km = 0
    for pos in range(1, len(route)):
        total_km += geodesic(route[pos - 1], route[pos]).km
    return total_km