In [1]:
!pip install geopy




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import random
import plotly.express as px 
import plotly.graph_objects as go

# Import geopy for distance calculations
from geopy.distance import geodesic

## Preprocessing

In [3]:
# Load the dataset from data.csv using pandas' default parsing options.
data = pd.read_csv('data.csv')

# Display the whole DataFrame (the output below shows all 10 rows).
display(data)

Unnamed: 0,name,state,latitude,longitude,category
0,Grand Canyon National Park,Arizona,36.11,-112.11,National Park
1,Yosemite National Park,California,37.87,-119.54,National Park
2,Yellowstone National Park,Wyoming,44.43,-110.59,National Park
3,Zion National Park,Utah,37.3,-113.03,National Park
4,Mount Rushmore,South Dakota,43.88,-103.46,Monument
5,Great Smoky Mountains,Tennessee,35.65,-83.51,Natural Wonder
6,Statue of Liberty,New York,40.69,-74.04,Historical Landmark
7,Arches National Park,Utah,38.73,-109.59,National Park
8,Niagara Falls,New York,43.1,-79.04,Natural Wonder
9,Golden Gate Bridge,California,37.82,-122.48,Landmark


In [4]:
# Unpack the DataFrame's shape into row and column counts and report them
n_rows, n_cols = data.shape
print(f"The DataFrame has {n_rows} rows and {n_cols} columns.")

The DataFrame has 10 rows and 5 columns.


In [5]:
# Print a concise summary of the DataFrame: columns, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       10 non-null     object 
 1   state      10 non-null     object 
 2   latitude   10 non-null     float64
 3   longitude  10 non-null     float64
 4   category   10 non-null     object 
dtypes: float64(2), object(3)
memory usage: 532.0+ bytes


In [6]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
display(data.describe())

Unnamed: 0,latitude,longitude
count,10.0,10.0
mean,39.558,-102.739
std,3.249933,17.415909
min,35.65,-122.48
25%,37.43,-112.8
50%,38.3,-110.09
75%,42.4975,-88.4975
max,44.43,-74.04


In [7]:
# Keep only the columns the routing code uses: the place name and its coordinates
locations = data[['name', 'latitude', 'longitude']]

### Baseline Route

In [8]:
# Baseline route: shuffle every row (frac=1) with a fixed random_state so the order is
# reproducible, then renumber the shuffled rows from 0 with reset_index(drop=True)
random_route = locations.sample(frac=1, random_state=7).reset_index(drop=True)

In [9]:
# Compute total distance
def total_distance(route):
    """Return the length, in kilometres, of the open path through ``route``.

    Parameters
    ----------
    route : pandas.DataFrame
        Stops in visiting order, with ``latitude`` and ``longitude`` columns.
        Rows are read by position, so any index (shuffled, filtered,
        duplicated) is accepted.

    Returns
    -------
    int or float
        Sum of the geodesic distances between consecutive rows; ``0`` when
        the route has fewer than two rows. No return leg from the last stop
        back to the first is added.

    Raises
    ------
    KeyError
        If ``route`` has no ``latitude`` or ``longitude`` column.
    """
    # Pair the coordinate columns positionally instead of looking rows up with
    # .loc[i], which raised KeyError whenever the index was not exactly 0..n-1.
    stops = list(zip(route['latitude'], route['longitude']))
    return sum(geodesic(here, there).km for here, there in zip(stops, stops[1:]))

# Measure the shuffled baseline route and print its length rounded to whole kilometres
baseline_distance = total_distance(random_route)
print(f"Baseline Distance(km): {baseline_distance:.0f} km")

Baseline Distance(km): 13005 km


### Optimized Route

In [10]:
def total_distance(route):
    """Sum the geodesic length, in km, of every consecutive leg of ``route``.

    ``route`` is a DataFrame of stops in visiting order with ``latitude`` and
    ``longitude`` columns. Rows are addressed by position (``iloc``), so the
    index labels do not matter. Returns ``0`` for fewer than two stops.
    """
    def point(position):
        # (lat, lon) pair of the stop at the given row position
        stop = route.iloc[position]
        return (stop['latitude'], stop['longitude'])

    return sum(
        geodesic(point(nxt - 1), point(nxt)).km
        for nxt in range(1, len(route))
    )

def nearest_neighbor(locations):
    """Order ``locations`` with the greedy nearest-neighbor heuristic.

    The tour starts at the first row and repeatedly moves to the closest
    not-yet-visited row (geodesic distance). Ties go to the row that appears
    first in ``locations``. The result is a heuristic route, not a guaranteed
    shortest one.

    Parameters
    ----------
    locations : pandas.DataFrame
        One row per place, with ``latitude`` and ``longitude`` columns. The
        frame is not modified.

    Returns
    -------
    route_df : pandas.DataFrame
        The rows of ``locations`` in visiting order, with a fresh 0..n-1
        index so positional and label-based access agree.
    distance : int or float
        Length of the open path in kilometres (no return leg); ``0`` for
        zero or one location.
    duration : float
        Seconds spent computing the route.
    """
    # perf_counter is monotonic and high-resolution, unlike the wall clock.
    start_time = time.perf_counter()

    stops = locations.reset_index(drop=True)
    if len(stops) == 0:
        # Nothing to visit; previously this crashed on iloc[0].
        return stops, 0, time.perf_counter() - start_time

    coords = list(zip(stops['latitude'], stops['longitude']))
    visit_order = [0]
    unvisited = list(range(1, len(coords)))  # kept in original row order
    total_km = 0

    while unvisited:
        here = coords[visit_order[-1]]
        # Tuples compare by distance first, then by row position, so equal
        # distances resolve to the earliest remaining row.
        leg_km, nearest = min(
            (geodesic(here, coords[candidate]).km, candidate)
            for candidate in unvisited
        )
        unvisited.remove(nearest)
        visit_order.append(nearest)
        # The chosen leg is exactly the next leg of the route, so summing it
        # here avoids recomputing every geodesic afterwards.
        total_km += leg_km

    route_df = stops.iloc[visit_order].reset_index(drop=True)
    duration = time.perf_counter() - start_time
    return route_df, total_km, duration

# Run the nearest-neighbor heuristic on the named locations, then report the
# route length (whole kilometres) and the elapsed time
route, distance, duration = nearest_neighbor(locations)

print(f"Optimized Distance: {distance:.0f} km")
print(f"Time taken: {duration:.2f} seconds")


Optimized Distance: 8344 km
Time taken: 0.04 seconds


Using the loaded CSV, pull out the latitude/longitude columns as a simple list of (lat, lon) pairs.

In [11]:
# Extract latitude & longitude into a list of (lat, lon) tuples, one per row of the dataset
locs = list(zip(data['latitude'], data['longitude']))

The code below picks a random order of the locations, then calculates and prints the total distance of that random route.

In [12]:
# Shuffle a copy of the coordinate list; the fixed seed makes the order reproducible
random.seed(1)
route_rand = list(locs)
random.shuffle(route_rand)

# Add up the geodesic length of every consecutive pair of stops
rand_km = sum(
    geodesic(origin, destination).km
    for origin, destination in zip(route_rand, route_rand[1:])
)
print(f"Random route ≈ {rand_km:.0f} km")

Random route ≈ 14130 km


The code below implements a simple nearest-neighbor tour: start at the first point, repeatedly go to the closest unvisited location, then sum and print the total distance of that route.

In [13]:
# Greedy nearest-neighbor tour over the raw (lat, lon) tuples
unvisited = list(locs)
route_nn = [unvisited.pop(0)]  # the tour starts at the first location

while len(unvisited) > 0:
    last_point = route_nn[-1]
    # Distance from the current stop to every remaining candidate
    gaps = [geodesic(last_point, candidate).km for candidate in unvisited]
    idx = gaps.index(min(gaps))  # the first occurrence wins ties
    route_nn.append(unvisited.pop(idx))

# Total length of the open path through the ordered stops
nn_km = sum(
    geodesic(origin, destination).km
    for origin, destination in zip(route_nn, route_nn[1:])
)
print(f"Nearest‐neighbor ≈ {nn_km:.0f} km")

Nearest‐neighbor ≈ 8344 km


## Data Visualization

In [14]:
# Build one record per consecutive pair of stops on the optimized route:
# a "start → end" label plus the geodesic length of that leg in km
stops = list(zip(route['name'], route['latitude'], route['longitude']))
leg_dists = [
    {
        'Leg': f"{name_a} → {name_b}",
        'Distance_km': geodesic((lat_a, lon_a), (lat_b, lon_b)).km,
    }
    for (name_a, lat_a, lon_a), (name_b, lat_b, lon_b) in zip(stops, stops[1:])
]

# Tabular form used by the bar, donut and cumulative charts
leg_dists_df = pd.DataFrame(leg_dists)

In [15]:
# Map visualization: every location as a marker, plus the optimized and the
# random route drawn as lines on an OpenStreetMap base layer
map_fig = go.Figure()

# Markers for all locations
map_fig.add_trace(go.Scattermapbox(
    lat=locations['latitude'],
    lon=locations['longitude'],
    mode='markers',
    marker=dict(size=8, color='blue'),
    text=locations['name'],
    name='Locations'
))

# Draw each route as an open path from its first to its last stop. The
# distances quoted in the title are measured without a return leg, so the
# lines must not loop back to the start either.
for path_df, colour, label in (
    (route, 'red', 'Optimized Route'),
    (random_route, 'green', 'Random Route'),
):
    map_fig.add_trace(go.Scattermapbox(
        lat=path_df['latitude'].tolist(),
        lon=path_df['longitude'].tolist(),
        mode='lines+markers',
        marker=dict(size=6, color=colour),
        line=dict(width=2, color=colour),
        name=label
    ))

# Centre the view on the mean coordinate of all locations
map_fig.update_layout(
    mapbox_style='open-street-map',
    mapbox_center={"lat": locations['latitude'].mean(), "lon": locations['longitude'].mean()},
    mapbox_zoom=4,
    margin=dict(t=40, b=40),
    title=f"Routes Comparison<br><sub>Initial: {baseline_distance:.2f} km | Optimized: {distance:.2f} km | Time: {duration:.2f} sec</sub>"
)
map_fig.show()


The map above compares two travel routes through the selected top 10 landmarks and natural attractions in the United States: a randomly ordered route and an optimized route (a shorter route built with the nearest-neighbor heuristic) between those 10 places.

The route comparison visualization clearly demonstrates the significant impact of optimization on travel distance.
Of the two routes compared:
- A randomly generated route covered a total distance of 13,005.42 km.
- An optimized route minimized the travel distance to 8,344.50 km. 

This is a reduction of approximately 4,660 km, an efficiency gain of over 35%. The optimization process was completed in just 0.04 seconds, indicating the use of a highly efficient algorithm suitable for real-time applications and showcasing how route optimization can be useful for saving time, fuel, and money in real-life travel or delivery planning.

In [16]:
# Bar chart: one bar per leg of the optimized route, shaded by the leg's length
bar_fig = px.bar(
    leg_dists_df,
    x='Leg',
    y='Distance_km',
    title='Optimized Route: Distance per Leg',
    labels={'Distance_km': 'Distance (km)'},
    hover_data={'Distance_km': ':.1f'},
    color='Distance_km',
    color_continuous_scale=px.colors.sequential.Oranges,
)

# Slant the long "start → end" tick labels and leave room for them below the axis
bar_fig.update_layout(xaxis_tickangle=-45, margin={'t': 40, 'b': 150})
bar_fig.show()


The bar graph above shows the distance between each pair of consecutive stops in the optimized travel route. Here the X-axis represents each leg of the trip, while the Y-axis shows the distance in kilometers. The bars are color-coded according to distance, with lighter colors indicating shorter legs and darker shades indicating longer ones. This makes it easy to visually spot which parts of the journey are the longest.

Key Observations
- The longest leg in the optimized route is between Golden Gate Bridge → Great Smoky Mountains, covering over 3,000 km. This is also visually confirmed by the darkest color bar.
- The shortest leg is between Grand Canyon National Park → Zion National Park, with a distance under 300 km.
- Most legs fall in the 500–1500 km range, which reflects efficient routing between nearby landmarks.
- Only one leg (Golden Gate to Great Smoky Mountains) stands out as a long-distance jump, which likely couldn’t be avoided given the geographic locations.

Conclusions

The bar chart shows how distance is distributed across each leg of the optimized route. It highlights that while most stops are relatively close to one another, certain legs are significantly longer, possibly due to unavoidable geographic separation.

This kind of analysis is helpful in planning rest stops, fuel usage, and time allocation. It reinforces the importance of optimization in managing long-distance travel effectively — especially when some destinations are far apart.

In [17]:
# Donut chart: each leg's share of the total optimized distance
leg_share = go.Pie(
    values=leg_dists_df['Distance_km'],
    labels=leg_dists_df['Leg'],
    sort=False,  # keep the slices in route order
    hole=0.4,    # the hole turns the pie into a donut
)
fig_donut = go.Figure(data=[leg_share])
fig_donut.update_layout(
    title="Distance Share per Leg",
    margin={'l': 20, 'r': 20, 't': 40, 'b': 20},
)
fig_donut.show()



The pie chart above represents the percentage of the total travel distance that each segment (leg) contributes to the optimized travel route. Each segment is labeled and color-coded to show how much of the total journey it takes up. This helps visualize which legs are the longest and how the distance is distributed across the entire route.

Key Observations
- The largest share of the total distance (41.5%) comes from the leg Golden Gate Bridge → Great Smoky Mountains, confirming it as the longest and most significant stretch of the trip.

- The shortest leg, Grand Canyon National Park → Zion National Park, contributes only 1.86% of the total distance.

- Most other legs fall between 4% and 18%, showing that the rest of the trip is more evenly spread out in terms of distance.

- The second-largest leg is Mount Rushmore → Yosemite National Park, which accounts for 18.1% of the total distance.

Conclusion

This chart clearly shows that the overall travel distance is not evenly distributed. While most legs cover moderate distances, a single leg (Golden Gate to Great Smoky Mountains) accounts for nearly half the total journey. This emphasizes how a single long stretch can significantly impact the total route length.

Understanding distance share helps in budgeting, fuel planning, and time management, especially when planning long trips. It also reinforces the value of optimized route planning to minimize the impact of longer unavoidable legs.

In [18]:
# Cumulative distance chart: running total of kilometres after each leg
cumulative = np.cumsum(leg_dists_df['Distance_km'])

running_total = go.Scatter(
    x=leg_dists_df['Leg'],
    y=cumulative,
    mode="lines+markers",
    marker={'size': 8},
    line={'width': 3, 'color': 'orange'},
)
fig_cumulative = go.Figure(data=[running_total])
fig_cumulative.update_layout(
    title="Cumulative Distance by Stop",
    xaxis_title="Leg",
    yaxis_title="Distance (km)",
    margin={'l': 40, 'r': 20, 't': 40, 'b': 40},
)

fig_cumulative.show()


The line graph above shows the growing total distance (km) traveled as the trip progresses through multiple US landmarks and national parks.

Key Observations
- The journey begins at Grand Canyon National Park with an initial cumulative distance near 0 km.
- There’s a steady increase in cumulative distance as the route continues through Zion, Arches, and Yellowstone National Parks. The climb in distance here is consistent but moderate.
- A noticeable steep rise is observed on the leg from the Golden Gate Bridge to the Great Smoky Mountains, indicating a significant geographical leap. This is the largest increase between any two points, matching the longest leg identified in the bar and donut charts.
- After reaching Golden Gate Bridge, the route continues to Great Smoky Mountains, Niagara Falls, and finally the Statue of Liberty, with the total distance approaching around 8300 km.
- The legs after that jump (on to Niagara Falls and the Statue of Liberty) start from a higher base distance but add relatively smaller increases than the Golden Gate–Great Smoky Mountains segment.

Conclusion:
The cumulative distance graph highlights a scenic cross-country journey that starts in the western U.S. and ends in the northeast. While most travel segments build distance gradually, the leg from the Golden Gate Bridge to the Great Smoky Mountains stands out as a major jump—the crossing from the West Coast to the eastern United States. Overall, the graph effectively tracks how each stop adds to the journey, culminating in a total distance of over 8000 km, showcasing the vast geographic spread and diversity of the route.
