<a href="https://colab.research.google.com/github/OlliMulchandani/4501-Project2-Maps/blob/main/CS_4501_Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive inside the Colab runtime.
# NOTE(review): the cells below open "location-history.json" by relative path,
# not from under /content/drive — confirm where the export is expected to live.
drive.mount('/content/drive')

Mounted at /content/drive


Create Map of All Location History
---

In [None]:
import json
import pandas as pd
import folium
import numpy as np
from geopy.distance import great_circle

# Read the exported location history from disk.
with open("location-history.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect one record per visit whose top candidate carries a "geo:<lat>,<lng>" URI.
records = []
for entry in data:
    if "visit" not in entry or "topCandidate" not in entry["visit"]:
        continue
    place = entry["visit"]["topCandidate"]
    if "placeLocation" not in place or not place["placeLocation"].startswith("geo:"):
        continue

    # Everything after "geo:" is "<lat>,<lng>".
    geo_parts = place["placeLocation"].split(":")[1].split(",")
    latitude, longitude = (float(part) for part in geo_parts)

    # Length of the stay, expressed in hours.
    start_time = pd.to_datetime(entry["startTime"])
    end_time = pd.to_datetime(entry["endTime"])
    duration = (end_time - start_time).total_seconds() / 3600

    records.append(dict(latitude=latitude, longitude=longitude, duration=duration))

# One row per visit.
df = pd.DataFrame(records)

# Greedy clustering: repeatedly take the first remaining visit as the anchor,
# gather every remaining visit closer than 50 meters to it, and remove them all.
grouped_records = []
while not df.empty:
    base = df.iloc[0]
    anchor = (base["latitude"], base["longitude"])

    def _is_near_anchor(row):
        # True when this visit lies within 50 m (great-circle) of the anchor.
        return great_circle(anchor, (row["latitude"], row["longitude"])).meters < 50

    close_points = df[df.apply(_is_near_anchor, axis=1)]

    # A place only counts if it was visited more than once; it is then summarised
    # by its mean coordinates and the summed duration of its visits.
    if len(close_points) > 1:
        grouped_records.append(dict(
            latitude=close_points["latitude"].mean(),
            longitude=close_points["longitude"].mean(),
            duration=close_points["duration"].sum(),
        ))

    # The anchor is always within 0 m of itself, so at least one row leaves each pass.
    df = df.drop(close_points.index)

# One row per repeatedly-visited place.
df_grouped = pd.DataFrame(grouped_records)

# Normalize duration for better visualization.
# Each grouped place gets a marker radius and a fill opacity that grow linearly
# with the total time spent there.
min_size, max_size = 3, 20  # Marker size range
min_opacity, max_opacity = 0.3, 1.0  # Fill opacity range

if df_grouped.empty:
    # No place was visited more than once, so the grouped frame has no rows and
    # indexing its "duration" column would raise KeyError. Give it the expected
    # (empty) columns so the marker loop below simply draws nothing.
    df_grouped = df_grouped.reindex(
        columns=["latitude", "longitude", "duration", "scaled_size", "color_intensity"]
    )
else:
    shortest, longest = df_grouped["duration"].min(), df_grouped["duration"].max()
    if shortest < longest:
        df_grouped["scaled_size"] = np.interp(
            df_grouped["duration"], (shortest, longest), (min_size, max_size)
        )
        df_grouped["color_intensity"] = np.interp(
            df_grouped["duration"], (shortest, longest), (min_opacity, max_opacity)
        )
    else:
        # Every place has the same total duration. np.interp requires increasing
        # x-coordinates, so assign the top of each range explicitly instead of
        # relying on its behaviour for a zero-width interval.
        df_grouped["scaled_size"] = float(max_size)
        df_grouped["color_intensity"] = float(max_opacity)

# Open the map on a continental-US overview.
us_center = [37.0902, -95.7129]  # Geographic center of the US
m = folium.Map(location=us_center, zoom_start=4)

# Draw one red circle per grouped place; radius and fill opacity come from the
# scaled columns computed above.
for _, row in df_grouped.iterrows():
    marker = folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=row["scaled_size"],
        color="red",
        fill=True,
        fill_color="red",
        fill_opacity=row["color_intensity"],
    )
    marker.add_to(m)

# Write the map to an HTML file and tell the reader where it is.
m.save("location_history_map.html")
print("Interactive map saved as location_history_map.html. Open it in a browser to view.")


Interactive map saved as location_history_map.html. Open it in a browser to view.


Create Map of 20 Most Significant Locations
---

In [None]:
import json
import pandas as pd
import folium
import numpy as np
from geopy.distance import great_circle

# Read the exported location history from disk.
with open("location-history.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect one record per visit whose top candidate carries a "geo:<lat>,<lng>" URI.
records = []
for entry in data:
    if "visit" not in entry or "topCandidate" not in entry["visit"]:
        continue
    place = entry["visit"]["topCandidate"]
    if "placeLocation" not in place or not place["placeLocation"].startswith("geo:"):
        continue

    # Everything after "geo:" is "<lat>,<lng>".
    geo_parts = place["placeLocation"].split(":")[1].split(",")
    latitude, longitude = (float(part) for part in geo_parts)

    # Length of the stay, expressed in hours.
    start_time = pd.to_datetime(entry["startTime"])
    end_time = pd.to_datetime(entry["endTime"])
    duration = (end_time - start_time).total_seconds() / 3600

    records.append(dict(latitude=latitude, longitude=longitude, duration=duration))

# One row per visit.
df = pd.DataFrame(records)

# Greedy clustering: repeatedly take the first remaining visit as the anchor,
# gather every remaining visit closer than 50 meters to it, and remove them all.
grouped_records = []
while not df.empty:
    base = df.iloc[0]
    anchor = (base["latitude"], base["longitude"])

    def _is_near_anchor(row):
        # True when this visit lies within 50 m (great-circle) of the anchor.
        return great_circle(anchor, (row["latitude"], row["longitude"])).meters < 50

    close_points = df[df.apply(_is_near_anchor, axis=1)]

    # A place only counts if it was visited more than once; it is then summarised
    # by its mean coordinates and the summed duration of its visits.
    if len(close_points) > 1:
        grouped_records.append(dict(
            latitude=close_points["latitude"].mean(),
            longitude=close_points["longitude"].mean(),
            duration=close_points["duration"].sum(),
        ))

    # The anchor is always within 0 m of itself, so at least one row leaves each pass.
    df = df.drop(close_points.index)

# One row per repeatedly-visited place.
df_grouped = pd.DataFrame(grouped_records)

# Total duration per distinct (latitude, longitude) pair.
duration_by_place = df_grouped.groupby(["latitude", "longitude"])["duration"].sum()
location_groups = duration_by_place.reset_index()

# The 20 places with the largest total duration, longest first.
ranked_locations = location_groups.sort_values(by="duration", ascending=False)
top_20_locations = ranked_locations.head(20)

# Report each of them as text.
print("Top 20 locations with the most time spent:")
for index, row in top_20_locations.iterrows():
    print(f"Location: ({row['latitude']}, {row['longitude']}), Time Spent: {row['duration']:.2f} hours")

# Overlay them as fixed-size blue circles.
# NOTE: `m` is not created in this cell; the markers are added to whatever map
# object `m` already refers to when the cell runs.
top_marker_style = dict(
    radius=7,
    color="blue",
    fill=True,
    fill_color="blue",
    fill_opacity=0.7,
)
for _, row in top_20_locations.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        **top_marker_style,
    ).add_to(m)

# Write the combined map to its own HTML file.
m.save("location_history_top_20_map.html")
print("Updated interactive map with top 20 locations saved as location_history_top_20_map.html. Open it in a browser to view.")


Top 20 locations with the most time spent:
Location: ([REDACTED]), Time Spent: 602.53 hours
Location: ([REDACTED]), Time Spent: 201.16 hours
Location: ([REDACTED]), Time Spent: 67.50 hours
Location: ([REDACTED]), Time Spent: 30.48 hours
Location: ([REDACTED]), Time Spent: 27.06 hours
Location: ([REDACTED]), Time Spent: 24.87 hours
Location: ([REDACTED]), Time Spent: 19.03 hours
...Updated interactive map with top 20 locations saved as location_history_top_20_map.html. Open it in a browser to view.


K-Nearest Neighbors
---

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import math

# Latitude/longitude box around Charlottesville; visits outside it are ignored.
CHARLOTTESVILLE_BOUNDS = {
    'min_lat': 37.95,   # south edge
    'max_lat': 38.15,   # north edge
    'min_lon': -78.60,  # west edge
    'max_lon': -78.40,  # east edge
}

# Read the exported location history from disk.
with open("location-history.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Walk the entries in file order and keep the coordinates of every visit that
# carries a "geo:<lat>,<lng>" URI and falls inside the bounding box.
records = []
for entry in data:
    if "visit" not in entry or "topCandidate" not in entry["visit"]:
        continue
    place = entry["visit"]["topCandidate"]
    if "placeLocation" not in place or not place["placeLocation"].startswith("geo:"):
        continue

    geo_parts = place["placeLocation"].split(":")[1].split(",")
    lat, lon = (float(part) for part in geo_parts)

    inside_lat = CHARLOTTESVILLE_BOUNDS['min_lat'] <= lat <= CHARLOTTESVILLE_BOUNDS['max_lat']
    inside_lon = CHARLOTTESVILLE_BOUNDS['min_lon'] <= lon <= CHARLOTTESVILLE_BOUNDS['max_lon']
    if inside_lat and inside_lon:
        records.append((lat, lon))

# One row per retained visit, in the order the visits appear in the file.
df = pd.DataFrame(records, columns=["latitude", "longitude"])

# Prepare the data for training:
# X: All points except the last (current location)
# y: All points except the first (next location)
X = df.iloc[:-1].values
y = df.iloc[1:].values

# Number of iterations (using different random_state values)
num_iterations = 100

# Lists to store metrics for each iteration
cv_mse_list = []
test_mse_list = []
rmse_list = []
distance_miles_list = []
exact_match_accuracy_list = []
THRESHOLD_METERS = 50  # threshold for an "exact" match
EARTH_RADIUS_MILES = 3958.8  # mean Earth radius used by the haversine formula below

# The latitude tolerance does not depend on the split, so compute it once.
lat_threshold = THRESHOLD_METERS / 111000  # approx conversion for latitude

for random_state in range(num_iterations):
    # Split data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Train a K-Nearest Neighbors regressor
    knn = KNeighborsRegressor(n_neighbors=5)

    # Perform 5-fold cross-validation on the training set using MSE (negated)
    cv_scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_mse = -cv_scores.mean()
    cv_mse_list.append(cv_mse)

    # Fit the model on the full training set and predict on the test set
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Compute the Mean Squared Error (MSE) and RMSE on the test set (units: degrees)
    mse = mean_squared_error(y_test, y_pred)
    test_mse_list.append(mse)
    rmse = np.sqrt(mse)
    rmse_list.append(rmse)

    # Mean great-circle distance between predicted and true next location, in miles.
    # Multiplying the RMSE by 69 would treat a per-coordinate error in degrees as a
    # distance and would ignore that a degree of longitude is shorter than a degree
    # of latitude away from the equator. The haversine formula measures the real
    # separation and matches the error reported by the Linear Regression cell.
    true_lat, true_lon = np.radians(y_test[:, 0]), np.radians(y_test[:, 1])
    pred_lat, pred_lon = np.radians(y_pred[:, 0]), np.radians(y_pred[:, 1])
    haversine_term = (
        np.sin((pred_lat - true_lat) / 2) ** 2
        + np.cos(true_lat) * np.cos(pred_lat) * np.sin((pred_lon - true_lon) / 2) ** 2
    )
    # Clip guards arcsin against values nudged just outside [0, 1] by rounding.
    miles_off = 2 * EARTH_RADIUS_MILES * np.arcsin(np.sqrt(np.clip(haversine_term, 0.0, 1.0)))
    distance_miles = np.mean(miles_off)
    distance_miles_list.append(distance_miles)

    # Compute the "Exact Match Accuracy": a prediction matches when both its latitude
    # and its longitude differ from the truth by less than the 50 m tolerance.
    # The longitude tolerance in degrees widens with latitude, so it is computed
    # per test sample.
    lon_thresholds = THRESHOLD_METERS / (111000 * np.cos(np.radians(y_test[:, 0])))

    lat_diff = np.abs(y_pred[:, 0] - y_test[:, 0])
    lon_diff = np.abs(y_pred[:, 1] - y_test[:, 1])

    exact_matches = np.sum((lat_diff < lat_threshold) & (lon_diff < lon_thresholds))
    exact_match_accuracy = exact_matches / len(y_test)
    exact_match_accuracy_list.append(exact_match_accuracy)

# Compute average metrics over all iterations
avg_cv_mse = np.mean(cv_mse_list)
avg_test_mse = np.mean(test_mse_list)
avg_rmse = np.mean(rmse_list)
avg_distance_miles = np.mean(distance_miles_list)
avg_exact_match_accuracy = np.mean(exact_match_accuracy_list)

# Output the averaged results
print(f"Average Cross-validation MSE over {num_iterations} iterations: {avg_cv_mse:.6f}")
print(f"Average Test MSE over {num_iterations} iterations: {avg_test_mse:.6f}")
print(f"Average RMSE over {num_iterations} iterations: {avg_rmse:.6f}")
print(f"Average Predicted distance (in miles) over {num_iterations} iterations: {avg_distance_miles:.6f} miles")
print(f"Average Exact Match Accuracy (within 50 meters) over {num_iterations} iterations: {avg_exact_match_accuracy*100:.2f}%")


Average Cross-validation MSE over 100 iterations: 0.000104
Average Test MSE over 100 iterations: 0.000096
Average RMSE over 100 iterations: 0.009589
Average Predicted distance (in miles) over 100 iterations: 0.661655 miles
Average Exact Match Accuracy (within 50 meters) over 100 iterations: 1.36%


Linear Regression
---

In [None]:
import json
import pandas as pd
import numpy as np
from geopy.distance import great_circle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Latitude/longitude box around Charlottesville; visits outside it are ignored.
CHARLOTTESVILLE_BOUNDS = {
    'min_lat': 37.95,   # south edge
    'max_lat': 38.15,   # north edge
    'min_lon': -78.60,  # west edge
    'max_lon': -78.40,  # east edge
}

# Read the exported location history from disk.
with open("location-history.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep position and stay length for every visit that carries a
# "geo:<lat>,<lng>" URI and falls inside the bounding box.
records = []
for entry in data:
    if "visit" not in entry or "topCandidate" not in entry["visit"]:
        continue
    place = entry["visit"]["topCandidate"]
    if "placeLocation" not in place or not place["placeLocation"].startswith("geo:"):
        continue

    geo_parts = place["placeLocation"].split(":")[1].split(",")
    lat, lon = (float(part) for part in geo_parts)

    inside_lat = CHARLOTTESVILLE_BOUNDS['min_lat'] <= lat <= CHARLOTTESVILLE_BOUNDS['max_lat']
    inside_lon = CHARLOTTESVILLE_BOUNDS['min_lon'] <= lon <= CHARLOTTESVILLE_BOUNDS['max_lon']
    if not (inside_lat and inside_lon):
        continue

    # Stay length in hours; timestamps are only parsed for visits that pass the filter.
    start_time = pd.to_datetime(entry["startTime"])
    end_time = pd.to_datetime(entry["endTime"])
    duration = (end_time - start_time).total_seconds() / 3600
    records.append(dict(latitude=lat, longitude=lon, duration=duration))

# One row per retained visit, in file order.
df = pd.DataFrame(records)

# Create pairs of consecutive locations for prediction.
# Row i supplies the features (latitude, longitude, duration of the current visit)
# and row i + 1 supplies the target (latitude, longitude of the next visit).
# Slicing the underlying array builds every pair at once instead of materialising
# two pandas rows per iteration, mirroring how the K-Nearest Neighbors cell
# derives its X and y.
if len(df) < 2:
    # With fewer than two visits there is no (current, next) pair to learn from.
    raise ValueError("Need at least two visits inside the bounds to build (current, next) pairs")

visit_features = df[["latitude", "longitude", "duration"]].to_numpy(dtype=float)

# Features: current location (latitude, longitude) and duration
X = visit_features[:-1]
# Target: next location (latitude, longitude); copied so y owns contiguous memory
y = visit_features[1:, :2].copy()

# How many random train/test splits to average over.
num_iterations = 100

# Per-split metrics; one value is appended to each list per iteration.
rmse_list = []
mae_list = []
mean_error_miles_list = []
exact_match_accuracy_list = []

# A prediction is an "exact match" when both coordinates are within this many meters.
THRESHOLD_METERS = 50

for random_state in range(num_iterations):
    # 80/20 split, seeded by the iteration number.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Fit on the training pairs, then predict the next location for the held-out ones.
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Error in coordinate units (degrees).
    mse = mean_squared_error(y_test, y_pred)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    # Great-circle distance between each true and predicted point, in miles.
    errors_in_miles = []
    for y_true, y_hat in zip(y_test, y_pred):
        miss = great_circle((y_true[0], y_true[1]), (y_hat[0], y_hat[1]))
        errors_in_miles.append(miss.miles)
    mean_error = np.mean(errors_in_miles)

    # Tolerances in degrees: fixed for latitude, per-sample for longitude
    # (scaled by the cosine of the true latitude).
    lat_threshold = THRESHOLD_METERS / 111000
    lon_threshold = THRESHOLD_METERS / (111000 * np.cos(np.radians(y_test[:, 0])))

    lat_close = np.abs(y_pred[:, 0] - y_test[:, 0]) < lat_threshold
    lon_close = np.abs(y_pred[:, 1] - y_test[:, 1]) < lon_threshold
    exact_matches = np.sum(lat_close & lon_close)
    exact_match_accuracy = exact_matches / len(y_test)

    # Record this split's metrics.
    rmse_list.append(rmse)
    mae_list.append(mae)
    mean_error_miles_list.append(mean_error)
    exact_match_accuracy_list.append(exact_match_accuracy)

# Average each metric across all splits.
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_mean_error_miles = np.mean(mean_error_miles_list)
avg_exact_match_accuracy = np.mean(exact_match_accuracy_list)

print(f"Average RMSE over {num_iterations} iterations: {avg_rmse:.6f} degrees")
print(f"Average MAE over {num_iterations} iterations: {avg_mae:.6f} degrees")
print(f"Average Mean Error in Miles over {num_iterations} iterations: {avg_mean_error_miles:.6f} miles")
print(f"Average Exact Match Accuracy (within 50 meters) over {num_iterations} iterations: {avg_exact_match_accuracy*100:.2f}%")


Average RMSE over 100 iterations: 0.009213 degrees
Average MAE over 100 iterations: 0.005551 degrees
Average Mean Error in Miles over 100 iterations: 0.546282 miles
Average Exact Match Accuracy (within 50 meters) over 100 iterations: 2.71%


Visualize Linear Regression Prediction
---

In [None]:
import folium
from geopy.distance import great_circle

# Pick one held-out sample at random and compare its true next location with the
# model's prediction. `X_test`, `y_test` and `y_pred` are not defined in this cell;
# it uses whatever values those names currently hold.
idx = np.random.randint(len(X_test))
actual, pred = y_test[idx], y_pred[idx]

# Center the map on the true location and drop one marker per point; the
# prediction's popup also shows how far it is from the truth.
m = folium.Map(location=actual, zoom_start=14)
for point, label in (
    (actual, "Actual"),
    (pred, f"Predicted\n{great_circle(actual, pred).miles:.1f}mi"),
):
    folium.Marker(point, popup=label).add_to(m)

# Show the map inline, then write it to an HTML file.
display(m)
m.save("location_predictor_map.html")

Probabilistic Module
---

In [None]:
from collections import defaultdict
import random
import numpy as np
from sklearn.model_selection import train_test_split
import json
import pandas as pd

# --- Data extraction: visits inside the Charlottesville box, with stay length ---
with open("location-history.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Latitude/longitude box around Charlottesville; visits outside it are ignored.
CHARLOTTESVILLE_BOUNDS = {
    'min_lat': 37.95,   # south edge
    'max_lat': 38.15,   # north edge
    'min_lon': -78.60,  # west edge
    'max_lon': -78.40,  # east edge
}

records = []
for entry in data:
    if "visit" not in entry or "topCandidate" not in entry["visit"]:
        continue
    place = entry["visit"]["topCandidate"]
    if "placeLocation" not in place or not place["placeLocation"].startswith("geo:"):
        continue

    # "geo:<lat>,<lng>" -> two floats
    geo_parts = place["placeLocation"].split(":")[1].split(",")
    lat, lon = (float(part) for part in geo_parts)

    inside_lat = CHARLOTTESVILLE_BOUNDS['min_lat'] <= lat <= CHARLOTTESVILLE_BOUNDS['max_lat']
    inside_lon = CHARLOTTESVILLE_BOUNDS['min_lon'] <= lon <= CHARLOTTESVILLE_BOUNDS['max_lon']
    if not (inside_lat and inside_lon):
        continue

    # Stay length in hours; timestamps are only parsed for visits that pass the filter.
    start_time = pd.to_datetime(entry["startTime"])
    end_time = pd.to_datetime(entry["endTime"])
    duration = (end_time - start_time).total_seconds() / 3600
    records.append(dict(latitude=lat, longitude=lon, duration=duration))

# One row per retained visit, in file order.
df = pd.DataFrame(records)

# Create consecutive pairs:
# X: current location features (latitude, longitude, duration)
# y: next location (latitude, longitude)
# Row i of the frame is paired with row i + 1. Slicing the underlying array builds
# every pair at once instead of materialising two pandas rows per iteration.
if len(df) < 2:
    # With fewer than two visits there is no (current, next) pair to learn from.
    raise ValueError("Need at least two visits inside the bounds to build (current, next) pairs")

visit_features = df[["latitude", "longitude", "duration"]].to_numpy(dtype=float)
X = visit_features[:-1]
y = visit_features[1:, :2].copy()  # copied so y owns contiguous memory

# --- Probabilistic Transition Model with Iterative Random Splits ---

num_iterations = 100
exact_match_accuracy_list = []
THRESHOLD_METERS = 50

# The latitude tolerance does not depend on the split, so compute it once.
lat_threshold = THRESHOLD_METERS / 111000  # latitude conversion


def predict_next_location(test_point):
    """Sample a next location for ``test_point`` from the current transition table.

    Reads the module-level ``transition_probs``, ``transition_keys``,
    ``transition_key_coords`` and ``rng`` that the evaluation loop below rebuilds
    for every split, so it must be called after they have been set.

    Args:
        test_point: sequence of (latitude, longitude, duration).

    Returns:
        A 2-tuple (latitude, longitude) drawn according to the observed
        transition frequencies of the matching key. When the exact key was never
        seen in training, the key whose (latitude, longitude) is closest to the
        test point is used instead.
    """
    key = tuple(test_point)
    if key not in transition_probs:
        # Nearest seen key, compared on latitude and longitude only. argmin
        # returns the first minimum, so ties resolve to the earliest key.
        offsets = transition_key_coords - np.asarray(test_point[:2])
        key = transition_keys[int(np.argmin(np.linalg.norm(offsets, axis=1)))]
    next_locs, probs = zip(*transition_probs[key].items())
    return rng.choices(next_locs, weights=probs)[0]


for random_state in range(num_iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Seed the sampler per split so repeated runs report the same accuracy.
    rng = random.Random(random_state)

    # Build transition counts using X_train as keys and the corresponding y_train
    # as the next location. X_train[i] and y_train[i] are already aligned pairs,
    # so every training row is counted (including the last one).
    transition_counts = defaultdict(lambda: defaultdict(int))
    for current_features, next_coords in zip(X_train, y_train):
        current_loc = tuple(current_features)  # (latitude, longitude, duration)
        next_loc = tuple(next_coords)          # (latitude, longitude)
        transition_counts[current_loc][next_loc] += 1

    # Convert counts to probabilities.
    transition_probs = {
        loc: {next_loc: count / sum(next_steps.values())
              for next_loc, count in next_steps.items()}
        for loc, next_steps in transition_counts.items()
    }

    # Key list and matching coordinate array for the nearest-key fallback.
    transition_keys = list(transition_probs)
    transition_key_coords = np.array([loc[:2] for loc in transition_keys])

    # Predict the next location for each test sample.
    # The prediction is always a 2-tuple (latitude, longitude).
    y_pred_prob = np.array([predict_next_location(test_point) for test_point in X_test])

    # For longitude, the tolerance in degrees is adjusted for the latitude of each test sample.
    lon_thresholds = THRESHOLD_METERS / (111000 * np.cos(np.radians(y_test[:, 0])))

    lat_diff = np.abs(y_pred_prob[:, 0] - y_test[:, 0])
    lon_diff = np.abs(y_pred_prob[:, 1] - y_test[:, 1])

    exact_matches = np.sum((lat_diff < lat_threshold) & (lon_diff < lon_thresholds))
    exact_match_accuracy = exact_matches / len(y_test)
    exact_match_accuracy_list.append(exact_match_accuracy)

avg_exact_match_accuracy = np.mean(exact_match_accuracy_list)
print(f"Average Exact Match Accuracy (within 50 meters, probabilistic model) over {num_iterations} iterations: {avg_exact_match_accuracy*100:.2f}%")


Average Exact Match Accuracy (within 50 meters, probabilistic model) over 100 iterations: 19.09%


Google Places
---

In [None]:
import requests
from google.colab import userdata

# API key is looked up through google.colab's `userdata` under the name
# 'GOOGLE_PLACES_API_KEY' instead of being written into the notebook.
GOOGLE_PLACES_API_KEY = userdata.get('GOOGLE_PLACES_API_KEY')

# Google Places API URL (Nearby Search endpoint, JSON output)
GOOGLE_PLACES_API_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

# Function to call Google Places API
def get_google_place_info(lat, lon, radius=50, timeout=10):
    """Look up the first Nearby Search result around a coordinate.

    Args:
        lat: latitude of the search center.
        lon: longitude of the search center.
        radius: search radius sent to the API (default 50).
        timeout: seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the notebook (default 10).

    Returns:
        A dict with keys "name", "address" and "categories" describing the first
        result, or None when the API returned no results or reported an error.
        Errors are surfaced as a warning rather than silently looking like
        "nothing found".
    """
    import warnings

    params = {
        "location": f"{lat},{lon}",
        "radius": radius,
        "key": GOOGLE_PLACES_API_KEY
    }
    response = requests.get(GOOGLE_PLACES_API_URL, params=params, timeout=timeout)
    if not response.ok:
        warnings.warn(f"Google Places request failed with HTTP {response.status_code}")
        return None

    payload = response.json()

    # The API reports problems such as a rejected key in the body's "status" field;
    # anything other than OK / ZERO_RESULTS is worth telling the user about.
    status = payload.get("status")
    if status not in (None, "OK", "ZERO_RESULTS"):
        detail = payload.get("error_message", "no error message")
        warnings.warn(f"Google Places API returned status {status}: {detail}")
        return None

    results = payload.get("results", [])
    if not results:
        return None

    top_result = results[0]
    return {
        "name": top_result.get("name", "Unknown"),
        "address": top_result.get("vicinity", "No address available"),
        "categories": list(top_result.get("types", [])),
    }

# Call the API for each of the top 10 locations.
# `top_20_locations` (built in the "20 Most Significant Locations" cell) is sorted
# by descending duration, so its first ten rows are the ten places with the most
# time spent. Deriving the frame here keeps the cell runnable on a fresh kernel,
# where no `top_10_locations` would otherwise exist.
top_10_locations = top_20_locations.head(10)

print("\nFetching place details from Google Places API...\n")
for index, row in top_10_locations.iterrows():
    place_info = get_google_place_info(row["latitude"], row["longitude"])
    if place_info:
        print(f"Location: ({row['latitude']}, {row['longitude']}), Time Spent: {row['duration']:.2f} hours")
        print(f"  - Name: {place_info['name']}")
        print(f"  - Address: {place_info['address']}")
        print(f"  - Categories: {', '.join(place_info['categories'])}\n")
    else:
        print(f"Location: ({row['latitude']}, {row['longitude']}) - No details found.\n")


Foursquare
---



In [None]:
import requests

from google.colab import userdata

# API key is looked up through google.colab's `userdata` instead of being written
# into the notebook.
# NOTE(review): the lookup name 'YOUR_FOURSQUARE_API_KEY' reads like a template
# placeholder — confirm it matches the name the secret is actually stored under.
FOURSQUARE_API_KEY = userdata.get('YOUR_FOURSQUARE_API_KEY')

# Foursquare Places API URL (v3 place search endpoint)
FOURSQUARE_API_URL = "https://api.foursquare.com/v3/places/search"

# Headers for API request: ask for JSON and pass the key in the Authorization header.
HEADERS = {
    "Accept": "application/json",
    "Authorization": FOURSQUARE_API_KEY
}

# Function to call Foursquare API
def get_foursquare_place_info(lat, lon, radius=50, timeout=10):
    """Look up the first Foursquare place-search result around a coordinate.

    Args:
        lat: latitude of the search center.
        lon: longitude of the search center.
        radius: search radius sent to the API (default 50).
        timeout: seconds to wait for the HTTP response before giving up, so a
            stalled connection cannot hang the notebook (default 10).

    Returns:
        A dict with keys "name", "address" and "categories" describing the first
        result, or None when the API returned no results or the request failed.
        A failed request is surfaced as a warning rather than silently looking
        like "nothing found".
    """
    import warnings

    params = {
        "ll": f"{lat},{lon}",
        "radius": radius,
        "limit": 1  # Only the first result is used
    }
    response = requests.get(FOURSQUARE_API_URL, headers=HEADERS, params=params, timeout=timeout)
    if not response.ok:
        warnings.warn(f"Foursquare request failed with HTTP {response.status_code}")
        return None

    results = response.json().get("results", [])
    if not results:
        return None

    top_result = results[0]
    return {
        "name": top_result.get("name", "Unknown"),
        "address": top_result.get("location", {}).get("formatted_address", "No address available"),
        # Skip category entries that carry no name instead of raising KeyError.
        "categories": [cat["name"] for cat in top_result.get("categories", []) if "name" in cat],
    }

# Call the API for each of the top 10 locations.
# `top_20_locations` (built in the "20 Most Significant Locations" cell) is sorted
# by descending duration, so its first ten rows are the ten places with the most
# time spent. Deriving the frame here keeps the cell runnable on a fresh kernel,
# where no `top_10_locations` would otherwise exist.
top_10_locations = top_20_locations.head(10)

print("\nFetching place details from Foursquare API...\n")
for index, row in top_10_locations.iterrows():
    place_info = get_foursquare_place_info(row["latitude"], row["longitude"])
    if place_info:
        print(f"Location: ({row['latitude']}, {row['longitude']}), Time Spent: {row['duration']:.2f} hours")
        print(f"  - Name: {place_info['name']}")
        print(f"  - Address: {place_info['address']}")
        print(f"  - Categories: {', '.join(place_info['categories'])}\n")
    else:
        print(f"Location: ({row['latitude']}, {row['longitude']}) - No details found.\n")
