In [1]:
from sklearn.cluster import KMeans
from geopy.distance import geodesic
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the two input tables, both addressed relative to the working directory:
# the taxi trip records (CSV) and the taxi stand list (Excel workbook).
taxi_data = pd.read_csv(filepath_or_buffer='anonymized-taxi-data.csv')
taxi_stand_data = pd.read_excel(io='TaxiRanks.xlsx')

In [3]:
# Extract pick-up locations (StartLat, StartLon), discarding trips with a
# missing coordinate. The explicit .copy() makes pickup_coords an independent
# frame, so adding the 'Cluster' column below cannot trigger
# SettingWithCopyWarning or be mistaken for a write into taxi_data.
pickup_coords = taxi_data[['StartLat', 'StartLon']].dropna().copy()

# Clustering high-demand areas: group the pick-up points into 50 clusters.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto' in
# 1.4); fixing it together with random_state keeps the cluster assignment
# reproducible across library versions.
# NOTE(review): KMeans measures Euclidean distance on raw degrees, which only
# approximates ground distance -- confirm this is acceptable for the study area.
kmeans = KMeans(n_clusters=50, n_init=10, random_state=42).fit(pickup_coords)

# labels_ holds one cluster index per fitted row, in the same row order as
# pickup_coords, so a positional assignment lines up correctly.
pickup_coords['Cluster'] = kmeans.labels_

In [4]:
# Function to calculate the distance to the nearest taxi stand
def distance_to_nearest_stand(cluster_center, taxi_stands):
    """Return the geodesic distance in meters from a point to its closest taxi stand.

    Parameters
    ----------
    cluster_center : sequence of two floats
        (latitude, longitude) of the point to measure from.
    taxi_stands : pandas.DataFrame
        Must contain 'Latitude' and 'Longitude' columns, one row per stand.

    Returns
    -------
    float
        Distance in meters to the nearest stand, or ``float('inf')`` when
        no stand has both coordinates present (including an empty frame).
    """
    # A stand with a missing coordinate cannot be measured, so leave it out.
    stands = taxi_stands[['Latitude', 'Longitude']].dropna()
    if stands.empty:
        # No usable stand: report "infinitely far" so that a threshold
        # comparison such as `dist > 500` still yields a plain boolean.
        return float('inf')

    # Iterating the two columns directly avoids the per-row Series that
    # DataFrame.apply(axis=1) would build for every stand.
    return min(
        geodesic(cluster_center, (lat, lon)).meters
        for lat, lon in zip(stands['Latitude'], stands['Longitude'])
    )

In [5]:
# Calculate cluster centers
# Each center is (StartLat, StartLon) because KMeans was fitted on those two
# columns in that order, which matches the (lat, lon) order geodesic expects.
cluster_centers = kmeans.cluster_centers_
taxi_stands = taxi_stand_data[['Latitude', 'Longitude']]

# A cluster counts as lacking a stand when its center is farther than this
# from every stand.
STAND_DISTANCE_THRESHOLD_M = 500  # Threshold of 500 meters

# Check each cluster for proximity to nearest taxi stand
# NOTE(review): the selection below uses stand distance only; no measure of
# demand volume enters the decision despite the variable and label names.
high_demand_clusters = []
for i, center in enumerate(cluster_centers):
    dist = distance_to_nearest_stand(center, taxi_stands)
    if dist > STAND_DISTANCE_THRESHOLD_M:
        high_demand_clusters.append(i)

# Label clusters as high-demand or covered
# isin() tests membership for the whole column at once instead of running a
# Python-level list lookup per row; map() then turns the boolean mask into
# the two label strings.
lacks_nearby_stand = pickup_coords['Cluster'].isin(high_demand_clusters)
pickup_coords['Label'] = lacks_nearby_stand.map({
    True: 'High-demand without taxi stand',
    False: 'Low-demand or covered',
})

In [6]:
# Write the pick-up table to a CSV file in the working directory; the
# DataFrame index is not written as a column.
pickup_coords.to_csv(path_or_buf='taxi-training-data.csv', index=False)