In [None]:
!pip install geopy

In [None]:
import pandas as pd
import numpy as np
import re
from geopy.distance import geodesic
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the 2023 ride export (path is relative to the working directory).
df = pd.read_csv('2023_dataset.csv')
# Preview of the first rows. As a bare expression its result is discarded
# unless it is the last statement of a notebook cell.
df.head()

# Discard rides that lack an id, either timestamp, or any start/end coordinate.
required_columns = [
    'ride_id',
    'started_at',
    'ended_at',
    'start_lat',
    'start_lng',
    'end_lat',
    'end_lng',
]
df = df.dropna(subset=required_columns)

# Repair timestamps whose year was written with leading zeros (e.g. '0023').
def correct_year(date_str):
    """Return ``date_str`` with a zero-padded '23' year prefix rewritten to '2023'.

    Only strings are touched: when the value starts with two or more zeros
    followed by '23', that prefix is replaced by '2023'. Strings without such
    a prefix, and any non-string value (NaN, None, numbers, ...), are returned
    unchanged.
    """
    if not isinstance(date_str, str):
        return date_str
    return re.sub(r'^0{2,}23', '2023', date_str)

# Repair zero-padded year prefixes before the timestamps are parsed.
df['started_at'] = df['started_at'].apply(correct_year)
df['ended_at'] = df['ended_at'].apply(correct_year)

# Parse both timestamp columns into pandas datetimes.
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# Ride duration in minutes; negative when ended_at precedes started_at.
df['ride_length_minutes'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

# Keep only rides strictly longer than 1 minute and strictly shorter than
# 24 hours (1440 minutes); this also removes zero and negative durations.
# The explicit .copy() makes the filtered frame independent of the unfiltered
# one, so the column assignments that follow are unambiguous and do not raise
# pandas' SettingWithCopyWarning.
plausible_duration = (df['ride_length_minutes'] > 1) & (df['ride_length_minutes'] < 1440)
df = df[plausible_duration].copy()

# Derive calendar features from the ride start time.
df['weekday'] = df['started_at'].dt.day_name()
df['month'] = df['started_at'].dt.month
df['start_hour'] = df['started_at'].dt.hour

# Per-ride start-to-end distance in miles, computed with geopy's `geodesic`
# (geopy documents this as an ellipsoidal-model distance, not the spherical
# Haversine formula).
def calculate_distance(row):
    """Return the distance in miles between a ride's start and end points.

    ``row`` must support key lookup for 'start_lat', 'start_lng', 'end_lat'
    and 'end_lng' (for example a DataFrame row passed by ``apply(axis=1)``).
    The two (latitude, longitude) pairs are handed to ``geopy.distance.geodesic``
    and the result is read back in miles.
    """
    origin = row['start_lat'], row['start_lng']
    destination = row['end_lat'], row['end_lng']
    trip = geodesic(origin, destination)
    return trip.miles

# One distance computation per ride, evaluated row by row.
df['distance_miles'] = df.apply(calculate_distance, axis='columns')

# Work on an independent copy that holds only the casual riders.
is_casual = df['member_casual'] == 'casual'
casual_df = df[is_casual].copy()

# Duplicate three columns under the shorter names used for feature building.
for alias, source in (
    ('duration_min', 'ride_length_minutes'),
    ('hour', 'start_hour'),
    ('weekday_name', 'weekday'),
):
    casual_df[alias] = casual_df[source]

# Construct behavior features, one output row per distinct 'ride_id'.
# NOTE(review): the grouping key is 'ride_id'. If ride ids are unique per ride
# (presumably - confirm against the dataset), every group holds a single row
# and each aggregate below simply echoes that row's value.
def _first_mode_or(values, fallback):
    """Return the first entry of ``values.mode()``, or ``fallback(values)`` if it is empty.

    The mode is computed once per group instead of once for the emptiness
    check and once more for the lookup.
    """
    modes = values.mode()
    if modes.empty:
        return fallback(values)
    return modes.iloc[0]


def _common_hour(hours):
    """Most frequent start hour of a group, falling back to the group's median."""
    return _first_mode_or(hours, lambda s: s.median())


def _common_day(days):
    """Most frequent weekday name of a group, falling back to one randomly sampled value."""
    return _first_mode_or(days, lambda s: s.sample(1).values[0])


# Named aggregation ties every output column name to its source column and
# reducer, so the result no longer depends on a positional rename that would
# silently mislabel columns if the aggregation spec were reordered.
user_features = casual_df.groupby('ride_id').agg(
    avg_duration=('duration_min', 'mean'),
    avg_distance=('distance_miles', 'mean'),
    common_hour=('hour', _common_hour),
    common_day=('weekday_name', _common_day),
    median_lat=('start_lat', 'median'),
    median_lng=('start_lng', 'median'),
).reset_index()

# Numeric columns that feed the clustering; the 'common_day' label is left out.
features = [
    'avg_duration',
    'avg_distance',
    'common_hour',
    'median_lat',
    'median_lng',
]
X = user_features[features]

# Put all features on a comparable scale before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: fit K-Means for K = 1..9 and record each model's inertia (SSE).
k_range = range(1, 10)
sse = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    sse.append(kmeans.inertia_)

# Draw SSE against K so the bend in the curve can be read off by eye.
plt.figure(figsize=(8, 4))
plt.plot(k_range, sse, marker='o')
plt.title('Elbow Method for Determining Optimal K')
plt.xlabel('Number of clusters (K)')
plt.ylabel('SSE')
plt.show()

# Final clustering with MiniBatch K-Means.
k = 2  # Set to optimal K if determined
mbk = MiniBatchKMeans(n_clusters=k, batch_size=1000, random_state=42)
cluster_labels = mbk.fit_predict(X_scaled)
user_features['cluster'] = cluster_labels

# Per-cluster mean of every clustering feature, rounded to two decimals.
per_cluster_means = user_features.groupby('cluster')[features].mean()
cluster_summary = per_cluster_means.round(2)
print(cluster_summary)

# Optional: project the scaled features onto two principal components so the
# clusters can be drawn in 2D.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
for component_index, column_name in enumerate(('pca1', 'pca2')):
    user_features[column_name] = X_pca[:, component_index]

# One scatter series per cluster id in 0..k-1, so each gets its own legend entry.
plt.figure(figsize=(8, 6))
for c in range(k):
    in_cluster = user_features['cluster'] == c
    subset = user_features[in_cluster]
    plt.scatter(subset['pca1'], subset['pca2'], alpha=0.6, label=f'Cluster {c}')
plt.title("MiniBatch K-Means Cluster Visualization (PCA)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
