In [None]:
# --------------------------------------------------
# 1️⃣ Setup & Load
# --------------------------------------------------

import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the dataset
# The CSV location is resolved instead of being pinned to one machine:
#   1. the VOI_RIDES_CSV environment variable, when set, is used as-is;
#   2. otherwise the project's data/ folder is tried relative to the working
#      directory (notebook started from the project root or from a subfolder);
#   3. finally the original author's absolute path is kept as a legacy fallback.
DATA_FILENAME = "voi_rides_stockholm.csv"
LEGACY_DATA_PATH = Path(
    r"C:\Users\Sinah\OneDrive\Email attachments\Desktop\projects\voi-micromobility-dashboard\data\voi_rides_stockholm.csv"
)

csv_override = os.environ.get("VOI_RIDES_CSV")
if csv_override:
    csv_candidates = [Path(csv_override)]
else:
    csv_candidates = [
        Path("data") / DATA_FILENAME,
        Path("..") / "data" / DATA_FILENAME,
        LEGACY_DATA_PATH,
    ]

data_path = next((path for path in csv_candidates if path.is_file()), None)
if data_path is None:
    # Fail with an actionable message rather than a machine-specific path error.
    tried = ", ".join(str(path) for path in csv_candidates)
    raise FileNotFoundError(
        f"Could not find {DATA_FILENAME}. Tried: {tried}. "
        "Set the VOI_RIDES_CSV environment variable to the file's location."
    )

df = pd.read_csv(data_path)

# Preview
print(df.shape)
df.head()

(5000, 10)


Unnamed: 0,ride_id,start_time,duration_minutes,end_time,start_zone,end_zone,distance_km,user_type,day_of_week,hour
0,c05c91f9-5e61-4e99-80d1-04a88a1892c5,2024-06-01 00:00:00,9,2024-06-01 00:09:00,Bromma,Bromma,1.55,subscriber,Saturday,0
1,93ac60a1-83ee-4086-b120-442415c223c9,2024-06-01 00:01:00,22,2024-06-01 00:23:00,Enskede,Vasastan,3.95,subscriber,Saturday,0
2,ec883c19-440f-4967-9988-b04689b6c324,2024-06-01 00:02:00,17,2024-06-01 00:19:00,Norrmalm,Hägersten,2.29,subscriber,Saturday,0
3,2304f573-006d-4092-bf30-ef4ff1d124e0,2024-06-01 00:03:00,13,2024-06-01 00:16:00,Norrmalm,Södermalm,2.28,casual,Saturday,0
4,e05527b1-425f-40ee-a1ad-1d7027aa11f6,2024-06-01 00:04:00,10,2024-06-01 00:14:00,Östermalm,Vasastan,2.61,subscriber,Saturday,0


In [None]:
# --------------------------------------------------
# 2️⃣ Basic EDA
# --------------------------------------------------

# Check nulls
# df.info() prints on its own. The null counts and summary statistics are bare
# expressions, which Jupyter only renders when they are the last line of a
# cell, so they are shown explicitly with display() (the built-in that the
# IPython kernel provides in every notebook).
df.info()
display(df.isnull().sum())

# Basic stats
display(df.describe())

# Each chart below gets its own figure and axes. Through the implicit pyplot
# state, seaborn and Series.plot all draw on the current axes, so the three
# charts would be stacked on top of each other in a single figure.

# Trip duration distribution
fig, ax = plt.subplots()
sns.histplot(df['duration_minutes'], bins=20, kde=True, ax=ax)
ax.set_title("Trip Duration (minutes)")
plt.show()

# Rides by start zone
fig, ax = plt.subplots()
df['start_zone'].value_counts().plot(kind='barh', ax=ax)
ax.set_title("Trips by Start Zone")
ax.set_xlabel("Trip Count")
plt.show()

# Rides by hour of day
fig, ax = plt.subplots()
sns.countplot(data=df, x="hour", ax=ax)
ax.set_title("Trips by Hour")
plt.show()


In [4]:
# --------------------------------------------------
# 3️⃣ Feature Engineering
# --------------------------------------------------

# Trip speed estimate (km/h)
# Non-positive durations are masked to NaN before dividing; otherwise a
# 0-minute ride produces an infinite speed that distorts the averages below
# and is written into the exported CSV.
duration_hours = df['duration_minutes'].where(df['duration_minutes'] > 0) / 60
df['speed_kmph'] = df['distance_km'] / duration_hours

# Flag peak hours: start hour 7–9 or 16–19, both ends inclusive
# (i.e. rides starting 07:00–09:59 or 16:00–19:59).
# Series.between is inclusive by default, matching the original comparisons.
in_morning_peak = df['hour'].between(7, 9)
in_evening_peak = df['hour'].between(16, 19)
df['is_peak_hour'] = (in_morning_peak | in_evening_peak).astype('int64')

# Example: Average speed by user type (NaN speeds are skipped by mean())
df.groupby('user_type')['speed_kmph'].mean()


user_type
casual        12.038840
subscriber    12.008562
Name: speed_kmph, dtype: float64

In [5]:
# --------------------------------------------------
# 4️⃣ Save Clean Version for Tableau
# --------------------------------------------------

# Export the frame, including the engineered columns, as a CSV for Tableau.
# The row index is left out so the file contains only the data columns.
cleaned_csv_name = "voi_rides_cleaned.csv"
df.to_csv(path_or_buf=cleaned_csv_name, index=False)
print("Cleaned file saved as voi_rides_cleaned.csv")


Cleaned file saved as voi_rides_cleaned.csv
