# Machine Learning

### I. Import Libraries

In [38]:
# Import libraries
import numpy as np
import pandas as pd
import requests
import sklearn
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import json


from pytz import timezone
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from datetime import datetime, timedelta

%matplotlib inline 

print("Library verions:")
print("- numpy:", np.__version__)
print("- pandas:", pd.__version__)
print("- requests:", requests.__version__)
print("- sklearn:", sklearn.__version__)
print("- seaborn:", sns.__version__)

Library verions:
- numpy: 2.2.0
- pandas: 2.3.2
- requests: 2.32.5
- sklearn: 1.7.2
- seaborn: 0.13.2


### II. Data Collection

In [5]:
# Set API key (for traffic data and weather data)
TOMTOM_API_KEY = os.environ.get("TOMTOM_API_KEY")
OPENWEATHER_API_KEY = os.environ.get("OPENWEATHER_API_KEY")
# Coordinates for key locations (latitude, longitude) in HCMC
hcmc_locations = {
    "District_1": {"lat": 10.7757, "lon": 106.7009},
    "Thu_Thiem": {"lat": 10.7835, "lon": 106.7215},
    "Tan_Son_Nhat": {"lat": 10.8181, "lon": 106.6519},
    "Binh_Thanh": {"lat": 10.8106, "lon": 106.7091},
    "Phu_My_Hung": {"lat": 10.7272, "lon": 106.7057}
}

hcmc_tz = timezone("Asia/Ho_Chi_Minh")

# Set time range for data generation
start_date = datetime(2025, 9, 16, 22, 0, 0, tzinfo=hcmc_tz) 
end_date = datetime(2025, 9, 17, 0, 0, 0, tzinfo=hcmc_tz) # Sep 17, 2025

# Generate list of hourly timestamps
timestamps = []
current_time = start_date
while current_time <= end_date:
    timestamps.append(current_time)
    current_time += timedelta(hours=1)

print("HCMC Timezone:", hcmc_tz)
print("Start date (HCMC):", start_date.strftime("%Y-%m-%d %H:%M:%S %Z%z"))
print("End date (HCMC):", end_date.strftime("%Y-%m-%d %H:%M:%S %Z%z"))
print(f"Collecting data for {len(timestamps)} hourly intervals...")

HCMC Timezone: Asia/Ho_Chi_Minh
Start date (HCMC): 2025-09-16 22:00:00 LMT+0707
End date (HCMC): 2025-09-17 00:00:00 LMT+0707
Collecting data for 3 hourly intervals...


In [None]:
# Function to get traffic data from TomTom API
def get_traffic_data(lat, lon, api_key, timestamp):
    url = "https://api.tomtom.com/traffic/services/4/flowSegmentData/absolute/10/json"
    params = {
        'point': f"{lat},{lon}",
        'unit': 'KMPH',
        'key': api_key,
        'zoom': 12
    }
    try:
        response = requests.get(url, params=params)
        data = response.json()
        return data
    except Exception as e:
        print(f"Error fetching traffic data for {lat},{lon} at {timestamp}: {e}")
        return None

# Function to get weather data from OpenWeatherMap API
def get_weather_data(lat, lon, api_key, timestamp):
    url = "http://api.openweathermap.org/data/2.5/weather"
    params = {
        'lat': lat,
        'lon': lon,
        'appid': api_key,
        'units': 'metric'
    }
    try:
        response = requests.get(url, params=params)
        data = response.json()
        return data
    except Exception as e:
        print(f"Error fetching weather data for {lat},{lon} at {timestamp}: {e}")
        return None


### III. Process and merge datasets (weather and traffic data)

In [40]:
# Function to process traffic data
def process_traffic_data(traffic_data_dict, location_name, timestamp):
    processed_data = []
    if traffic_data_dict and 'flowSegmentData' in traffic_data_dict:
        segment = traffic_data_dict['flowSegmentData']
        processed_data.append({
            'Location': location_name,
            'Timestamp': timestamp,
            'Current_Speed': segment.get('currentSpeed', 0),
            'Free_Flow_Speed': segment.get('freeFlowSpeed', 0),
            'Confidence': segment.get('confidence', 0),
            'Congestion_Level': segment.get('currentTravelTime', 0) / segment.get('freeFlowTravelTime', 1) * 100,
            'Road_Closure': segment.get('roadClosure', False)
        })
    return processed_data

# Function to process weather data
def process_weather_data(weather_data_dict, timestamp):
    if weather_data_dict and 'main' in weather_data_dict:
        return {
            'Timestamp': timestamp,
            'Temperature': weather_data_dict['main'].get('temp', 0),
            'Humidity': weather_data_dict['main'].get('humidity', 0),
            'Wind_Speed': weather_data_dict['wind'].get('speed', 0) if 'wind' in weather_data_dict else 0,
            'Rainfall': weather_data_dict['rain']['1h'] if 'rain' in weather_data_dict and '1h' in weather_data_dict['rain'] else 0
        }
    return None

# Collect and process data for each hour
all_traffic_data = []
all_weather_data = []

for timestamp in timestamps:
    print(f"\nCollecting data for {timestamp.strftime('%Y-%m-%d %H:%M:%S %Z')}...")
    
    # Collect traffic data
    traffic_data = {}
    for name, coords in hcmc_locations.items():
        # print(f"Fetching traffic data for {name}...")
        data = get_traffic_data(coords["lat"], coords["lon"], TOMTOM_API_KEY, timestamp)
        if data:
            traffic_data[name] = data
        time.sleep(1) 
    
    # Collect weather data
    weather_data = {}
    for name, coords in hcmc_locations.items():
        # print(f"Fetching weather data for {name}...")
        data = get_weather_data(coords["lat"], coords["lon"], OPENWEATHER_API_KEY, timestamp)
        if data:
            weather_data[name] = data
        time.sleep(1) 
    
    # Process traffic data
    for name, data in traffic_data.items():
        processed_traffic = process_traffic_data(data, name, timestamp)
        all_traffic_data.extend(processed_traffic)
    
    # Process weather data
    for name, data in weather_data.items():
        processed_weather = process_weather_data(data, timestamp)
        if processed_weather:
            processed_weather['Location'] = name
            all_weather_data.append(processed_weather)

# Create DataFrames
traffic_df = pd.DataFrame(all_traffic_data)
weather_df = pd.DataFrame(all_weather_data)

# Merge datasets on Timestamp and Location
if not traffic_df.empty and not weather_df.empty:
    merged_df = pd.merge(
        traffic_df,
        weather_df,
        on=['Timestamp', 'Location'],
        how='left'
    )
else:
    merged_df = traffic_df if not traffic_df.empty else weather_df

# Add time-based features
if not merged_df.empty:
    merged_df['Hour'] = merged_df['Timestamp'].dt.hour
    merged_df['Month'] = merged_df['Timestamp'].dt.month
    merged_df['Day_of_Week'] = merged_df['Timestamp'].dt.dayofweek
    merged_df['Is_Weekend'] = (merged_df['Day_of_Week'] >= 5).astype(int)
    merged_df['Is_Rush_Hour'] = (
        ((merged_df['Hour'] >= 7) & (merged_df['Hour'] <= 9)) |
        ((merged_df['Hour'] >= 16) & (merged_df['Hour'] <= 19))
    ).astype(int)

# Display the data
print("\nFull Hourly Data:")
print(merged_df.to_string(index=False))
merged_df.sample(15)


Collecting data for 2025-09-16 22:00:00 LMT...

Collecting data for 2025-09-16 23:00:00 LMT...

Collecting data for 2025-09-17 00:00:00 LMT...

Full Hourly Data:
    Location                 Timestamp  Current_Speed  Free_Flow_Speed  Confidence  Congestion_Level  Road_Closure  Temperature  Humidity  Wind_Speed  Rainfall  Hour  Month  Day_of_Week  Is_Weekend  Is_Rush_Hour
  District_1 2025-09-16 21:53:00+07:00             26               26           1             100.0         False        26.10        84        0.00         0    21      9            1           0             0
   Thu_Thiem 2025-09-16 21:53:00+07:00             40               40           1             100.0         False        26.16        84        0.00         0    21      9            1           0             0
Tan_Son_Nhat 2025-09-16 21:53:00+07:00             31               31           1             100.0         False        25.01        94        0.45         0    21      9            1           0    

Unnamed: 0,Location,Timestamp,Current_Speed,Free_Flow_Speed,Confidence,Congestion_Level,Road_Closure,Temperature,Humidity,Wind_Speed,Rainfall,Hour,Month,Day_of_Week,Is_Weekend,Is_Rush_Hour
5,District_1,2025-09-16 22:53:00+07:00,26,26,1,100.0,False,26.1,84,0.0,0,22,9,1,0,0
8,Binh_Thanh,2025-09-16 22:53:00+07:00,33,33,1,100.0,False,26.16,84,0.0,0,22,9,1,0,0
10,District_1,2025-09-16 23:53:00+07:00,26,26,1,100.0,False,26.1,84,0.0,0,23,9,1,0,0
12,Tan_Son_Nhat,2025-09-16 23:53:00+07:00,31,31,1,100.0,False,25.01,94,0.45,0,23,9,1,0,0
2,Tan_Son_Nhat,2025-09-16 21:53:00+07:00,31,31,1,100.0,False,25.01,94,0.45,0,21,9,1,0,0
9,Phu_My_Hung,2025-09-16 22:53:00+07:00,42,42,1,100.0,False,26.17,84,0.0,0,22,9,1,0,0
0,District_1,2025-09-16 21:53:00+07:00,26,26,1,100.0,False,26.1,84,0.0,0,21,9,1,0,0
7,Tan_Son_Nhat,2025-09-16 22:53:00+07:00,31,31,1,100.0,False,25.01,94,0.45,0,22,9,1,0,0
1,Thu_Thiem,2025-09-16 21:53:00+07:00,40,40,1,100.0,False,26.16,84,0.0,0,21,9,1,0,0
11,Thu_Thiem,2025-09-16 23:53:00+07:00,40,40,1,100.0,False,26.16,84,0.0,0,23,9,1,0,0


### IV. Machine Learning Models 
1. KNN Regression
2. Linear Regression    
3. Decision Tree Regression

In [None]:
# 1. Prepare data for machine learning
# Define features and target variable
features = [
    'Free_Flow_Speed', 'Confidence', 'Congestion_Level', 'Road_Closure',
    'Temperature', 'Humidity', 'Wind_Speed', 'Rainfall',
    'Hour', 'Month', 'Day_of_Week', 'Is_Weekend', 'Is_Rush_Hour'
]
target = 'Current_Speed'    
X = merged_df[features] 
y = merged_df[target]   

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"\nTraining samples: {len(X_train)}, Testing samples: {len(X_test)}")

# 3. Scale features for KNN 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# MODEL 1: K-Nearest Neighbors Regression
print("\nTraining KNN Regression model...")
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)  
knn_predictions = knn_model.predict(X_test_scaled)  
knn_mae = mean_absolute_error(y_test, knn_predictions)
knn_mse = mean_squared_error(y_test, knn_predictions)   
knn_r2 = r2_score(y_test, knn_predictions)
print(f"KNN MAE: {knn_mae}, MQE: {knn_mse} R²: {knn_r2}")

# MODEL 2: Linear Regression
print("\nTraining Linear Regression model...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_mqe = mean_squared_error(y_test, lr_predictions) 
lr_r2 = r2_score(y_test, lr_predictions)
print(f"Linear Regression MAE: {lr_mae}, MQE: {lr_mqe} R²: {lr_r2}")

# MODEL 3: Decision Tree Regression
print("\nTraining Decision Tree Regression model...")
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)
dt_mae = mean_absolute_error(y_test, dt_predictions)
dt_mqe = mean_squared_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)
print(f"Decision Tree MAE: {dt_mae}, MQE: {dt_mqe} R²: {dt_r2}")

# Compare model performances
comparison_df = pd.DataFrame({
    'Model': ['KNN Regression', 'Linear Regression', 'Decision Tree Regression'],
    'MAE': [knn_mae, lr_mae, dt_mae],
    'MSE': [knn_mse, lr_mqe, dt_mqe],
    'R²': [knn_r2, lr_r2, dt_r2]
})  
print("="*50)
print(comparison_df.to_string(index=False))




Training samples: 10, Testing samples: 5

Training KNN Regression model...
KNN MAE: 6.7200000000000015, MQE: 58.88000000000002 R²: 0.4411541381928623

Training Linear Regression model...
Linear Regression MAE: 0.08322343530367107, MQE: 0.01456059165051849 R²: 0.99986180152192

Training Decision Tree Regression model...
Decision Tree MAE: 0.2, MQE: 0.2 R²: 0.9981017463933182
                   Model      MAE       MSE       R²
          KNN Regression 6.720000 58.880000 0.441154
       Linear Regression 0.083223  0.014561 0.999862
Decision Tree Regression 0.200000  0.200000 0.998102


### Conclusion
Linear regression give a near-perfect result with MAE and MQE nearby 0.0 and R² close with 1.0

KNN is not suitable for my data pattern