# AI Traffic Prediction Model

### 1. Data Preprocessing & Feature Engineering

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle

In [None]:
# Load the dataset
df = pd.read_csv('traffic.csv')

# Convert to datetime and extract time-based features
df['DateTime'] = pd.to_datetime(df['DateTime'])

# --- NEW: ONE-HOT ENCODING ---
# Convert the 'Junction' column into separate indicator columns
junction_dummies = pd.get_dummies(df['Junction'], prefix='Junction')
df = pd.concat([df, junction_dummies], axis=1)
# We no longer need the original 'Junction' column
df = df.drop(columns=['Junction'])

df['day_of_week'] = df['DateTime'].dt.dayofweek # Monday=0, Sunday=6
df['hour_of_day'] = df['DateTime'].dt.hour
df['is_weekend'] = ((df['day_of_week'] == 5) | (df['day_of_week'] == 6)).astype(int)

# Set DateTime as the index to work with time-series features
df = df.set_index('DateTime').sort_index()

In [4]:
# Create lag and rolling features based on hourly data
df['vehicles_1_hour_ago'] = df['Vehicles'].shift(1)
df['vehicles_2_hours_ago'] = df['Vehicles'].shift(2)
df['vehicles_rolling_mean_3h'] = df['Vehicles'].rolling(window=3).mean()

# Drop rows with NaN values created by shifts and rolling windows
df.dropna(inplace=True)

# Drop columns that are no longer needed for the model
df = df.drop(columns=['ID'])

print("Feature engineering complete. Data head:")
df.head()

Feature engineering complete. Data head:


Unnamed: 0_level_0,Junction,Vehicles,day_of_week,hour_of_day,is_weekend,vehicles_1_hour_ago,vehicles_2_hours_ago,vehicles_rolling_mean_3h
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-11-01 00:00:00,3,9,6,0,1,6.0,15.0,10.0
2015-11-01 01:00:00,3,7,6,1,1,9.0,6.0,7.333333
2015-11-01 01:00:00,1,13,6,1,1,7.0,9.0,9.666667
2015-11-01 01:00:00,2,6,6,1,1,13.0,7.0,8.666667
2015-11-01 02:00:00,2,5,6,2,1,6.0,13.0,8.0


### 2. Train/Test Split

In [5]:
# The target is what you want to predict
target_column = 'Vehicles' 
features = [col for col in df.columns if col != target_column]

X = df[features]
y = df[target_column]

# Use the last 10% of the data for testing to simulate a real-world scenario
cutoff_index = int(len(df) * 0.90)
X_train, X_test = X[:cutoff_index], X[cutoff_index:]
y_train, y_test = y[:cutoff_index], y[cutoff_index:]

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 43306
Test set size: 4812


### 3. Model Training

In [6]:
# Initialize the LightGBM REGRESSOR for predicting a number
model = lgb.LGBMRegressor(random_state=42)

# Train the model on your actual traffic data
print("--- Training LightGBM Regressor ---")
model.fit(X_train, y_train)
print("Training complete.")

--- Training LightGBM Regressor ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002770 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 490
[LightGBM] [Info] Number of data points in the train set: 43306, number of used features: 7
[LightGBM] [Info] Start training from score 21.961922
Training complete.


### 4. Model Evaluation

In [7]:
# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate error metrics
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f"\n--- Model Performance ---")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


--- Model Performance ---
Mean Absolute Error (MAE): 2.82
Root Mean Squared Error (RMSE): 4.43


### 5. Export the Model Using Pickle

In [8]:
# Define the filename for your model
model_filename = "traffic_model.pkl"

# Open a file in write-binary mode and save the model
with open(model_filename, "wb") as file:
    pickle.dump(model, file)

print(f"\nModel successfully saved to '{model_filename}'")


Model successfully saved to 'traffic_model.pkl'


In [9]:
# --- EXPORT THE TEST DATA --- 
test_data_filename = "X_test_data.csv"
# We save the index because it contains the crucial DateTime information
X_test.to_csv(test_data_filename, index=True)

print(f"Test data successfully saved to '{test_data_filename}'")

Test data successfully saved to 'X_test_data.csv'
