TRAINING SCRIPT 

In [15]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

# -------------------------
# 1. Load your dataset
# -------------------------
df = pd.read_csv("final.csv")

# -------------------------
# 2. Define features and target
# -------------------------
feature_cols = [
    "SPEED", "TRAVEL_TIME", "hour", "day_of_week",
    "lag_1h","lag_2h","lag_3h","lag_4h","lag_5h","lag_6h",
    "lag_7h","lag_8h","lag_9h","lag_10h","lag_11h","lag_12h",
    "free_flow_speed", "congestion_ratio",
    "BOROUGH_Brooklyn","BOROUGH_Manhattan","BOROUGH_Queens","BOROUGH_Staten Island"
]
X = df[feature_cols]
y = df["future_label"]  # predicting 1h later traffic

# -------------------------
# 3. Chronological split (80/20)
# -------------------------
split_index = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# -------------------------
# 4. Train smaller Random Forest
# -------------------------
clf = RandomForestClassifier(
    n_estimators=50,      # smaller model
    max_depth=15,         # limit depth
    min_samples_leaf=20,  # avoid overfitting
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

print("\n🚀 Training model...")
clf.fit(X_train, y_train)

# -------------------------
# 5. Evaluate
# -------------------------
y_pred = clf.predict(X_test)
print("\n✅ Model training complete!\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# -------------------------
# 6. Save model with lz4 fallback
# -------------------------
def save_model(model, filename="rf_traffic_model_final.pkl"):
    try:
        joblib.dump(model, filename, compress=("lz4", 3))
        print(f"\n✅ Model saved with lz4 compression: {filename}")
    except Exception as e:
        print(f"\n❌ lz4 compression failed: {e}")
        joblib.dump(model, filename, compress=3)
        print(f"✅ Model saved with zlib compression: {filename}")

    # Reload test
    loaded = joblib.load(filename)
    print("✅ Model reloaded successfully!")

    # Print model size
    size_mb = os.path.getsize(filename) / (1024 * 1024)
    print(f"📦 Saved model size: {size_mb:.2f} MB")

    return loaded

loaded_model = save_model(clf, "rf_traffic_model_final.pkl")



🚀 Training model...

✅ Model training complete!

Confusion Matrix:
[[1098606    5196  157495]
 [    337    2883    3354]
 [ 115819   23504  515357]]

Classification Report:
              precision    recall  f1-score   support

       Heavy       0.90      0.87      0.89   1261297
       Light       0.09      0.44      0.15      6574
    Moderate       0.76      0.79      0.77    654680

    accuracy                           0.84   1922551
   macro avg       0.59      0.70      0.60   1922551
weighted avg       0.85      0.84      0.85   1922551


❌ lz4 compression failed: LZ4 is not installed. Install it with pip: https://python-lz4.readthedocs.io/
✅ Model saved with zlib compression: rf_traffic_model_final.pkl
✅ Model reloaded successfully!
📦 Saved model size: 49.45 MB


MODEL OUTPUT

In [None]:
import pandas as pd

# Feature columns same as training
feature_cols = [
    "SPEED", "TRAVEL_TIME", "hour", "day_of_week",
    "lag_1h","lag_2h","lag_3h","lag_4h","lag_5h","lag_6h",
    "lag_7h","lag_8h","lag_9h","lag_10h","lag_11h","lag_12h",
    "free_flow_speed", "congestion_ratio",
    "BOROUGH_Brooklyn","BOROUGH_Manhattan","BOROUGH_Queens","BOROUGH_Staten Island"
]

# Second synthetic "Light" traffic row
light_input_2 = pd.DataFrame([{
    "SPEED": 60.0,
    "TRAVEL_TIME": 100,
    "hour": 11,
    "day_of_week": 6,   # weekend
    "lag_1h": 58.0,
    "lag_2h": 59.5,
    "lag_3h": 60.0,
    "lag_4h": 57.8,
    "lag_5h": 59.0,
    "lag_6h": 58.5,
    "lag_7h": 59.5,
    "lag_8h": 60.0,
    "lag_9h": 58.8,
    "lag_10h": 59.0,
    "lag_11h": 60.0,
    "lag_12h": 59.2,
    "free_flow_speed": 65.0,
    "congestion_ratio": 0.02,
    "BOROUGH_Brooklyn": 0,
    "BOROUGH_Manhattan": 0,
    "BOROUGH_Queens": 0,
    "BOROUGH_Staten Island": 1
}])

# Ensure columns match
light_input_2 = light_input_2[feature_cols]

# Predict (assuming 'model' is already loaded)
y_pred = model.predict(light_input_2)[0]
print("🚦 Predicted traffic label:", y_pred)


🚦 Predicted traffic label: Heavy


In [11]:
import joblib
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# --- dummy model just for testing ---
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X, y)

def safe_save_model(model, filename="model.pkl"):
    compressors = [("lz4", 3), ("zstd", 3), 3]  # try lz4 → zstd → zlib
    for comp in compressors:
        try:
            joblib.dump(model, filename, compress=comp)
            loaded = joblib.load(filename)
            print(f"✅ Successfully saved & loaded with compression={comp}")
            return
        except Exception as e:
            print(f"❌ Failed with {comp}: {e}")
    print("⚠️ Could not save with compression. Try without compress.")

# Run the check
safe_save_model(clf, "rf_traffic_model_test.pkl")


❌ Failed with ('lz4', 3): LZ4 is not installed. Install it with pip: https://python-lz4.readthedocs.io/
❌ Failed with ('zstd', 3): Non valid compression method given: "zstd". Possible values are {'zlib': <joblib.compressor.ZlibCompressorWrapper object at 0x00000209D3CF0940>, 'gzip': <joblib.compressor.GzipCompressorWrapper object at 0x00000209D3CF09A0>, 'bz2': <joblib.compressor.BZ2CompressorWrapper object at 0x00000209D3CF16F0>, 'lzma': <joblib.compressor.LZMACompressorWrapper object at 0x00000209D3CF17E0>, 'xz': <joblib.compressor.XZCompressorWrapper object at 0x00000209D3CF18D0>, 'lz4': <joblib.compressor.LZ4CompressorWrapper object at 0x00000209D3CF1E10>}.
✅ Successfully saved & loaded with compression=3
