In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 

In [2]:
#Defining Datasets and Global Variables

zip_code = int(input('Please Enter the Zip Code: '))

df1 = pd.read_csv('uszips_2.csv')

df2 = pd.read_csv('us_congestion_2016_2022_sample_2m.csv', usecols=['Severity', 'Start_Lat', 'Start_Lng', 'StartTime', 'EndTime', 'Distance(mi)', 'DelayFromTypicalTraffic(mins)', 'DelayFromFreeFlowSpeed(mins)'])
            
df2['Year'] = df2['StartTime'].str.slice(0, 4).astype('Int64')  # allows NaNs

df2.drop(['StartTime', 'EndTime'], axis=1, inplace=True)

df2.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),DelayFromTypicalTraffic(mins),DelayFromFreeFlowSpeed(mins),Year
0,2,39.191032,-120.81974,1.4,2.58,2.6,2016
1,0,41.736015,-87.721565,0.73,0.42,1.0,2018
2,0,32.519043,-93.741096,1.8,1.0,2.0,2021
3,0,40.730564,-74.001709,1.42,1.0,2.0,2020
4,1,33.758331,-118.238533,2.6,4.9,6.92,2017


In [3]:
#Zip Code to Long/Lat Method



def zip_to_long_lat(zipcode):
    match = df1[df1['zip'] == zipcode]
    if match.empty:
        print(f"No match found for zip code {zipcode}")
        return
    lat = match['lat'].values[0]
    long = match['lng'].values[0]
    coord = (lat, long)
    print(f"The city for zip code {zipcode} is: {coord}")
    return coord

In [4]:
X = df2.drop(['DelayFromTypicalTraffic(mins)'], axis=1)
y = df2['DelayFromTypicalTraffic(mins)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train.astype('float32'), label=y_train)
test_data = lgb.Dataset(X_test.astype('float32'), label=y_test, reference=train_data)

In [5]:
# Train the model

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l1',
    'learning_rate': 0.005,
    'num_leaves': 128,
    'max_depth': 8,
    'n_estimators': 10000,
    'max_bin': 512
}

num_round = 10000 
bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])





[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003956 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2062
[LightGBM] [Info] Number of data points in the train set: 1600000, number of used features: 6
[LightGBM] [Info] Start training from score 2.814723


In [6]:
# Predict on the test set
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Evaluate using RMSE
from sklearn.metrics import mean_squared_error
import numpy as np

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse}")

RMSE: 1.2294055219902136
