# Transportation Hub Model Play

This file is a space for tinkering with prototype models using transportation hub data.

In [30]:
# Import libraries.
import numpy as np
import snowflake.connector
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Set global variables.
# Path to a text file whose first three lines are read as user, password, account.
CREDENTIALS_FILE: str = 'credentials.txt'
# Prefix placed in front of table names when building queries.
DATABASE_ROOT: str = 'TRANSPORTATION_HUB.HUB'

In [3]:
# Read in Snowflake credentials.
def _read_credentials(path):
    """Return (user, password, account) from the first three lines of `path`.

    Each line is stripped of surrounding whitespace before use.

    Raises:
        ValueError: If the file has fewer than three lines or any of the
            first three is blank. The message names the file only and never
            echoes the credential values.
    """
    with open(path, 'r') as f:
        lines = [line.strip() for line in f]
    # Fail early with a readable message instead of an IndexError (short file)
    # or an opaque login failure later (blank value).
    if len(lines) < 3 or not all(lines[:3]):
        raise ValueError(
            f"{path} must contain three non-empty lines: user, password, account."
        )
    return lines[0], lines[1], lines[2]

USER, PASSWORD, ACCOUNT = _read_credentials(CREDENTIALS_FILE)

In [4]:
# Open a connection and test that it works.
# `cs` is reused by a later cell to query TRIP_DELAYS, so neither the cursor
# nor the connection is closed here.
# NOTE(review): nothing visible in this notebook closes `con`/`cs` -- consider
# closing them once the last query has run.
con = snowflake.connector.connect(
    user=USER,
    password=PASSWORD,
    account=ACCOUNT,
)
cs = con.cursor()
# Smoke test: request the server version and print the first column of the row.
cs.execute("SELECT current_version()")
one_row = cs.fetchone()
print(one_row[0])

8.21.1


### "Dumb" Delay Predictions via Linear Regression

**Description**: 

This is an intentionally bad model: it attempts to predict a trip's delay with a single-variable linear regression whose only input is the record's timestamp.

**Notes**:
* When working with [time series](https://www.tableau.com/learn/articles/time-series-analysis) data, you should generally prefer time series analysis models such as [Exponential Smoothing](https://www.statsmodels.org/dev/generated/statsmodels.tsa.holtwinters.ExponentialSmoothing.html) over a plain regression like the one below.

In [5]:
# Pull the whole TRIP_DELAYS table into a Pandas DataFrame and show a preview.
trip_delays_query = f"SELECT * FROM {DATABASE_ROOT}.TRIP_DELAYS"
cs.execute(trip_delays_query)
trip_delay_data = cs.fetch_pandas_all()
trip_delay_data

Unnamed: 0,TRIP_DELAY_ID,YEAR,MONTH,DAY,DAY_OF_WEEK,TIME,TIMESTAMP,TRIP_ID,TRIP_STOP_SEQUENCE,STOP_LOCATION_ID,DELAY
0,358857,2023,9,17,Sun,04:24:08,2023-09-17 04:24:08,12769269,1,9848,0.0
1,358926,2023,9,17,Sun,04:45:36,2023-09-17 04:45:36,12769269,8,9834,54.0
2,359022,2023,9,17,Sun,05:25:28,2023-09-17 05:25:28,12769269,22,8336,150.0
3,358871,2023,9,17,Sun,04:28:17,2023-09-17 04:28:17,12769269,1,9848,0.0
4,358909,2023,9,17,Sun,04:37:57,2023-09-17 04:37:57,12769269,5,9838,-17.0
...,...,...,...,...,...,...,...,...,...,...,...
1442288,1442256,2023,9,25,Mon,07:20:48,2023-09-25 07:20:48,12810486,11,13713,286.0
1442289,1442257,2023,9,25,Mon,07:21:04,2023-09-25 07:21:04,12810486,11,13713,287.0
1442290,1442265,2023,9,25,Mon,07:24:52,2023-09-25 07:24:52,12810486,13,13715,282.0
1442291,1442271,2023,9,25,Mon,07:27:10,2023-09-25 07:27:10,12810486,14,13716,303.0


In [18]:
# Drop rows where the model input (TIMESTAMP) or the target (DELAY) is missing,
# then report how many rows that removed.
required_columns = ['TIMESTAMP', 'DELAY']
filtered_trip_delay_data = trip_delay_data.dropna(subset=required_columns)
print(f"{len(trip_delay_data['TIMESTAMP']) - len(filtered_trip_delay_data['DELAY'])} records removed.")

1837 records removed.


In [23]:
# Extract the model input (TIMESTAMP) and target (DELAY) as NumPy arrays,
# then display the input array.
features, response = (
    filtered_trip_delay_data[column].to_numpy()
    for column in ('TIMESTAMP', 'DELAY')
)
features

array(['2023-09-17T04:24:08.000000000', '2023-09-17T04:45:36.000000000',
       '2023-09-17T05:25:28.000000000', ...,
       '2023-09-25T07:24:52.000000000', '2023-09-25T07:27:10.000000000',
       '2023-09-26T07:25:24.000000000'], dtype='datetime64[ns]')

In [26]:
# Transform features into a numeric value for easier use.
def datetime_to_unix(input):
    """Convert NumPy datetime64 value(s) to whole seconds since the Unix epoch.

    Args:
        input: A NumPy datetime64 array or scalar. The name shadows the
            builtin but is kept so existing keyword callers still work.

    Returns:
        int64 value(s) with the same shape as `input`; any sub-second
        precision is dropped by the cast to second resolution.
    """
    # Ask for 64 bits explicitly: plain 'int' follows the platform C long on
    # NumPy < 2.0 (32-bit on Windows), which cannot hold post-2038 epochs.
    return input.astype('datetime64[s]').astype('int64')

# datetime_to_unix already operates on a whole array at once, so call it
# directly rather than routing the 1-D array through np.apply_along_axis.
transformed_features = datetime_to_unix(features)
transformed_features

array([1694924648, 1694925936, 1694928328, ..., 1695626692, 1695626830,
       1695713124])

In [31]:
# Hold out 20% of the records for testing; the fixed seed keeps the split repeatable.
features_train, features_test, response_train, response_test = train_test_split(
    transformed_features,
    response,
    test_size=0.2,
    random_state=777,
)

# The estimator is given a 2-D feature matrix, so view each 1-D array as one column.
train_matrix = features_train.reshape(-1, 1)
test_matrix = features_test.reshape(-1, 1)

# Fit on the training split, then predict delays for the held-out split.
model = LinearRegression()
model.fit(train_matrix, response_train)
preds = model.predict(test_matrix)

# Score the predictions: a lower MSE is better, and an R2 close to 1.0 is better.
mse = mean_squared_error(response_test, preds)
r2 = r2_score(response_test, preds)
print("Mean squared error: %.2f" % mse)
print("Coefficient of determination: %.2f" % r2)


Mean squared error: 50291.79
Coefficient of determination: 0.00
