## Setup

In [None]:
import pandas as pd
import tensorflow as tf
from keras import callbacks, optimizers, Sequential
from keras.layers import Dense, Dropout, Input # type: ignore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

: 

In [None]:
# Load the bike data from the local CSV for a first look at its columns.
# The commented-out line reads from a remote URL instead (presumably the same data; verify before switching).
# df = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bikes.csv')
df = pd.read_csv('bikes.csv')
df.head() 

## Data Transformation

In [None]:
def split_date(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``dteday`` string column with numeric ``month``, ``day`` and ``year``.

    ``dteday`` is parsed as ``month/day/year`` (digits separated by slashes).

    Args:
        df: Frame containing a string ``dteday`` column.

    Returns:
        A new DataFrame with ``month``, ``day`` and ``year`` columns added and
        ``dteday`` removed. The frame passed in is left unmodified.

    Raises:
        KeyError: If ``df`` has no ``dteday`` column.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    out = df.copy()
    # Extract month, day and year into separate columns (still strings here).
    out[['month', 'day', 'year']] = out['dteday'].str.extract(r'(\d*)/(\d*)/(\d*)')
    # Convert the extracted pieces to numbers (rather than strings).
    for part in ('month', 'day', 'year'):
        out[part] = pd.to_numeric(out[part])
    # The original date string is no longer needed once it has been split.
    return out.drop(columns=['dteday'])

# df = split_date(df)
# df.head(220) # confirm that one- and two-digit months and days are handled correctly
# df.info()

In [None]:
def convert_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the ``season`` and ``weathersit`` code columns.

    The integer codes are first mapped to readable labels. Season indicators
    are named after the label alone (``winter``, ``spring``, ...), while
    weather indicators are prefixed (``weather_none``, ``weather_light``, ...).
    Only labels that actually occur in ``df`` produce an indicator column.

    Args:
        df: Frame containing integer-coded ``season`` and ``weathersit`` columns.

    Returns:
        A new DataFrame in which ``season`` and ``weathersit`` are replaced by
        integer 0/1 indicator columns. The frame passed in is left unmodified.

    Raises:
        KeyError: If either ``season`` or ``weathersit`` is missing.
    """
    weather_labels = {1: 'none', 2: 'light', 3: 'moderate', 4: 'heavy'}
    season_labels = {1: 'winter', 2: 'spring', 3: 'summer', 4: 'fall'}

    # Work on a copy: overwriting the codes in the caller's frame would make a
    # second call on the same frame map the label strings to NaN.
    out = df.copy()
    out['weathersit'] = out['weathersit'].map(weather_labels)
    out['season'] = out['season'].map(season_labels)

    # One-hot encode the categorical variables.
    return pd.get_dummies(
        out,
        columns=['season', 'weathersit'],
        prefix=['', 'weather'],
        prefix_sep=['', '_'],
        dtype=int,
    )

# df = convert_categorical(df)
# df.head()

In [None]:
# Rebuild the working frame from the raw CSV: split the date column first,
# then one-hot encode the categorical codes.
df = convert_categorical(split_date(pd.read_csv('bikes.csv')))
# df.info()
df.head()

## Daylight Savings Exploration

While exploring the data, I found that the 2 a.m. hour has 13 fewer entries than each of the other hours, which seemed odd. On closer examination, the "missing" 2 a.m. entries all fall in March, one per year, which led me to suspect Daylight Saving Time. That turned out to be correct: Daylight Saving Time began on March 13, 2011, and that day has no 2 a.m. entry.

This does, however, raise the question of why there isn't a *duplicate* entry each November, when the clocks fall back and the 1 a.m. hour repeats...

In [None]:
df['hr'].value_counts() # 2 a.m. has 13 fewer entries than the other hours

In [None]:
# Alternative month count computed from the first and last dates (kept for reference):
# max_year = df['year'].max()
# min_year = df['year'].min()
# max_month = df[df['year'] == max_year]['month'].max()
# min_month = df[df['year'] == min_year]['month'].min()
# num_months = (max_year - min_year) * 12 + max_month - min_month + 1

# One row per calendar month: the midnight entry on the 1st of each month.
is_first_midnight = (df['day'] == 1) & (df['hr'] == 0.0)
one_day_per_month = df.loc[is_first_midnight, ['month', 'year']].copy()
od = one_day_per_month
num_months = len(od)
num_years = od['year'].nunique()

# Every non-February month has at least 29 days; February only in years divisible by 4.
num_29 = len(od[(od['month'] != 2) | (od['year'] % 4 == 0)])
# Every month except February has at least 30 days.
num_30 = len(od[od['month'] != 2])
# Months other than February, April, June, September and November have 31 days.
num_31 = len(od[~od['month'].isin([2, 4, 6, 9, 11])])

print(f'{num_months} total months in the dataset.') # 154
print(f'{num_29} have at least 29 days.') # 144
print(f'{num_30} have at least 30 days.') # 141
print(f'{num_31} have 31 days.') # 90

In [None]:
# Rows for the 1 a.m. hour only.
just_1am = df[df['hr'] == 1]
# just_1am['day'].value_counts()
# This has the expected number of entries.

# Keep March 8-14 (presumably the window in which the start of daylight saving can fall).
in_march_window = just_1am['day'].isin(list(range(8, 15))) & (just_1am['month'] == 3)
just_1am_filtered = just_1am[in_march_window]
just_1am_filtered['day'].value_counts()

In [None]:
# Rows for the 2 a.m. hour only.
just_2am = df[df['hr'] == 2]
# just_2am['day'].value_counts()

# Keep March 8-14 so the per-day counts can be compared with the 1 a.m. counts.
in_march_window_2am = just_2am['day'].isin(list(range(8, 15))) & (just_2am['month'] == 3)
just_2am_filtered = just_2am[in_march_window_2am]
just_2am_filtered['day'].value_counts()
# just_2am_filtered[['month', 'day', 'year']].to_csv('just_2am_filtered.csv', index=False)

In [None]:
# 2 a.m. rows for March 8-14, reduced to the day and year columns.
dmy_2am = just_2am[just_2am['day'].isin(list(range(8, 15))) & (just_2am['month'] == 3)]
dmy_2am = dmy_2am[['day', 'year']]
# dmy_2am.head()

# For every year in the data, record the day in the March 8-14 window that has
# no 2 a.m. row. If several days are missing in a year, the last one wins.
daylight_savings_days = {}
for year in range(df['year'].min(), df['year'].max() + 1):
    # Look at the day column only: testing membership in the whole (day, year)
    # array would also treat a day as present whenever it equals a year value.
    # Built once per year rather than once per candidate day.
    days_present = set(dmy_2am.loc[dmy_2am['year'] == year, 'day'])
    for day in range(8, 15):
        if day not in days_present:
            daylight_savings_days[year] = day

for year, day in daylight_savings_days.items():
    print(f'{year}: March {day}')

# These are all correct.

## Model Generation

In [None]:
# Fresh copy of the data for modelling: split the date column, then one-hot
# encode the categorical codes.
bikes = convert_categorical(split_date(pd.read_csv('bikes.csv')))
# bikes.info()
bikes.head()

In [None]:
# The two rider counts are the prediction targets; every other column is a feature.
y = bikes[['casual', 'registered']]
X = bikes.drop(columns=list(y.columns))
# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Fit the min-max scaler on the training features only, then apply the same
# scaling to the test features.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape)

In [None]:
# Two 64-unit ReLU hidden layers, each followed by 50% dropout, and a linear
# two-unit output layer (one unit per target column).
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2),
])

In [None]:
# Adam with its default settings; train on mean squared error and also report
# mean absolute percentage error.
opt = optimizers.Adam()
model.compile(
    optimizer=opt,
    loss='mean_squared_error',
    metrics=['mean_absolute_percentage_error'],
)

In [None]:
# Report how many GPUs TensorFlow can see, using the stable (non-experimental)
# device-listing API that the rest of the notebook already uses.
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Show the physical and logical GPU devices reported by TensorFlow.
print(tf.config.list_physical_devices('GPU'))
print(tf.config.list_logical_devices('GPU'))

In [None]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

In [None]:
# Stop once validation loss has gone 30 epochs without improving, and roll the
# model back to the weights from the best epoch. Without restore_best_weights
# the model would keep the weights from the final (already degraded) epoch.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=30,
    restore_best_weights=True,
)

# epochs=2000 is only an upper bound; early stopping decides when training ends.
# validation_split holds out 35% of the training rows for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=2000,
    validation_split=.35,
    batch_size=20,
    callbacks=[early_stop],
    shuffle=False,
)

# Per-epoch loss and metric values as a DataFrame for inspection.
hist = pd.DataFrame(history.history)

## Mini Holdout

In [None]:
# Load the mini holdout set and run it through the same date splitting and
# one-hot encoding as the training data.
mini = convert_categorical(split_date(pd.read_csv('bikes_holdout_mini.csv')))
# mini.info()
mini.head()

In [None]:
# Align the holdout columns with the training feature columns: columns of X
# that are absent from the holdout are added and filled with 0, and holdout
# columns that are not in X are dropped.
mini = mini.reindex(columns = X.columns, fill_value=0)
# Reuse the scaler that was fitted on the training features.
mini_X = scaler.transform(mini)