In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [4]:
# Read the raw measurements table from disk.
dataset_path = 'newoutput1.csv'
df = pd.read_csv(dataset_path)

# Per-column count of missing entries, captured before any imputation.
missing_values = df.isna().sum()

# Fill gaps in numeric columns with that column's mean. Non-numeric columns
# have no entry in the mean Series and are therefore left untouched.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [5]:
# Encode the categorical 'Ethnicity' column: first to integer codes (stored
# back into df['Ethnicity']), then to one indicator column per code, named
# Ethnicity_0, Ethnicity_1, ...
label_encoder = LabelEncoder()
df['Ethnicity'] = label_encoder.fit_transform(df['Ethnicity'])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current keyword first
# and fall back for older installations; either way the encoder yields a dense
# array.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
ethnicity_encoded = encoder.fit_transform(df[['Ethnicity']])

# Reuse df's index so the concat below lines the indicator rows up with the
# rows they were computed from, whatever index df carries.
ethnicity_encoded_df = pd.DataFrame(
    ethnicity_encoded,
    columns=[f'Ethnicity_{i}' for i in range(ethnicity_encoded.shape[1])],
    index=df.index,
)

# Drop indicator columns left over from an earlier execution of this cell so
# that re-running it does not append duplicate Ethnicity_* columns.
df = pd.concat(
    [df.drop(columns=ethnicity_encoded_df.columns, errors='ignore'), ethnicity_encoded_df],
    axis=1,
)



In [6]:
def time_to_minutes(time_str):
    """Convert an 'H:MM' / 'HH:MM' clock string to minutes since midnight.

    Parameters
    ----------
    time_str : object
        Value to parse. It is passed through ``str()`` first, so non-string
        inputs (``None``, NaN, numbers) are accepted and simply fail to parse.

    Returns
    -------
    int
        ``hours * 60 + minutes`` when the text consists of exactly two
        integer fields separated by ``':'``; ``0`` for anything else.
    """
    try:
        hours, minutes = map(int, str(time_str).split(':'))
    except ValueError:
        # ValueError covers both failure modes of the line above: int() on a
        # non-numeric field, and unpacking when there are not exactly two
        # fields. Catching only this (instead of a bare `except`) keeps the
        # best-effort fallback while letting KeyboardInterrupt / SystemExit
        # propagate.
        return 0
    return hours * 60 + minutes

In [7]:
# Replace the clock-time text in 'Drink Start Time' with minutes since midnight.
# NOTE: this overwrites the column, so the cell is not idempotent: on a second
# run the integer results are fed back through time_to_minutes, which returns
# 0 for values without a ':' separator.
start_minutes = df['Drink Start Time'].apply(time_to_minutes)
df['Drink Start Time'] = start_minutes

# Body-mass index from imperial units: weight (lb) / height (in)^2 * 703,
# where 703 is the usual imperial-units BMI scaling factor.
height_inches = (df['Height (ft)'] * 12) + df['Height (in)']
df['BMI'] = (df['Weight (lbs)'] / height_inches ** 2) * 703

# Kept as a separate Series for use alongside the scaled feature table.
drink_start_time = df['Drink Start Time']

# Columns carried forward as candidate model inputs. The three Ethnicity_*
# names assume the one-hot step produced at least indices 0..2.
feature_columns = [
    'nth drink',
    '# of standard drinks',
    'Alcohol Content (g)',
    'Time since drink (min)',
    'Height (ft)',
    'Height (in)',
    'Weight (lbs)',
    'BMI',
    'Empty Stomach?',
    'Age',
    'Sex',
    'Ethnicity_0',
    'Ethnicity_1',
    'Ethnicity_2',
]
selected_features = df[feature_columns]

In [8]:
# Restrict the selected features to numeric dtypes; any non-numeric column in
# selected_features is silently left out of the scaled table.
numeric_subset = selected_features.select_dtypes(include=[np.number])
numeric_feature_names = list(numeric_subset.columns)

# Standardise each numeric column. The scaler is fitted on every row available
# at this point.
scaler = StandardScaler()
numeric_features = scaler.fit_transform(numeric_subset)

# Wrap the scaled array back into a labelled frame (fresh default index).
numeric_features_df = pd.DataFrame(data=numeric_features, columns=numeric_feature_names)

# Append the (unscaled) drink start time as an extra column.
features = pd.concat([numeric_features_df, drink_start_time], axis='columns')

In [9]:
def create_sequences(data, sequence_length):
    """Slice a series into overlapping windows and their next-step targets.

    For every start position ``s`` in ``range(len(data) - sequence_length)``
    the window is ``data[s : s + sequence_length]`` and its target is the
    element immediately after it, ``data[s + sequence_length]``.

    Parameters
    ----------
    data : sequence
        Indexable, sliceable series of values (e.g. a 1-D NumPy array or list).
    sequence_length : int
        Number of consecutive elements in each window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, target)`` built with ``np.array``; both are empty when
        ``data`` has no more than ``sequence_length`` elements.
    """
    window_starts = range(len(data) - sequence_length)
    sequences = [data[start:start + sequence_length] for start in window_starts]
    target = [data[start + sequence_length] for start in window_starts]
    return np.array(sequences), np.array(target)

In [10]:
# Number of past BAC readings in each input window.
sequence_length = 10

# Build sliding windows over the 'Measured BAC' column: each row of X holds
# `sequence_length` consecutive readings and y holds the reading that follows.
X, y = create_sequences(df['Measured BAC'].values, sequence_length)

# create_sequences returns X as (n_windows, sequence_length). The LSTM stack is
# declared with input_shape=(sequence_length, 1), i.e. it expects a trailing
# feature axis, so add it explicitly rather than relying on Keras to adapt the
# input rank implicitly.
X = X.reshape(-1, sequence_length, 1)

In [11]:
# Ordered (unshuffled) split: first 70% of windows for training, the next 20%
# for validation, and whatever remains for testing.
train_size = int(0.7 * len(X))
val_size = int(0.2 * len(X))

train_part = slice(0, train_size)
val_part = slice(train_size, train_size + val_size)
test_part = slice(train_size + val_size, None)

X_train, y_train = X[train_part], y[train_part]
X_val, y_val = X[val_part], y[val_part]
X_test, y_test = X[test_part], y[test_part]

In [12]:

# Three stacked 64-unit LSTM layers, each followed by 20% dropout, feeding a
# small dense head that emits a single linear output (the next value).
# The first two LSTMs return the full sequence so the next LSTM receives one
# vector per timestep; the last returns only its final state.
model = Sequential([
    LSTM(64, input_shape=(sequence_length, 1), return_sequences=True),
    Dropout(0.2),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='linear'),
])

# Regression objective: minimise mean squared error with the Adam optimiser.
model.compile(optimizer='adam', loss='mean_squared_error')

In [13]:
# Train for 50 epochs in mini-batches of 64, reporting loss on the validation
# slice after every epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=64,
)

# Score the held-out test windows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error on Test Set: {mse}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Squared Error on Test Set: 0.002171476567392119
