In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import joblib

# Read the input CSV into a DataFrame; every later step works on `df`.
file_path = 'cleaned4_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Input columns fed to the model: eight readings for each of seven
# location prefixes (BN1, BN2, BN3, BE2, BS, BW1, BW2).
features = [
    "BN1-MCheek", "BN1-MJoistUp", "BN1-MJoistLo", "BN1-RPocket", "BN1-RPocket-C", "BN1-RPocket-DP", "BN1-TPocket", "BN1-TCheek",
    "BN2-MCheek", "BN2-MJoistUp", "BN2-MJoistLo", "BN2-RPocket", "BN2-RPocket-C", "BN2-RPocket-DP", "BN2-TPocket", "BN2-TCheek",
    "BN3-MCheek", "BN3-MJoistUp", "BN3-MJoistLo", "BN3-RPocket", "BN3-RPocket-C", "BN3-RPocket-DP", "BN3-TPocket", "BN3-TCheek",
    "BE2-MCheek", "BE2-MJoistUp", "BE2-MJoistLo", "BE2-RPocket", "BE2-RPocket-C", "BE2-RPocket-DP", "BE2-TPocket", "BE2-TCheek",
    "BS-MCheek", "BS-MJoistLeft", "BS-MJoistRight", "BS-RPocket", "BS-RPocket-C", "BS-RPocket-DP", "BS-TPocket", "BS-TCheek",
    "BW1-MCheek", "BW1-MJoistUp", "BW1-MJoistLo", "BW1-RPocket", "BW1-RPocket-C", "BW1-RPocket-DP", "BW1-TPocket", "BW1-TCheek",
    "BW2-MCheek", "BW2-MJoistUp", "BW2-MJoistLo", "BW2-RPocket", "BW2-RPocket-C", "BW2-RPocket-DP", "BW2-TPocket", "BW2-TCheek",
]

# Output columns, grouped by location. Every target is a "-MC" column and
# none of them may also appear in `features`: a column that is both an input
# and a target lets the model copy its input instead of learning anything.
# NOTE(review): the BSMT-N3 joist targets use the "-MC" suffix like every
# other group -- confirm the CSV has "BN3-MJoistUp-MC" and "BN3-MJoistLo-MC".
targets = {
    "BSMT-N1": ["BN1-MCheek-MC", "BN1-MJoistUp-MC", "BN1-MJoistLo-MC"],
    "BSMT-N2": ["BN2-MCheek-MC", "BN2-MJoistUp-MC", "BN2-MJoistLo-MC"],
    "BSMT-N3": ["BN3-MCheek-MC", "BN3-MJoistUp-MC", "BN3-MJoistLo-MC"],
    "BSMT-E2": ["BE2-MCheek-MC", "BE2-MJoistUp-MC", "BE2-MJoistLo-MC"],
    "BSMT-S": ["BS-MCheek-MC", "BS-MJoistLeft-MC", "BS-MJoistRight-MC"],
}

# Flatten the per-location target lists into one ordered column list.
# Built once and reused below, so the row filter and `y` always agree on
# the exact same columns in the same order.
target_columns = [column for group in targets.values() for column in group]

# Drop rows with a missing value in any feature or target column
df = df.dropna(subset=features + target_columns)

# Split into training and testing sets (80/20, fixed seed for repeatability)
X = df[features].values
y = df[target_columns].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
joblib.dump(scaler, 'scaler.pkl')  # Persist the fitted scaler to disk

# Append a trailing axis of length 1 so each sample has shape
# (n_features, 1), the 3-D layout passed to the LSTM.
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Hybrid model: an LSTM layer followed by a small dense head.
# The LSTM reads each sample as a sequence of X_train.shape[1] steps with a
# single value per step; the final layer emits one linear output per target.
n_steps = X_train.shape[1]
n_outputs = y.shape[1]

hybrid_model = Sequential()
hybrid_model.add(LSTM(64, activation='tanh', return_sequences=False, input_shape=(n_steps, 1)))
hybrid_model.add(Dropout(0.2))
hybrid_model.add(Dense(64, activation='relu'))
hybrid_model.add(Dropout(0.2))
hybrid_model.add(Dense(n_outputs, activation='linear'))

# Compile with Adam, optimizing mean squared error and tracking MAE
adam = keras.optimizers.Adam(learning_rate=0.001)
hybrid_model.compile(optimizer=adam, loss='mse', metrics=['mae'])

# Training settings: 50 epochs in mini-batches of 16.
# NOTE(review): the test split is also passed as validation data, so the
# "validation" curve in `history` is computed on the same data used for the
# final evaluation -- confirm this is intended.
fit_options = dict(
    epochs=50,
    batch_size=16,
    validation_data=(X_test, y_test),
)
history = hybrid_model.fit(X_train, y_train, **fit_options)

# Write the trained model to disk
hybrid_model.save('hybrid_model.h5')

# Score the model on the test split. `evaluate` yields the compiled loss
# followed by the compiled metric (MAE); R2 is computed from predictions.
test_scores = hybrid_model.evaluate(X_test, y_test)
loss, mae = test_scores
y_pred = hybrid_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Test Loss: {loss}, Test MAE: {mae}, R2 Score: {r2}")

# Draw the per-epoch training and validation loss curves on one axes
fig, ax = plt.subplots(figsize=(10, 5))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Training and Validation Loss Over Epochs')
plt.show()

# Collect the three evaluation numbers into a two-column table
metric_names = ['Loss', 'Mean Absolute Error', 'R2 Score']
metric_values = [loss, mae, r2]
eval_results = pd.DataFrame({'Metric': metric_names, 'Value': metric_values})

# Save the table to CSV (without the index column) and echo it
eval_results.to_csv('model_results.csv', index=False)
print(eval_results)