# In [None]:  (notebook cell marker left over from export)
from IPython import get_ipython
from IPython.display import display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import joblib

# Load dataset: read the cleaned CSV (path is relative to the working directory)
data_path = "cleaned4_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Define feature and target columns.
# Column names follow "<location>-<measurement>". Every location has the same
# measurements except for its two joist columns, which are Up/Lo everywhere
# but at BS, where they are Left/Right.
_JOISTS_UP_LO = ("MJoistUp", "MJoistLo")
_JOISTS_LEFT_RIGHT = ("MJoistLeft", "MJoistRight")
_JOISTS_BY_LOCATION = {
    "BN1": _JOISTS_UP_LO,
    "BN2": _JOISTS_UP_LO,
    "BN3": _JOISTS_UP_LO,
    "BE2": _JOISTS_UP_LO,
    "BS": _JOISTS_LEFT_RIGHT,
    "BW1": _JOISTS_UP_LO,
    "BW2": _JOISTS_UP_LO,
}
_POCKET_AND_CHEEK = ("RPocket", "RPocket-C", "RPocket-DP", "TPocket", "TCheek")

# Model inputs: eight columns per location, locations in the order listed above.
features = [
    f"{location}-{measurement}"
    for location, joists in _JOISTS_BY_LOCATION.items()
    for measurement in ("MCheek", *joists, *_POCKET_AND_CHEEK)
]

# Model outputs, keyed by group name: the "-MC" suffixed cheek and joist
# columns of each location. BW1 and BW2 contribute inputs only.
_LOCATION_BY_TARGET_GROUP = {
    "BSMT-N1": "BN1",
    "BSMT-N2": "BN2",
    "BSMT-N3": "BN3",
    "BSMT-E2": "BE2",
    "BSMT-S": "BS",
}
targets = {
    group: [
        f"{location}-{measurement}-MC"
        for measurement in ("MCheek", *_JOISTS_BY_LOCATION[location])
    ]
    for group, location in _LOCATION_BY_TARGET_GROUP.items()
}

# Flatten the groups into one ordered list of target column names.
target_columns = sum(targets.values(), [])

# Keep only the model's input/output columns and drop every row that is
# missing a value in any of them.
selected_columns = features + target_columns
df = df[selected_columns].dropna()

# Split dataset: 80 % train / 20 % test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target_columns],
    test_size=0.2,
    random_state=42,
)

# Scale features: statistics are learned from the training rows only and then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features list:", features)
# Width of the scaled matrix; used as the network's input size
num_features = X_train_scaled.shape[-1]
print(f"Number of features: {num_features}")

# Seed Python's `random`, NumPy and the Keras backend before any layer is
# created. The train/test split and the random forest are already pinned to
# random_state=42; without this call the network's weight initialisation and
# batch shuffling differ on every run, so its metrics (and the hybrid's) are
# not repeatable. NOTE: bit-identical results on a GPU may additionally need
# deterministic ops, which are not enabled here.
keras.utils.set_random_seed(42)

# Define neural network model: two ReLU hidden layers followed by a linear
# output layer with one unit per target column (multi-output regression).
model = keras.Sequential([
    layers.Input(shape=(num_features,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(len(target_columns), activation='linear')
])

# Optimise mean squared error; mean absolute error is tracked as an extra metric.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

print("Shape of X_train_scaled:", X_train_scaled.shape)

# Train model. validation_split=0.2 holds out part of the training rows for
# the validation loss; the test partition is not touched here.
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=16, validation_split=0.2, verbose=1)

# Plot learning curve: per-epoch training loss against validation loss,
# drawn on the current axes.
loss_by_epoch = history.history
ax = plt.gca()
ax.plot(loss_by_epoch['loss'], label='Training Loss')
ax.plot(loss_by_epoch['val_loss'], label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Learning Curve')
ax.legend()
plt.show()

# Train Random Forest Model on the unscaled training features
# (fit() returns the estimator, so construction and training are chained).
symbolic_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Make predictions on the test partition: the network gets the scaled
# features, the forest gets the raw ones, matching how each was trained.
nn_predictions = model.predict(X_test_scaled)
symbolic_predictions = symbolic_model.predict(X_test)

# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    """Print and return MSE, MAE and R2 for one set of predictions.

    Each score comes from the corresponding scikit-learn metric called with
    its default arguments.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        model_name: Label placed at the start of the printed summary line.

    Returns:
        A ``(mse, mae, r2)`` tuple.
    """
    scores = tuple(
        metric(y_true, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    mse, mae, r2 = scores
    print(f"{model_name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2 Score: {r2:.4f}")
    return scores

# Score each individual model on the held-out test partition
nn_metrics = evaluate_model(y_test, nn_predictions, "Neural Network")
symbolic_metrics = evaluate_model(y_test, symbolic_predictions, "Random Forest")

# Hybrid Model (Averaging Predictions): element-wise mean of the two outputs
final_predictions = (symbolic_predictions + nn_predictions) / 2
final_metrics = evaluate_model(y_test, final_predictions, "Hybrid Model")

# Store results: one row per model, one column per score
results_df = pd.DataFrame(
    [
        ("Neural Network", *nn_metrics),
        ("Random Forest", *symbolic_metrics),
        ("Hybrid Model", *final_metrics),
    ],
    columns=["Model", "MSE", "MAE", "R2 Score"],
)

print("\nModel Evaluation Results:")
print(results_df)

# Save models: write the trained network, the random forest and the fitted
# feature scaler to the working directory. The scaler is stored alongside the
# network because the network was trained on scaled inputs.
model.save(filepath="neural_model.h5")
joblib.dump(value=symbolic_model, filename="symbolic_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")


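# Captured notebook output from the last run:
# ModuleNotFoundError: No module named 'sklearn'
# (the scikit-learn package was not installed in the kernel's environment)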