In [None]:
import pandas as pd
import numpy as np 
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# --- 1. Load and Prepare the Data ---
# The spreadsheet supplies the target column 'Option_Price_C'; every other
# column is kept as a model feature.
df = pd.read_excel('Final_Without_IV.xlsx')
y = df['Option_Price_C']
X = df.drop(columns='Option_Price_C')

# Model log(1 + price) instead of the raw price so the wide range of option
# prices is compressed before training. The raw `y` is kept for later
# comparison on the original scale.
y_log = np.log1p(y)
print("Applied log(1+x) transformation to the target variable 'y'.")


# --- 2. Split and Scale the Data ---
# Hold out 20% of the rows as a test set. The split is made on the
# log-transformed target; the fixed random_state keeps it repeatable.
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Look up the untransformed prices of the test rows by index label so the
# final errors can be reported on the original scale.
y_test = y.loc[y_test_log.index]

# Learn the standardization statistics from the training rows only, then
# apply that same transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# --- 3. Build and Compile the Model ---
# Fully connected regression network (128 -> 64 -> 32 -> 1 units) trained to
# predict the log-transformed price from the scaled features.
#
# The input width is declared with an explicit Input object as the first
# entry of the Sequential model. Passing `input_shape=` to the first Dense
# layer instead makes Keras emit a UserWarning for Sequential models.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  # single linear output: the predicted log-price
])

# MSE is the training loss; RMSE is tracked as an additional metric so the
# error is also reported in the same units as the (log) target.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

# Multiply the learning rate by 0.2 whenever the validation loss has not
# improved for 5 consecutive epochs, never going below 5e-6.
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=0.000005,
    verbose=1
)
model.summary()


# --- 4. Train the Model ---
# Fit on the scaled training features against the log-transformed target.
# validation_split=0.2 sets aside part of the training data for validation;
# its loss ('val_loss') is what the learning-rate scheduler monitors.
history = model.fit(
    x=X_train_scaled,
    y=y_train_log,
    epochs=100,
    verbose=1,
    callbacks=[lr_scheduler],
    validation_split=0.2,
)
print("Model training finished.")

# Persist the trained network in the native Keras format.
model.save('option_log_price_predictor.keras')
print("Model saved successfully to 'option_log_price_predictor.keras'")


# --- 5. Evaluate and Predict ---

# Score the model on the held-out test set in log space. evaluate() returns
# the compiled loss followed by the compiled metric (MSE, then RMSE).
loss, rmse_log = model.evaluate(X_test_scaled, y_test_log, verbose=0)
print(f"\nTest Set RMSE on Log-Transformed Prices: {rmse_log:.4f}")

# Raw network outputs are on the log(1 + price) scale; flatten them to 1-D.
log_predictions = model.predict(X_test_scaled).flatten()

# --- >>> BIAS CORRECTION STARTS HERE <<< ---

# Exponentiating a log-scale prediction yields an estimate of the median, not
# the mean, of the price. Adding half the log-scale error variance before the
# inverse transform shifts it towards the mean.
# NOTE(review): this correction assumes the log-scale errors are roughly
# normal with constant variance -- confirm against the residuals.
#
# The error variance (sigma^2) is approximated by the validation MSE recorded
# for the last training epoch.
sigma_sq = history.history['val_loss'][-1]
print(f"\nUsing final validation MSE for bias correction: {sigma_sq:.4f}")

corrected_log_predictions = log_predictions + 0.5 * sigma_sq

# Undo the log(1 + x) transform for both variants so the effect of the
# correction can be compared on the original price scale.
predictions_biased, predictions_corrected = (
    np.expm1(log_values)
    for log_values in (log_predictions, corrected_log_predictions)
)

# RMSE against the untransformed test prices, for each variant.
rmse_biased, rmse_corrected = (
    np.sqrt(mean_squared_error(y_test, predicted))
    for predicted in (predictions_biased, predictions_corrected)
)

print(f"\nRMSE of Biased (uncorrected) Predictions: ${rmse_biased:.2f}")
print(f"RMSE of Corrected Predictions:           ${rmse_corrected:.2f}")

# Relative RMSE reduction in percent (negative if the correction made the
# RMSE worse).
improvement = (rmse_biased - rmse_corrected) / rmse_biased * 100
print(f"Improvement from correction: {improvement:.2f}%")

# --- >>> BIAS CORRECTION ENDS HERE <<< ---

# Side-by-side table of the actual prices and both prediction variants; the
# row labels come from y_test's index.
results = pd.DataFrame({
    'Actual_Price': y_test,
    'Predicted_Biased': predictions_biased,
    'Predicted_Corrected': predictions_corrected,
})
print("\nSample Predictions (in original dollar scale):")
print(results.head(20))

Applied log(1+x) transformation to the target variable 'y'.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 1.6104 - root_mean_squared_error: 1.1289 - val_loss: 0.1808 - val_root_mean_squared_error: 0.4252 - learning_rate: 0.0010
Epoch 2/100
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.1396 - root_mean_squared_error: 0.3735 - val_loss: 0.1189 - val_root_mean_squared_error: 0.3449 - learning_rate: 0.0010
Epoch 3/100
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.1265 - root_mean_squared_error: 0.3556 - val_loss: 0.1072 - val_root_mean_squared_error: 0.3275 - learning_rate: 0.0010
Epoch 4/100
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - loss: 0.1186 - root_mean_squared_error: 0.3444 - val_loss: 0.1003 - val_root_mean_squared_error: 0.3168 - learning_rate: 0.0010
Epoch 5/100
[1m3063/3063[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 0.1118 - root_mean_squared