In [None]:
import os

# Set CUDA_VISIBLE_DEVICES to "-1" for this process (the usual way to hide
# all GPUs from CUDA-based libraries so that work runs on the CPU).
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Data preparation -------------------------------------------------------
# Text columns that are removed before modelling.
TEXT_COLUMNS = [
    'coamorphous_system', 'molecular_formula_drug', 'smiles_drug',
    'molecular_formula_coformer', 'smiles_coformer'
]
TARGET_COLUMN = 'experimental_tg'
SPLIT_SEED = 42

# Read the workbook and strip surrounding whitespace from the header names.
df = pd.read_excel("aiml_tg_prediction.xlsx")
df.columns = df.columns.str.strip()

# Remove the text columns, then discard every row that still has a missing value.
df = df.drop(columns=TEXT_COLUMNS).dropna()

# Features are every remaining column except the target.
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# 80 % of the rows go to training; the held-back 20 % is then halved into
# validation and test (10 % of the data each). Both splits use the same seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)

# Fit the scaler on the training features only, then apply the same
# transformation to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
import os

# Repeats the GPU-masking step from the first cell: CUDA_VISIBLE_DEVICES is
# (re)written to "-1" before the Keras imports in the next cell.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras import regularizers

# Feed-forward regressor: three SELU hidden layers (512 -> 256 -> 128 units),
# each with an L2 kernel penalty, dropout after the first two hidden layers,
# and a single-unit output layer with no activation specified.
L2_STRENGTH = 0.001
DROPOUT_RATE = 0.3

# The input width is taken from the number of scaled feature columns.
layer_stack = [Input(shape=(X_train_scaled.shape[1],))]
for n_units, use_dropout in ((512, True), (256, True), (128, False)):
    layer_stack.append(
        Dense(n_units, activation='selu',
              kernel_regularizer=regularizers.l2(L2_STRENGTH))
    )
    if use_dropout:
        layer_stack.append(Dropout(DROPOUT_RATE))
layer_stack.append(Dense(1))

model = Sequential(layer_stack)

In [None]:
import os

# Third copy of the GPU-masking step; the value written is the same "-1".
# NOTE(review): tensorflow.keras has already been imported and a model built
# by the time this cell runs, so this assignment is probably redundant —
# confirm and consider removing the cell.
os.environ.update([("CUDA_VISIBLE_DEVICES", "-1")])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Both callbacks watch the validation loss.
MONITORED_METRIC = 'val_loss'

# Stop training once the monitored value has not improved for 1000 epochs,
# and restore the weights from the best epoch.
early_stop = EarlyStopping(
    monitor=MONITORED_METRIC,
    patience=1000,
    restore_best_weights=True,
)

# Scale the learning rate by 0.9 after 500 epochs without improvement,
# with 1e-6 as the lower bound.
reduce_lr = ReduceLROnPlateau(
    monitor=MONITORED_METRIC,
    factor=0.9,
    patience=500,
    min_lr=1e-6,
)

In [None]:
# Optimise mean squared error with Adam and track mean absolute error
# as an additional metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'mean_squared_error',
    'metrics': ['mae'],
}
model.compile(**compile_options)

In [None]:
# Training configuration: up to 6000 epochs in batches of 1000 samples,
# validated on the validation split after every epoch; the early-stopping and
# learning-rate callbacks defined earlier decide when training ends or slows.
fit_options = dict(
    validation_data=(X_val_scaled, y_val),
    epochs=6000,
    batch_size=1000,
    callbacks=[early_stop, reduce_lr],
    verbose=1,
)

# `history` keeps the per-epoch loss/metric curves for the plots further down.
history = model.fit(X_train_scaled, y_train, **fit_options)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Test-set predictions, collapsed to a 1-D array.
y_pred = model.predict(X_test_scaled).reshape(-1)

# Error metrics on the held-out test split.
mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
r2 = r2_score(y_test, y_pred)

# One metric per output line.
print(f"MAE: {mae:.2f}", f"RMSE: {rmse:.2f}", f"R² Score: {r2:.3f}", sep="\n")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the test set.
# The prediction is flattened to 1-D, matching the first evaluation cell.
# Without it, this cell rebinds `y_pred` to the raw model output, and every
# later cell that reuses `y_pred` has to reshape it again.
y_pred = model.predict(X_test_scaled).flatten()

# Evaluation metrics on the held-out test split
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the test set.
# Flatten to 1-D so `y_pred` has the same shape as in the first evaluation
# cell; the plotting cells below all reuse this variable.
y_pred = model.predict(X_test_scaled).flatten()

# Evaluation metrics on the held-out test split
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.3f}")

In [None]:
import matplotlib.pyplot as plt

# Parity plot: predicted Tg against actual Tg, with a dashed y = x reference
# line drawn across the range of the actual values.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.6)

tg_low, tg_high = min(y_test), max(y_test)
ax.plot([tg_low, tg_high], [tg_low, tg_high], color='red', linestyle='--')

ax.set_xlabel('Actual Tg')
ax.set_ylabel('Predicted Tg')
ax.set_title('Actual vs Predicted Tg')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Residual = actual minus predicted, with the prediction viewed as 1-D.
residuals = y_test - y_pred.ravel()

# Histogram of the residuals in 25 bins.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(residuals, bins=25, color='orange', edgecolor='black')
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.set_title('Residuals Distribution')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Same parity plot as the earlier cell, this time also written to disk
# as a 600-dpi PNG before being shown.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.6)

# Dashed identity line spanning the range of the actual values.
diagonal = [min(y_test), max(y_test)]
ax.plot(diagonal, diagonal, color='red', linestyle='--')

ax.set(xlabel='Actual Tg', ylabel='Predicted Tg', title='Actual vs Predicted Tg')
ax.grid(True)
fig.tight_layout()
fig.savefig("actual_vs_predicted_tg.png", dpi=600)
plt.show()

In [None]:
# Residual = actual minus predicted, with the prediction viewed as 1-D.
residuals = y_test - y_pred.ravel()

# 25-bin residual histogram, saved as a 600-dpi PNG and then displayed.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(residuals, bins=25, color='orange', edgecolor='black')
ax.set(xlabel='Residuals', ylabel='Frequency', title='Residuals Distribution')
ax.grid(True)
fig.tight_layout()
fig.savefig("residuals_distribution.png", dpi=600)
plt.show()


In [None]:
# Side-by-side learning curves read from the Keras history object:
# loss in the left panel, MAE in the right panel.
# Each entry: (train key, validation key, train label, validation label,
#              panel title, y-axis label)
curve_panels = [
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss During Training', 'Loss'),
    ('mae', 'val_mae', 'Train MAE', 'Val MAE', 'MAE During Training', 'MAE'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, panel in zip(axes, curve_panels):
    train_key, val_key, train_label, val_label, panel_title, y_label = panel
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(panel_title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True)

fig.tight_layout()
fig.savefig("training_curves.png", dpi=600)
plt.show()


In [None]:
# Residuals (actual minus predicted) plotted against the predicted value,
# with a dashed zero line for reference.
residuals = y_test - y_pred.ravel()

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_pred, residuals, alpha=0.6, color='purple')
ax.axhline(0, color='red', linestyle='--')
ax.set(
    xlabel='Predicted Tg',
    ylabel='Residuals',
    title='Residuals vs Predicted Tg',
)
ax.grid(True)
fig.tight_layout()
fig.savefig("residuals_vs_predicted.png", dpi=600)
plt.show()


In [None]:
import shap

# Data handed to SHAP: the scaled training features wrapped in a DataFrame
# that carries the feature column names of X.
# NOTE(review): every training row is used here, not a sub-sample; consider
# sampling (e.g. ~100 rows) if this cell is too slow — confirm.
X_summary = pd.DataFrame(X_train_scaled, columns=X.columns)

# Generic shap.Explainer around model.predict, with X_summary as its second
# argument; the resulting explainer is then evaluated on the same X_summary.
# NOTE(review): no explainer algorithm is selected explicitly here (this is
# not a direct KernelExplainer call) — confirm which one SHAP picks.
explainer = shap.Explainer(model.predict, X_summary)
shap_values = explainer(X_summary)

# SHAP summary plot, limited to 15 displayed features
shap.summary_plot(shap_values, X_summary, max_display=15)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Signed prediction error on the test split (actual minus predicted),
# with the prediction viewed as a 1-D array.
errors = y_test - y_pred.reshape(-1)

plt.figure(figsize=(6, 4))
# `fill=True` replaces the former `shade=True` keyword, which seaborn
# deprecated in v0.11 and no longer accepts in current releases; the filled
# density curve it draws is the same.
sns.kdeplot(errors, fill=True, color="darkblue")
# Dashed reference line at zero error.
plt.axvline(0, linestyle='--', color='red')
plt.title("Prediction Error Density")
plt.xlabel("Error (Actual - Predicted)")
plt.tight_layout()
plt.savefig("prediction_error_density.png", dpi=600)
plt.show()

In [None]:
import scipy.stats as stats

# Residual = actual minus predicted, with the prediction viewed as 1-D.
residuals = y_test - y_pred.ravel()

# Normal probability (Q-Q) plot of the residuals drawn on an explicit Axes;
# the title set afterwards replaces the one probplot writes.
fig, ax = plt.subplots(figsize=(6, 4))
stats.probplot(residuals, dist="norm", plot=ax)
ax.set_title("Q-Q Plot of Residuals")
fig.tight_layout()
fig.savefig("qq_plot_residuals.png", dpi=600)
plt.show()

In [None]:
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# R² of the fitted model on the training split and on the test split.
r2_train = r2_score(y_train, model.predict(X_train_scaled))
r2_test = r2_score(y_test, model.predict(X_test_scaled))

plt.bar(['Train', 'Test'], [r2_train, r2_test], color=['skyblue', 'lightgreen'])
# R² can be negative for a model that fits worse than predicting the mean.
# The axis starts at 0 as before, but extends downwards when either score is
# negative so that the bar is not clipped out of view.
plt.ylim(min(0, r2_train, r2_test), 1)
plt.ylabel('R² Score')
plt.title('R² Comparison - Train vs Test (ANN)')
plt.tight_layout()
plt.savefig("r2_comparison_ann.png", dpi=600)
plt.show()

In [None]:
import numpy as np

def _regression_scores(y_true, y_hat):
    """Return [R², MAE, RMSE] for one set of targets and predictions."""
    return [
        r2_score(y_true, y_hat),
        mean_absolute_error(y_true, y_hat),
        np.sqrt(mean_squared_error(y_true, y_hat)),
    ]


# Model output for each of the three splits.
y_train_pred = model.predict(X_train_scaled)
y_val_pred = model.predict(X_val_scaled)
y_test_pred = model.predict(X_test_scaled)

# Split name -> [R², MAE, RMSE]
metrics = {
    "Train": _regression_scores(y_train, y_train_pred),
    "Validation": _regression_scores(y_val, y_val_pred),
    "Test": _regression_scores(y_test, y_test_pred),
}

# Transpose the per-split score lists into one list per metric.
labels = list(metrics)
r2_vals, mae_vals, rmse_vals = (list(column) for column in zip(*metrics.values()))

# Grouped bars: one group per split, one bar per metric.
x = np.arange(len(labels))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width, r2_vals, width, label='R²', color='cornflowerblue')
ax.bar(x, mae_vals, width, label='MAE', color='orange')
ax.bar(x + width, rmse_vals, width, label='RMSE', color='green')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.set_ylabel('Metric Value')
ax.set_title('Model Performance Across Datasets')
ax.legend()
fig.tight_layout()
fig.savefig("performance_across_datasets.png", dpi=600)
plt.show()

In [None]:
# Order the test samples by their actual Tg and overlay the predictions
# taken in that same order.
sorted_idx = np.argsort(y_test)
sample_axis = np.arange(len(y_test))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sample_axis, y_test.values[sorted_idx], label='Actual Tg', linewidth=2)
ax.plot(sample_axis, y_pred[sorted_idx], label='Predicted Tg',
        linestyle='--', color='orange')
ax.set_xlabel('Sample Index (Sorted by Actual Tg)')
ax.set_ylabel('Tg')
ax.set_title('Actual vs Predicted Tg (Sorted)')
ax.legend()
fig.tight_layout()
fig.savefig("lineplot_actual_vs_pred.png", dpi=600)
plt.show()

In [None]:
# Residual = actual minus predicted, with the prediction viewed as 1-D.
residuals = y_test - y_pred.ravel()

# 15-bin residual histogram, saved as a 600-dpi PNG and then displayed.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(residuals, bins=15, color='gray', edgecolor='black')
ax.set_title("Residual Distribution")
ax.set_xlabel("Prediction Error")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig("residual_histogram.png", dpi=600)
plt.show()

In [None]:
import seaborn as sns

# Switch seaborn to the white-grid theme for this figure.
sns.set(style='whitegrid')

# Scatter of predicted vs. actual Tg with a fitted regression line (red)
# and a 95 % confidence band, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 5))
sns.regplot(
    x=y_test,
    y=y_pred.ravel(),
    ci=95,
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_xlabel("experimental_tg")
ax.set_ylabel("predicted_tg")
ax.set_title("Seaborn Regression Fit")
fig.tight_layout()
fig.savefig("regression_fit_seaborn.png", dpi=600)
plt.show()