In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import joblib
import shap
import matplotlib.pyplot as plt

In [None]:
# Read the Tg dataset from the Excel workbook into a DataFrame
file_path = 'aiml_tg_prediction.xlsx'
df = pd.read_excel(file_path)

# Report how many entries are missing in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Define the regression target and the numeric feature matrix.
target_col = 'experimental_tg'

# Rows without a measured target cannot serve as regression labels (NaN labels
# break both training and the sklearn metrics used later), so drop them here
# instead of only printing the missing-value counts.
labelled = df.dropna(subset=[target_col])
n_dropped = len(df) - len(labelled)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing '{target_col}'")

features = labelled.drop(columns=[target_col])
X = features.select_dtypes(include=[np.number])  # Only numerical features
y = labelled[target_col]

In [None]:
# Stage 1: hold out 20% of the rows; the other 80% becomes the training set
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the held-out 20% in half -> validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Wrap the training and validation splits in XGBoost's DMatrix container
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Booster hyper-parameters
params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.1,
    'max_depth': 6,
    'seed': 42
}

# Train using the native XGBoost API; training stops once the validation
# metric has not improved for 10 consecutive rounds.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=[(dval, 'validation')],
    early_stopping_rounds=10,
    verbose_eval=True
)

# xgb.train() hands back the booster from the LAST boosting round, not the
# best one, so after early stopping it still contains the rounds that failed
# to improve the validation score. Slice it down to the best iteration so that
# predictions, SHAP explanations and the saved model all use the best model.
best_iteration = getattr(xgb_model, 'best_iteration', None)
if best_iteration is not None:
    xgb_model = xgb_model[: best_iteration + 1]

In [None]:
# Wrap the held-out features for inference (no labels are attached)
dtest = xgb.DMatrix(data=X_test)

# Score the test split with the trained booster
y_pred = xgb_model.predict(data=dtest)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Error metrics of the test-set predictions against the measured values
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f"Test MSE: {mse:.2f}")
print(f"Test MAE: {mae:.2f}")
print(f"Test R²: {r2:.2f}")

In [None]:
# Persist the trained booster to disk (.json file)
predictor_path = "xgb_tg_predictor.json"
xgb_model.save_model(predictor_path)

In [None]:
# Build a SHAP explainer for the booster and explain the test rows
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test)

# Summary plot of the SHAP values for every feature
shap.summary_plot(
    shap_values,
    X_test,
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Score the held-out test split with the trained booster
dtest = xgb.DMatrix(data=X_test)
y_pred = xgb_model.predict(data=dtest)

# Evaluation metrics on the test split
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f"Test MSE: {mse:.2f}")
print(f"Test MAE: {mae:.2f}")
print(f"Test R²: {r2:.2f}")

In [None]:
# Parity plot: predicted vs. measured Tg with the y = x reference line
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_pred, s=60, color='crimson', edgecolor='black', ax=ax)

# Dashed diagonal spanning the measured range marks a perfect prediction
tg_low, tg_high = y_test.min(), y_test.max()
ax.plot([tg_low, tg_high], [tg_low, tg_high], 'k--', lw=2)

ax.set_xlabel("Actual Tg")
ax.set_ylabel("Predicted Tg")
ax.set_title("XGBoost: Actual vs Predicted Tg")
ax.grid(True)
fig.tight_layout()
fig.savefig("xgb_tg_actual_vs_pred.png", dpi=300)
plt.show()

In [None]:
# Recompute SHAP values for the test rows
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test)

# Bar-style SHAP summary plot
shap.summary_plot(
    shap_values,
    X_test,
    plot_type="bar",
    show=True,
)

In [None]:
# Side-by-side table of measured and predicted Tg for the test rows
prediction_columns = {
    "Actual_Tg": y_test.values,
    "Predicted_Tg": y_pred,
}
results_df = pd.DataFrame(prediction_columns)

# Write the table to an Excel workbook without the row index
results_df.to_excel("xgb_tg_predictions.xlsx", index=False)

In [None]:
# Write a second copy of the trained booster under the "model" file name
model_path = "xgb_tg_model.json"
xgb_model.save_model(model_path)

In [None]:
# Overlaid density histograms (with KDE) of measured and predicted Tg
fig, ax = plt.subplots(figsize=(8, 5))
for tg_series, fill_colour, legend_label in (
    (y_test, "skyblue", "Actual Tg"),
    (y_pred, "salmon", "Predicted Tg"),
):
    sns.histplot(
        tg_series,
        bins=20,
        kde=True,
        color=fill_colour,
        label=legend_label,
        stat="density",
        linewidth=0,
        ax=ax,
    )
ax.set_xlabel("Glass Transition Temperature (Tg)")
ax.set_ylabel("Density")
ax.set_title("Distribution of Actual vs Predicted Tg")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("xgb_tg_distribution_kde.png", dpi=300)
plt.show()

In [None]:
# Violin plot comparing the measured and predicted Tg distributions
fig, ax = plt.subplots(figsize=(6, 4))
sns.violinplot(data=[y_test, y_pred], palette=["skyblue", "salmon"], ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Actual Tg", "Predicted Tg"])
ax.set_ylabel("Tg")
ax.set_title("Violin Plot of Tg Distribution")
ax.grid(True)
fig.tight_layout()
fig.savefig("xgb_tg_violin_plot.png", dpi=300)
plt.show()

In [None]:
# Box plot comparing the measured and predicted Tg distributions
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=[y_test, y_pred], palette=["skyblue", "salmon"], ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Actual Tg", "Predicted Tg"])
ax.set_ylabel("Tg")
ax.set_title("Box Plot of Tg: Actual vs Predicted")
ax.grid(True)
fig.tight_layout()
fig.savefig("xgb_tg_box_plot.png", dpi=300)
plt.show()

In [None]:
import shap

# Recompute the explainer and SHAP values so this cell does not depend on
# an earlier SHAP cell having been run
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test)

# Load SHAP's JavaScript so the interactive plot can render in the notebook
shap.initjs()

# Force plot for the first test sample (kept as the cell's last expression
# so the notebook displays it)
first_sample_contributions = shap_values[0].values
first_sample_features = X_test.iloc[0]
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=first_sample_contributions,
    features=first_sample_features,
    feature_names=X_test.columns
)

In [None]:
# Build the force plot for the first test sample ...
sample0_force_plot = shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values[0].values,
    features=X_test.iloc[0],
    feature_names=X_test.columns
)

# ... and write it out as a standalone HTML file
shap.save_html("shap_force_plot_sample0.html", sample0_force_plot)

In [None]:
# R² on the training and test splits, computed from the fitted booster.
# The cell previously plotted hard-coded example scores, so the figure did
# not reflect the model that was actually trained.
r2_train = r2_score(y_train, xgb_model.predict(xgb.DMatrix(X_train)))
r2_test = r2_score(y_test, xgb_model.predict(xgb.DMatrix(X_test)))

plt.figure(figsize=(5, 5))
plt.bar(['Train', 'Test'], [r2_train, r2_test], color=['skyblue', 'lightgreen'])
# R² can be negative for a poor fit; widen the lower limit in that case so
# the bar is not clipped at zero.
plt.ylim(min(0, r2_train, r2_test), 1)
plt.ylabel("R² Score")
# The model in this notebook is XGBoost, not a random forest.
plt.title("R² Comparison - Train vs Test (XGBoost)")
plt.tight_layout()
# Output file name is left unchanged so existing references to it still work.
plt.savefig("rf_r2_comparison.png", dpi=300)
plt.show()

In [None]:
# Histogram of all experimental Tg values in 10-unit bins.
tg_values = df["experimental_tg"].dropna()
bin_width = 10

# np.arange excludes its stop value, so np.arange(-60, 180, 10) ends at 170 and
# any value above 170 (or below -60) is silently left out of the histogram.
# Include the 180 edge and widen the range when the data extends beyond it.
lower_edge, upper_edge = -60, 180
if not tg_values.empty:
    lower_edge = min(lower_edge, int(np.floor(tg_values.min() / bin_width)) * bin_width)
    upper_edge = max(upper_edge, int(np.ceil(tg_values.max() / bin_width)) * bin_width)
tg_bins = np.arange(lower_edge, upper_edge + bin_width, bin_width)

plt.figure(figsize=(10, 6))
sns.histplot(tg_values, bins=tg_bins, color="lightcoral", label="Tg Distribution")
plt.xlabel("Glass Transition Temperature (Tg)")
plt.ylabel("Frequency")
plt.title("Histogram of Tg Values")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("tg_histogram_full_range.png", dpi=300)
plt.show()

In [None]:
labels = ['Train', 'Validation', 'Test']

# Score the fitted booster on every split instead of plotting hand-typed
# placeholder numbers, so the chart always reflects the current model.
# NOTE: as before, `r2` and `mae` are rebound here to per-split lists
# (ordered like `labels`).
r2, mae, rmse = [], [], []
for split_features, split_target in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    split_pred = xgb_model.predict(xgb.DMatrix(split_features))
    r2.append(r2_score(split_target, split_pred))
    mae.append(mean_absolute_error(split_target, split_pred))
    # RMSE is the square root of the mean squared error
    rmse.append(float(np.sqrt(mean_squared_error(split_target, split_pred))))

# One group of three bars (R², MAE, RMSE) per split
x = np.arange(len(labels))
width = 0.25

plt.figure(figsize=(10, 6))
plt.bar(x - width, r2, width, label='R²', color='cornflowerblue')
plt.bar(x, mae, width, label='MAE', color='orange')
plt.bar(x + width, rmse, width, label='RMSE', color='green')

plt.ylabel('Metric Value')
plt.title('Model Performance Across Datasets')
plt.xticks(x, labels)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("model_metric_comparison.png", dpi=300)
plt.show()

In [None]:
# Order the test samples by measured Tg so both curves share one x-axis
sorted_idx = np.argsort(y_test)
actual_sorted = y_test.values[sorted_idx]
predicted_sorted = y_pred[sorted_idx]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(np.arange(len(y_test)), actual_sorted, label="Actual Tg", linewidth=2)
ax.plot(np.arange(len(y_pred)), predicted_sorted, '--', label="Predicted Tg", linewidth=2)
ax.set_xlabel("Sample Index (Sorted by Actual Tg)")
ax.set_ylabel("Tg")
ax.set_title("Actual vs Predicted Tg (Sorted)")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("actual_vs_pred_sorted.png", dpi=300)
plt.show()

In [None]:
# Prediction error per test sample (measured minus predicted)
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(6, 5))
ax.hist(residuals, bins=15, color='gray', edgecolor='black')
ax.set_xlabel("Prediction Error")
ax.set_ylabel("Frequency")
ax.set_title("Residual Distribution")
ax.grid(True)
fig.tight_layout()
fig.savefig("residual_histogram.png", dpi=300)
plt.show()

In [None]:
# Scatter of predicted vs. measured Tg with a fitted line and 95% band
fig, ax = plt.subplots(figsize=(6, 5))
sns.regplot(x=y_test, y=y_pred, ci=95, scatter_kws={"s": 50}, ax=ax)
ax.set_xlabel("experimental_tg")
ax.set_ylabel("predicted_tg")
ax.set_title("Linear Fit: Experimental vs Predicted Tg")
fig.tight_layout()
fig.savefig("regression_fit_plot.png", dpi=300)
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Per-split metrics computed from the fitted booster (the cell previously
# exported hard-coded example numbers marked "replace with actual").
metric_rows = []
for split_name, split_features, split_target in zip(
    ['Train', 'Validation', 'Test'],
    (X_train, X_val, X_test),
    (y_train, y_val, y_test),
):
    split_pred = xgb_model.predict(xgb.DMatrix(split_features))
    metric_rows.append({
        'Dataset': split_name,
        'R2': r2_score(split_target, split_pred),
        'MAE': mean_absolute_error(split_target, split_pred),
        # RMSE is the square root of the mean squared error
        'RMSE': float(np.sqrt(mean_squared_error(split_target, split_pred))),
    })
metrics_df = pd.DataFrame(metric_rows, columns=['Dataset', 'R2', 'MAE', 'RMSE'])

# Tg histogram binning in 10-unit bins.
# np.arange excludes its stop value, so np.arange(-60, 180, 10) ends at 170 and
# values above 170 (or below -60) would be silently left out of the counts.
# Include the 180 edge and widen the range when the data extends beyond it.
tg_values = df["experimental_tg"].dropna()
bin_width = 10
lower_edge, upper_edge = -60, 180
if not tg_values.empty:
    lower_edge = min(lower_edge, int(np.floor(tg_values.min() / bin_width)) * bin_width)
    upper_edge = max(upper_edge, int(np.ceil(tg_values.max() / bin_width)) * bin_width)
bins = np.arange(lower_edge, upper_edge + bin_width, bin_width)
hist, bin_edges = np.histogram(tg_values, bins=bins)
tg_distribution_df = pd.DataFrame({
    'Bin_Start': bin_edges[:-1],
    'Bin_End': bin_edges[1:],
    'Frequency': hist
})

# Residuals DataFrame
residuals = y_test - y_pred
residuals_df = pd.DataFrame({
    'Actual_Tg': y_test.values,
    'Predicted_Tg': y_pred,
    # Use the raw values so all three columns are matched by position rather
    # than depending on how pandas aligns the Series' shuffled index.
    'Residual': residuals.values
})

# Save all to Excel
with pd.ExcelWriter("xgb_tg_statistics.xlsx") as writer:
    metrics_df.to_excel(writer, sheet_name="Model_Metrics", index=False)
    tg_distribution_df.to_excel(writer, sheet_name="Tg_Distribution", index=False)
    residuals_df.to_excel(writer, sheet_name="Prediction_Residuals", index=False)

print("✅ Excel summary saved as xgb_tg_statistics.xlsx")