In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Read the Tg dataset from the Excel workbook into a DataFrame.
df = pd.read_excel("aiml_tg_prediction.xlsx")
# Print the (rows, columns) tuple, then let the notebook render the first five rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(5)

(302, 37)


Unnamed: 0,coamorphous_system,molecular_formula_drug,smiles_drug,molecular_formula_coformer,smiles_coformer,ratio_part_drug,ratio_part_coformer,molecular_weight_drug,molecular_weight_coformer,pka_drug_acid,...,bond_energy,density,cohesive_energy_density,solubility_parameter,inter_potential_energy,inter_van_der_waals_energy,inter_electrostatic_energy,total_potential_energy,total_van_der_waals_energy,total_electrostatic_energy
0,LCD-ENL,C26H33NO6,CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2/C=C/C(=O)OC(C...,C20H28N2O5,CCOC(=O)[C@H](CCC1=CC=CC=C1)N[C@@H](C)C(=O)N2C...,2,1.0,455.5,376.4,19.47,...,721.057794,0.976339,262106900.0,16.19,-826.14,-718.64,-62.7,-2037.56,-195.713817,-696.01
1,LDC-FLB_1,C14H22N2O,CCN(CC)CC(=O)NC1=C(C=CC=C1C)C,C15H13FO2,CC(C1=CC(=C(C=C1)C2=CC=CC=C2)F)C(=O)O,1,9.0,234.34,244.26,13.78,...,89.866847,0.999858,319158300.0,17.865003,-185.59671,-147.524713,-30.217949,-309.16217,-55.252734,-226.224629
2,LDC-FLB_2,C14H22N2O,CCN(CC)CC(=O)NC1=C(C=CC=C1C)C,C15H13FO2,CC(C1=CC(=C(C=C1)C2=CC=CC=C2)F)C(=O)O,3,7.0,234.34,244.26,13.78,...,104.416914,0.977514,320066900.0,17.8904,-188.8264,-147.6108,-33.211,-228.3823,-56.5217,-190.9173
3,LDC-FLB_3,C14H22N2O,CCN(CC)CC(=O)NC1=C(C=CC=C1C)C,C15H13FO2,CC(C1=CC(=C(C=C1)C2=CC=CC=C2)F)C(=O)O,1,1.0,234.34,244.26,13.78,...,95.303943,0.952748,329275100.0,18.145938,-197.669369,-151.414404,-38.125725,-152.916062,-59.532319,-142.752012
4,LDC-FLB_4,C14H22N2O,CCN(CC)CC(=O)NC1=C(C=CC=C1C)C,C15H13FO2,CC(C1=CC(=C(C=C1)C2=CC=CC=C2)F)C(=O)O,7,3.0,234.34,244.26,13.78,...,104.397933,0.920987,307324000.0,17.5307,-189.2715,-150.2977,-30.7895,-75.3383,-63.8401,-97.3417


In [None]:
# Identifier and structure-string columns that are removed before modelling.
drop_cols = [
    'coamorphous_system',
    'molecular_formula_drug',
    'smiles_drug',
    'molecular_formula_coformer',
    'smiles_coformer',
]

# One chained pass: remove the columns above (ignoring any that are absent),
# discard rows without a target value, then discard rows with any remaining NaN.
df = (
    df.drop(columns=drop_cols, errors='ignore')
    .dropna(subset=['experimental_tg'])
    .dropna()
)

In [None]:
# Target: the experimental Tg column.
y = df['experimental_tg']

# Features: every other remaining column, cast to float in the same step.
X = df.drop(columns=['experimental_tg']).astype(float)

In [None]:
# First split: 80 % training, 20 % held back.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Second split: the held-back 20 % is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Fit a 200-tree random forest on the training split (fit() returns the estimator itself).
rf = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# Predictions for the held-out test split; reused by the cells below.
rf_preds = rf.predict(X_test)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(y_true, y_pred, name="Model"):
    """Print R², MAE and RMSE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    name : str
        Label printed in the header line.

    Returns
    -------
    None
        The metrics are only printed, each with four decimals.
    """
    print(f"{name} Evaluation")
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"R² Score: {r2:.4f}")
    print(f"MAE     : {mae:.4f}")
    print(f"RMSE    : {rmse:.4f}")
    print("")

# Report test-set performance of the fitted random forest.
evaluate_model(y_test, rf_preds, "Random Forest")

In [None]:
import matplotlib.pyplot as plt

def plot_predictions(y_true, y_pred, title="Prediction vs Actual"):
    """Scatter predicted against actual values with a dashed identity line.

    Parameters
    ----------
    y_true : object exposing ``.min()`` / ``.max()`` (e.g. a pandas Series)
        Observed values, drawn on the x-axis.
    y_pred : array-like
        Predicted values, drawn on the y-axis.
    title : str
        Figure title.
    """
    lower, upper = y_true.min(), y_true.max()

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(y_true, y_pred, color='teal', alpha=0.7, edgecolor='k')
    # Red dashed y = x reference spanning the observed range.
    ax.plot([lower, upper], [lower, upper], 'r--', lw=2)
    ax.set_xlabel("Actual Tg")
    ax.set_ylabel("Predicted Tg")
    ax.set_title(title)
    ax.grid(True)
    fig.tight_layout()
    plt.show()

plot_predictions(y_test, rf_preds, "Random Forest: Predicted vs Actual Tg")

In [None]:
import pandas as pd

# Pair every test target (re-indexed 0..n-1) with its random-forest prediction.
results_df = (
    y_test.reset_index(drop=True)
    .to_frame('Actual_Tg')
    .assign(Predicted_Tg_RF=rf_preds)
)

# Write the table to an Excel workbook without the index column.
results_df.to_excel("rf_tg_predictions.xlsx", index=False)
print("✅ Results saved to rf_tg_predictions.xlsx")

In [None]:
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt

# Persist the test-set predictions next to the true values.
results_df = (
    y_test.reset_index(drop=True)
    .to_frame('Actual_Tg')
    .assign(Predicted_Tg_RF=rf_preds)
)
results_df.to_excel("rf_tg_predictions.xlsx", index=False)

# Serialize the fitted forest.
joblib.dump(rf, "rf_model.pkl")

# Write the held-out features and the held-out target to their own workbooks.
X_test_df = pd.DataFrame(X_test, columns=X.columns)
X_test_df.to_excel("X_test_rf.xlsx", index=False)
y_test.to_excel("y_test_rf.xlsx", index=False)

# Rank the features by importance, most important first.
importances = rf.feature_importances_
feat_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart saved to disk; the figure is closed instead of shown.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feat_df['Feature'], feat_df['Importance'], color='teal')
ax.set_xlabel("Importance")
ax.set_title("Random Forest Feature Importance")
ax.invert_yaxis()  # largest importance at the top
fig.tight_layout()
fig.savefig("rf_feature_importance.png")
plt.close(fig)

print("✅ All components saved.")

In [None]:
import pandas as pd

# Same actual-vs-predicted table as before, but keeping y_test's original index;
# the index is not written to the workbook.
results_df = pd.DataFrame({'Actual_Tg': y_test}).assign(Predicted_Tg_RF=rf_preds)
results_df.to_excel("tg_predictions_rf.xlsx", index=False)
print("✅ Exported: tg_predictions_rf.xlsx")

In [None]:
import matplotlib.pyplot as plt

# Importances from the fitted forest, with indices ordered from most to least important.
importances = rf.feature_importances_
feature_names = X_train.columns
sorted_idx = np.argsort(importances)[::-1]
positions = range(len(importances))

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(positions, importances[sorted_idx], align='center')
ax.set_yticks(positions)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_xlabel("Feature Importance")
ax.set_title("Random Forest Feature Importance")
ax.invert_yaxis()  # most important feature on top
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import r2_score

# Compare goodness of fit on the data the forest was trained on with the held-out test set.
r2_train = r2_score(y_train, rf.predict(X_train))
r2_test = r2_score(y_test, rf_preds)

plt.figure(figsize=(6, 5))
plt.bar(["Train", "Test"], [r2_train, r2_test], color=['skyblue', 'lightgreen'])
plt.ylabel("R² Score")
plt.title("R² Comparison - Train vs Test (RF)")
# R² can be negative (model worse than predicting the mean). A fixed lower limit
# of 0 would clip such a bar out of view, so extend the axis downwards when needed;
# for non-negative scores the limits remain (0, 1).
plt.ylim(min(0, r2_train, r2_test), 1)
plt.grid(True, axis='y')
plt.show()

In [None]:
# Distribution of the target over the whole cleaned dataset (25 bins).
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(y, bins=25, color="lightcoral", alpha=0.6, label='Tg Distribution')
ax.set_xlabel("Glass Transition Temperature (Tg)")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Tg Values")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
import shap

# Build a SHAP explainer for the fitted forest; X_train is passed as the
# explainer's second argument (presumably used as background data — confirm
# against the installed shap version).
explainer = shap.Explainer(rf, X_train)
# Compute SHAP values for the held-out test rows.
shap_values = explainer(X_test)

# Beeswarm summary limited to 15 displayed features.
shap.plots.beeswarm(shap_values, max_display=15)

In [None]:
from sklearn.inspection import PartialDependenceDisplay

# Column positions of the four most important features. `sorted_idx` comes from
# the feature-importance plotting cell above, so that cell must have run first.
top_features = sorted_idx[:4]
# Averaged partial-dependence curves for those features, evaluated on the test
# split with 20 grid points per feature.
PartialDependenceDisplay.from_estimator(rf, X_test, features=top_features, kind="average", grid_resolution=20)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

def evaluate_model(y_true, y_pred, name="Model"):
    """Print R², MAE and RMSE for a set of predictions.

    This definition replaces the earlier ``evaluate_model`` in the notebook; it
    prints a blank line and an emoji before the header and no trailing blank line.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    name : str
        Label printed in the header line.
    """
    print(f"\n📊 {name} Evaluation")
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"R² Score: {r2:.4f}")
    print(f"MAE     : {mae:.4f}")
    print(f"RMSE    : {rmse:.4f}")

In [None]:
# Predictions for the training, validation and test splits, in that order.
rf_preds_train, rf_preds_val, rf_preds_test = (
    rf.predict(features) for features in (X_train, X_val, X_test)
)

# Print the metric report for each split.
for split_title, split_truth, split_preds in (
    ("Random Forest - Training", y_train, rf_preds_train),
    ("Random Forest - Validation", y_val, rf_preds_val),
    ("Random Forest - Test", y_test, rf_preds_test),
):
    evaluate_model(split_truth, split_preds, split_title)

In [None]:
import pandas as pd

# (label, true values, predictions) for each split, in report order.
split_results = [
    ('Train', y_train, rf_preds_train),
    ('Validation', y_val, rf_preds_val),
    ('Test', y_test, rf_preds_test),
]

# One row per split with its R², MAE and RMSE.
results_df = pd.DataFrame({
    'Set': [label for label, _, _ in split_results],
    'R2': [r2_score(truth, preds) for _, truth, preds in split_results],
    'MAE': [mean_absolute_error(truth, preds) for _, truth, preds in split_results],
    'RMSE': [
        np.sqrt(mean_squared_error(truth, preds)) for _, truth, preds in split_results
    ],
})

results_df.to_excel("model_performance_all_sets.xlsx", index=False)
print("✅ Model performance saved to model_performance_all_sets.xlsx")

In [None]:
import matplotlib.pyplot as plt

# Split labels and the metric columns from the performance table.
sets = ['Train', 'Validation', 'Test']
r2_vals = results_df['R2']
mae_vals = results_df['MAE']
rmse_vals = results_df['RMSE']

x = np.arange(len(sets))  # one group position per split
width = 0.25  # bar width, also the offset between bars within a group

# Three bars per split: R² to the left, MAE centred, RMSE to the right.
# NOTE: all three metrics share a single y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width, r2_vals, width, label='R²', color='steelblue')
ax.bar(x, mae_vals, width, label='MAE', color='orange')
ax.bar(x + width, rmse_vals, width, label='RMSE', color='green')

ax.set_ylabel('Metric Value')
ax.set_title('Model Performance Across Datasets')
ax.set_xticks(x)
ax.set_xticklabels(sets)
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

In [None]:
def tg_range_category(tg):
    """Bucket a Tg value into "Low", "Medium" or "High".

    Values up to and including 450 are "Low", values up to and including 1150
    are "Medium", everything else (including NaN, which fails both
    comparisons) is "High".
    NOTE(review): the thresholds are in the same unit as the Tg column —
    confirm that 450 / 1150 match the range of the data.
    """
    return "Low" if tg <= 450 else ("Medium" if tg <= 1150 else "High")

# Actual vs predicted test values (index reset to 0..n-1), each row tagged with
# the Tg range of its actual value.
results_df = (
    y_test.reset_index(drop=True)
    .to_frame('Actual_Tg')
    .assign(Predicted_Tg=rf_preds)
    .assign(Tg_Range=lambda frame: frame["Actual_Tg"].apply(tg_range_category))
)

# Evaluate performance in each range
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Print R², MAE and RMSE separately for each Tg range.
for group, df_group in results_df.groupby("Tg_Range"):
    actual_tg, predicted_tg = df_group["Actual_Tg"], df_group["Predicted_Tg"]
    r2 = r2_score(actual_tg, predicted_tg)
    mae = mean_absolute_error(actual_tg, predicted_tg)
    rmse = np.sqrt(mean_squared_error(actual_tg, predicted_tg))
    print(f"📊 {group} Tg range:")
    print(f"R²   = {r2:.4f}")
    print(f"MAE  = {mae:.4f}")
    print(f"RMSE = {rmse:.4f}\n")
# groupby only yields the ranges that actually occur in the test set, so fewer
# than three ranges may be printed.

In [None]:
import matplotlib.pyplot as plt

# Signed residuals: positive means the model over-predicted.
errors = results_df["Predicted_Tg"] - results_df["Actual_Tg"]

fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(results_df["Actual_Tg"], errors, alpha=0.7, color='purple')
ax.axhline(0, color='red', linestyle='--')  # zero-error reference
ax.set_xlabel("Actual Tg")
ax.set_ylabel("Prediction Error (Predicted - Actual)")
ax.set_title("Residual Plot")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Order the test rows by their actual Tg so both curves share a monotone x-axis.
sorted_df = results_df.sort_values("Actual_Tg").reset_index(drop=True)

fig, ax = plt.subplots(figsize=(8,5))
ax.plot(sorted_df["Actual_Tg"], label="Actual Tg", linewidth=2)
ax.plot(sorted_df["Predicted_Tg"], label="Predicted Tg", linewidth=2, linestyle='--')
ax.set_xlabel("Sample Index (Sorted by Actual Tg)")
ax.set_ylabel("Tg")
ax.set_title("Actual vs Predicted Tg (Sorted)")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Per-sample absolute error of the random-forest prediction.
results_df["Absolute_Error"] = abs(results_df["Actual_Tg"] - results_df["Predicted_Tg"])

# `smiles_drug` was dropped from `df` during preprocessing, so checking `df`
# alone would never attach the SMILES column. Fall back to re-reading the source
# workbook: it is loaded with the same default index, and the preprocessing
# (drop/dropna) keeps row labels, so y_test.index still identifies the rows.
smiles_source = df if "smiles_drug" in df.columns else pd.read_excel("aiml_tg_prediction.xlsx")
if "smiles_drug" in smiles_source.columns and y_test.index.isin(smiles_source.index).all():
    # reset_index aligns the looked-up values positionally with results_df (0..n-1).
    results_df["SMILES"] = smiles_source.loc[y_test.index, "smiles_drug"].reset_index(drop=True)

results_df.to_csv("tg_results_detailed.csv", index=False)
print("✅ Exported: tg_results_detailed.csv")

In [None]:
import shap
# Here the explainer is given the model's predict function rather than the
# estimator object, again with X_train as its second argument.
# NOTE(review): this presumably selects a model-agnostic explainer, which may be
# slower than explaining `rf` directly — confirm.
explainer = shap.Explainer(rf.predict, X_train)
# SHAP values for the held-out test rows.
shap_values = explainer(X_test)

# Bar-type summary plot of the SHAP values.
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
# Signed residuals as Predicted - Actual, the same convention as the residual
# scatter plot earlier in the notebook (the previous Actual - Predicted order
# mirrored this histogram relative to that plot).
errors = rf_preds - y_test
plt.hist(errors, bins=20, color='gray')
plt.title("Residual Distribution")
# State the sign convention on the axis so over- and under-prediction are unambiguous.
plt.xlabel("Prediction Error (Predicted - Actual)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Actual (x) vs predicted (y) test values with a fitted regression line and a
# 95 % confidence band.
sns.regplot(x=y_test, y=rf_preds, ci=95)

In [None]:
import joblib
# Serialize the fitted forest a second time under the name read by the first
# AA_CAM prediction cell further down.
joblib.dump(rf, 'final_rf_model.pkl')

In [None]:
# A bare plt.savefig() in its own cell has no drawn figure to save: under the
# notebook's inline backend, figures from earlier cells are closed once shown,
# so the file would contain a blank canvas. Draw the figure in this cell and
# save that explicit figure object.
# NOTE(review): assumes the intended image is the actual-vs-predicted regression
# scatter from the regplot cell above — confirm.
fig, ax = plt.subplots()
sns.regplot(x=y_test, y=rf_preds, ci=95, ax=ax)
ax.set_xlabel("Actual Tg")
ax.set_ylabel("Predicted Tg")
fig.savefig("rf_scatter_plot.png", dpi=600)

In [None]:
import joblib

# Persist the ordered list of feature column names (taken from X) so that new
# input tables can later be aligned to the column layout the model was fitted on.
joblib.dump(X.columns.tolist(), "rf_model_columns.pkl")

In [None]:
# Reload the saved column list. This rebinds `feature_names`, which an earlier
# cell set to X_train.columns, to a plain Python list.
feature_names = joblib.load("rf_model_columns.pkl")

In [None]:
import pandas as pd
import joblib

# Load the persisted model and the ordered feature names it was trained with.
# NOTE: joblib files are pickles — only load files produced by this notebook.
model = joblib.load("final_rf_model.pkl")
feature_names = joblib.load("rf_model_columns.pkl")

# Load the real coamorphous system data (AA_CAM)
df_aa = pd.read_excel("ai_ml_tg_prediction_aa_cam.xlsx")

# Training features that are absent from the input are zero-filled by reindex().
# Report them so a schema mismatch does not silently turn into predictions.
missing_features = [col for col in feature_names if col not in df_aa.columns]
if missing_features:
    print(f"WARNING: {len(missing_features)} training feature(s) missing from input, filled with 0: {missing_features}")

# Align to the training column order and coerce everything to numeric.
X_aa = df_aa.reindex(columns=feature_names, fill_value=0)
X_aa = X_aa.apply(pd.to_numeric, errors="coerce")

# Cells that are empty or non-numeric become NaN above; count them before
# replacing with 0 so the substitution is visible.
n_filled = int(X_aa.isna().sum().sum())
if n_filled:
    print(f"WARNING: {n_filled} missing or non-numeric value(s) replaced with 0")
X_aa = X_aa.fillna(0)

# Predict Tg
df_aa["Predicted_Tg"] = model.predict(X_aa)

# Save results
df_aa.to_excel("Predicted_Tg_aa_cam.xlsx", index=False)
print("✅ Predicted Tg values saved to Predicted_Tg_aa_cam.xlsx")

In [None]:
import matplotlib.pyplot as plt

# Distribution of the Tg values predicted for the AA_CAM systems (20 bins).
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df_aa["Predicted_Tg"], bins=20, color="skyblue", edgecolor="black")
ax.set_xlabel("Predicted Tg")
ax.set_ylabel("Frequency")
ax.set_title("Predicted Glass Transition Temperature Distribution")
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import joblib

# Load the training dataset
df_train = pd.read_excel("aiml_tg_prediction.xlsx")

# Drop non-feature columns (adjust if needed): the identifier / SMILES columns
# removed during preprocessing, plus the target.
non_feature_cols = [
    "coamorphous_system", "molecular_formula_drug", "smiles_drug",
    "molecular_formula_coformer", "smiles_coformer", "experimental_tg"
]

# Only the column names are needed here. They are kept in their own variable so
# that `X_train` (the training split used by the evaluation and SHAP cells) is
# not overwritten with the full dataset, and no numeric coercion or zero-filling
# is performed because it does not affect the names.
feature_columns = df_train.drop(columns=non_feature_cols, errors="ignore").columns.tolist()

# Save the column names used during training (overwrites any earlier copy of the file).
joblib.dump(feature_columns, "rf_model_columns.pkl")
print("✅ Saved feature column names as rf_model_columns.pkl")

In [None]:
import pandas as pd
import joblib

# Load model and training column names
# NOTE: joblib files are pickles — only load files produced by this notebook.
model = joblib.load("rf_model.pkl")
feature_names = joblib.load("rf_model_columns.pkl")

# Load real coamorphous input (aa_cam). Dedicated names are used so the
# notebook-wide `df` and `X` (training data) are not overwritten by this cell.
df_aa_desc = pd.read_excel("ai_ml_tg_prediction_aa_cam.xlsx")

# Training features that are absent from the input are zero-filled by reindex().
# Report them so a schema mismatch does not silently turn into predictions.
missing_features = [col for col in feature_names if col not in df_aa_desc.columns]
if missing_features:
    print(f"WARNING: {len(missing_features)} training feature(s) missing from input, filled with 0: {missing_features}")

# Ensure features match the training layout and are numeric.
X_aa_desc = df_aa_desc.reindex(columns=feature_names, fill_value=0)
X_aa_desc = X_aa_desc.apply(pd.to_numeric, errors="coerce")

# Count cells that could not be read as numbers before replacing them with 0.
n_filled = int(X_aa_desc.isna().sum().sum())
if n_filled:
    print(f"WARNING: {n_filled} missing or non-numeric value(s) replaced with 0")
X_aa_desc = X_aa_desc.fillna(0)

# Predict Tg
df_aa_desc["Predicted_Tg"] = model.predict(X_aa_desc)

# Export
df_aa_desc.to_excel("Predicted_Tg_aa_cam_descriptors.xlsx", index=False)
print("✅ Tg predictions saved to: Predicted_Tg_aa_cam_descriptors.xlsx")

In [None]:
import pandas as pd
import joblib

# Load your trained model and feature column names
# NOTE: joblib files are pickles — only load files produced by this notebook.
model = joblib.load("rf_model.pkl")  # or "final_rf_model.pkl" — depending on what you last saved
feature_names = joblib.load("rf_model_columns.pkl")

# Load the updated AA_CAM descriptor Excel file.
# The names `df` and `X` are kept because the export cell below reads `df`;
# be aware that they replace the training `df` / `X` from the top of the notebook.
df = pd.read_excel("ai_ml_tg_prediction_aa_cam.xlsx")

# Training features that are absent from the input are zero-filled by reindex().
# Report them so a schema mismatch does not silently turn into predictions.
missing_features = [col for col in feature_names if col not in df.columns]
if missing_features:
    print(f"WARNING: {len(missing_features)} training feature(s) missing from input, filled with 0: {missing_features}")

# Reorder and clean columns to match model input
X = df.reindex(columns=feature_names, fill_value=0)
X = X.apply(pd.to_numeric, errors="coerce")

# Count cells that could not be read as numbers before replacing them with 0.
n_filled = int(X.isna().sum().sum())
if n_filled:
    print(f"WARNING: {n_filled} missing or non-numeric value(s) replaced with 0")
X = X.fillna(0)

df["Predicted_Tg"] = model.predict(X)

In [None]:
# Write the AA_CAM table, including the Predicted_Tg column added in the
# previous cell, to a workbook without the index column.
df.to_excel("Predicted_Tg_aa_cam_descriptors2.xlsx", index=False)