In [11]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.inspection import permutation_importance
import joblib
import warnings

# Suppress every warning category for the remainder of the run.
warnings.filterwarnings(action='ignore')

# ----- CONFIG -----
# Excel workbook read by the LOAD step.
DATA_PATH = "Dataset for model feed - MgO C.xlsx"
# Directory that receives the saved model, CSV reports and plots.
OUT_DIR = "model_outputs"
# Seed passed to the train/test split, the forest, the search and the
# permutation-importance run.
RANDOM_STATE = 42
# Fraction of rows held out as the test set.
TEST_SIZE = 0.2
# Cross-validation fold count (declared here as a configuration value).
CV_FOLDS = 5
# Create the output directory up front; an already existing one is reused.
os.makedirs(name=OUT_DIR, exist_ok=True)

# ----- LOAD -----
# Read the workbook into `df`. A missing file is fatal for the rest of the
# script, so report it and stop with a non-zero exit status.
try:
    df = pd.read_excel(DATA_PATH)
except FileNotFoundError:
    print(f"Error: The file '{DATA_PATH}' was not found.")
    # `exit()` is an interactive helper injected by the `site` module; it is
    # not guaranteed to exist and terminates with status 0. Raising SystemExit
    # directly always works and signals the failure to the caller.
    raise SystemExit(1) from None

# ----- STANDARDIZE COLUMN NAMES -----
def standardize_columns(df):
    """Normalise the column labels of ``df`` in place and return it.

    Each label is converted to text, stripped of surrounding whitespace,
    lower-cased, and then rewritten so that ``%`` becomes ``_pct`` and both
    spaces and hyphens become underscores.

    Args:
        df: DataFrame whose ``columns`` attribute is overwritten.

    Returns:
        The same DataFrame object, with the normalised labels.
    """
    # Cast to str first: the `.str` accessor raises AttributeError on an
    # all-numeric header and produces NaN labels for mixed headers.
    cols = df.columns.astype(str).str.strip().str.lower()
    # Order matters: '%' is expanded before whitespace/hyphens are replaced.
    for old, new in (('%', '_pct'), (' ', '_'), ('-', '_')):
        cols = cols.str.replace(old, new, regex=False)
    df.columns = cols
    return df

# Normalise the spreadsheet headers before any column is referenced by name.
df = standardize_columns(df)
print("Standardized columns:", list(df.columns))


# ----- DEFINE FEATURES AND TARGETS -----
# Measured properties predicted jointly by the model.
target_cols = [
    'porosity_pct',
    'density_g_cm3',
    'thermal_conductivity_w_mk',
    'oxidation_mass_loss_pct',
    'oxidation_penetration_mm',
    'hot_mor_mpa',
    'slag_contact_angle_deg',
    'residual_strength_pct_after_shock',
]
# Columns kept out of the generic feature list.
id_cols = [
    'sample_id',
    'setting',
    'dominant_carbon_source',
]
# Everything that is neither a target nor an id column is a feature; the
# original column order of `df` is preserved.
excluded_cols = (*target_cols, *id_cols)
feature_cols = [c for c in df.columns if c not in excluded_cols]
print(f"\nModel will be trained on these {len(feature_cols)} features:\n", feature_cols)

# ----- STRATIFIED SPLIT -----
# Hold out TEST_SIZE of the rows, stratified on 'dominant_carbon_source'.
train_df, test_df = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=df['dominant_carbon_source'],
)
# Model inputs: the feature columns plus the carbon-source column.
model_input_cols = feature_cols + ['dominant_carbon_source']
X_train = train_df[model_input_cols].copy()
X_test = test_df[model_input_cols].copy()
y_train = train_df[target_cols].copy()
y_test = test_df[target_cols].copy()
# Persist the untouched hold-out rows so results can be re-checked later.
test_set_path = os.path.join(OUT_DIR, "test_set_for_verification.csv")
test_df.to_csv(test_set_path, index=False)

# ----- PREPROCESSING -----
# Select numeric feature columns by dtype family rather than by the exact
# names 'float64'/'int64', so int32, float32, unsigned and similar numeric
# columns are not silently left out of the model.
numeric_features = df[feature_cols].select_dtypes(include='number').columns.tolist()
# Columns not listed in the ColumnTransformer below are discarded by it, so
# make any excluded feature column visible instead of dropping it silently.
skipped_features = [c for c in feature_cols if c not in numeric_features]
if skipped_features:
    print("Warning: non-numeric feature columns are not used by the model:", skipped_features)
categorical_features = ['dominant_carbon_source']
# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical column: fill gaps with the mode, then one-hot encode; categories
# unseen during fitting are ignored at prediction time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# ----- MODEL TRAINING -----
# Preprocessing and the random forest are chained so the search refits both
# together on every fold.
rf_pipe = Pipeline(
    steps=[
        ('pre', preprocessor),
        ('model', RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)),
    ]
)
# Candidate hyper-parameters; the 'model__' prefix routes each one to the
# forest step of the pipeline.
param_dist = {
    'model__n_estimators': [100, 200, 300, 400, 500],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}
# Evaluate 10 sampled combinations with 3-fold CV, scored by R^2.
rs = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='r2',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1,
)
print("\nRunning RandomizedSearchCV...")
rs.fit(X_train, y_train)
# Keep the best pipeline found by the search and persist it to disk.
best_model = rs.best_estimator_
model_path = os.path.join(OUT_DIR, "best_model.joblib")
joblib.dump(best_model, model_path)

# ----- PREDICT ON TEST SET & METRICS -----
# Wrap the raw prediction array so it lines up with y_test by row and column.
test_predictions = best_model.predict(X_test)
y_pred = pd.DataFrame(test_predictions, index=y_test.index, columns=y_test.columns)
# One {'r2', 'rmse'} record per target column.
metrics = {
    col: {
        'r2': r2_score(y_test[col], y_pred[col]),
        'rmse': np.sqrt(mean_squared_error(y_test[col], y_pred[col])),
    }
    for col in y_test.columns
}
# Transpose so targets become rows and the two metrics become columns.
metrics_df = pd.DataFrame(metrics).T
metrics_path = os.path.join(OUT_DIR, "test_metrics_per_target.csv")
metrics_df.to_csv(metrics_path)
print("\nTest Metrics for the New Model:\n", metrics_df)

# ----- PERMUTATION IMPORTANCE -----
# Permutation importance of each input column, measured on the hold-out set
# with 10 repeats per column.
perm = permutation_importance(
    estimator=best_model,
    X=X_test,
    y=y_test,
    n_repeats=10,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
# Label the mean importances with the input column names, largest first.
mean_importances = pd.Series(perm.importances_mean, index=X_test.columns)
imp_ser = mean_importances.sort_values(ascending=False)
importances_path = os.path.join(OUT_DIR, "permutation_importances.csv")
imp_ser.to_csv(importances_path)

# ----- PLOTS -----
# Horizontal bar chart of the 20 highest-ranked permutation importances.
top20 = imp_ser.head(20)
plt.figure(figsize=(10, 8))
sns.barplot(x=top20.values, y=top20.index)
plt.title("Top 20 Most Important Features (New Model)")
plt.tight_layout()
importance_plot_path = os.path.join(OUT_DIR, "feature_importances_top20.png")
plt.savefig(importance_plot_path)
# Close the figure once it has been written to disk.
plt.close()


### --- ADDED SECTION: Diagnostic Plots --- ###
print("\nGenerating and organizing diagnostic plots...")

# 1. Residuals vs. Predicted Plots
# One scatter plot per target, written to its own sub-folder of OUT_DIR.
residual_output_dir = os.path.join(OUT_DIR, "residual_plots")
os.makedirs(residual_output_dir, exist_ok=True)
print(f"Saving residual plots to: {residual_output_dir}")
for col in y_test.columns:
    residuals = y_test[col] - y_pred[col]
    # Use explicit Figure/Axes objects so saving and closing act on this
    # figure rather than on whatever pyplot considers "current".
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_pred[col], residuals, alpha=0.6)
    # Zero line: points above it are under-predicted, below it over-predicted.
    ax.axhline(0, color='red', linestyle='--')
    ax.set_xlabel("Predicted Values")
    ax.set_ylabel("Residuals (Actual - Predicted)")
    ax.set_title(f"Residuals vs. Predicted Plot for {col}")
    ax.grid(True)
    fig.tight_layout()
    plot_filename = os.path.join(residual_output_dir, f"residuals_vs_predicted_{col}.png")
    fig.savefig(plot_filename)
    # Release the figure so open figures do not accumulate across targets.
    plt.close(fig)

# 2. Learning Curve Plots
# For every target, refit the tuned pipeline on growing subsets of the
# training data and plot training vs. cross-validation score per metric.
lc_output_dir = os.path.join(OUT_DIR, "learning_curves")
os.makedirs(lc_output_dir, exist_ok=True)
print(f"Saving learning curve plots to: {lc_output_dir}")
# Panel title -> scikit-learn scorer name.
scoring_metrics = {'R-squared': 'r2', 'Mean Squared Error': 'neg_mean_squared_error'}
for target_col in y_train.columns:
    print(f"Generating learning curve for: {target_col}...")
    # squeeze=False keeps `axes` two-dimensional even with a single metric,
    # so the panel loop below works for any number of entries.
    fig, axes = plt.subplots(
        1, len(scoring_metrics), figsize=(12, 5), sharey=False, squeeze=False)
    fig.suptitle(f'Learning Curves for {target_col}', fontsize=16)
    for ax, (metric_name, metric_scorer) in zip(axes[0], scoring_metrics.items()):
        # Fold count comes from the shared CV_FOLDS setting instead of a
        # second hard-coded literal.
        train_sizes, train_scores, validation_scores = learning_curve(
            estimator=best_model, X=X_train, y=y_train[target_col],
            train_sizes=np.linspace(0.1, 1.0, 10), cv=CV_FOLDS,
            scoring=metric_scorer, n_jobs=-1)
        # "neg_*" scorers return negated errors; flip the sign so the plot
        # shows the error itself.
        if "neg_" in metric_scorer:
            train_scores, validation_scores = -train_scores, -validation_scores
        # Average over the CV folds (axis 1) for each training-set size.
        train_scores_mean = np.mean(train_scores, axis=1)
        validation_scores_mean = np.mean(validation_scores, axis=1)
        ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
        ax.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")
        ax.set_title(metric_name)
        ax.set_xlabel("Training Set Size")
        ax.set_ylabel(metric_name)
        ax.legend(loc="best")
        ax.grid(True)
    plot_filename = os.path.join(lc_output_dir, f"lc_{target_col}.png")
    # Reserve room at the top of the canvas for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.savefig(plot_filename)
    plt.close(fig)


print(f"\n✅ All outputs, plots and the model are saved in the '{OUT_DIR}' folder.")

Standardized columns: ['sample_id', 'setting', 'total_carbon_wt_pct', 'graphite_wt_pct', 'carbon_black_wt_pct', 'resin_wt_pct', 'pitch_wt_pct', 'graphene_wt_pct', 'cnt_wt_pct', 'gnp_wt_pct', 'antioxidant_wt_pct', 'mgo_purity_pct', 'd50_micron', 'porosity_pct', 'density_g_cm3', 'thermal_conductivity_w_mk', 'oxidation_mass_loss_pct', 'oxidation_penetration_mm', 'hot_mor_mpa', 'slag_contact_angle_deg', 'residual_strength_pct_after_shock', 'dominant_carbon_source']

Model will be trained on these 11 features:
 ['total_carbon_wt_pct', 'graphite_wt_pct', 'carbon_black_wt_pct', 'resin_wt_pct', 'pitch_wt_pct', 'graphene_wt_pct', 'cnt_wt_pct', 'gnp_wt_pct', 'antioxidant_wt_pct', 'mgo_purity_pct', 'd50_micron']

Running RandomizedSearchCV...
Fitting 3 folds for each of 10 candidates, totalling 30 fits

Test Metrics for the New Model:
                                          r2       rmse
porosity_pct                       0.310290   1.542902
density_g_cm3                      0.232537   0.06341