In [1]:
import sys
import warnings

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

# Add ./main_codes to the import path so the helper modules imported
# further down (data_processing, regression_model) can be found.
sys.path.append(r'./main_codes')


In [2]:
# Import required library
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from sklearn.model_selection import KFold
from data_processing import pre_process_data, pre_process_data_no_split
from regression_model import train_evaluate_models, regressors, param_grids

# Read Data

In [3]:
# Load the formatted dataset, then discard the raw dimension columns
# that are not used anywhere in this notebook.
data = pd.read_excel('./data/Formatted_Data_v2.xlsx')
data = data.drop(columns=['Width1', 'Width2', 'Diameter', 'Height', 'CAD_Volume'])

In [None]:
# Columns that remain after the dimension columns were dropped
data.columns

In [5]:
# Split the table into model inputs and targets.
# Inputs: everything except the three measured response columns.
X = data.drop(columns=['Sample_Apparent_Density', 'Sample_Relative_Density', 'Sample_Shrinkage'])

# Targets: relative density and shrinkage (apparent density is not modelled here).
y_rel_den, y_sam_shrink = data['Sample_Relative_Density'], data['Sample_Shrinkage']

In [None]:
# Input columns left once the three response columns are removed
X.columns

In [7]:
# All model inputs
columns_to_use = [
    'Actual_Volume', 'Sample_Size', 'Sample_Saturation',
    'Sample_Delay', 'Sample_Prints', 'Sample_Geometry',
]

# Inputs handled as categories by the preprocessing step
categorical_features = ['Sample_Size', 'Sample_Prints', 'Sample_Geometry']

# Inputs handled as numeric values by the preprocessing step
numerical_features = ['Actual_Volume', 'Sample_Saturation', 'Sample_Delay']

In [None]:
# Preview the first five rows (head() defaults to 5)
data.head()

# Data Pre-processing

In [9]:
# Feature order handed to the preprocessing helpers: numerical columns first, then categorical
reordered_features = [*numerical_features, *categorical_features]

# Relative Density [y_rel_den]

In [10]:
# Preprocess the whole dataset (no train/test split) with relative density as the target.
# The returned pipeline is inspected in the following cells.
X_train_transformed, y_train, preprocessor_pipeline = pre_process_data_no_split(
    X = X,
    y = y_rel_den,
    numerical_features = numerical_features,
    categorical_features = categorical_features,
    columns_to_use = reordered_features,
)

In [None]:
# Show the fitted transformers held by the preprocessing pipeline
preprocessor_pipeline.transformers_

In [None]:
# Look up the fitted transformer registered under the name 'categorical'
ordinal_encoder = preprocessor_pipeline.named_transformers_['categorical']

# One array of category labels per encoded column
category_mapping = ordinal_encoder.categories_

# Print "<feature>: [(code, label), ...]" for each categorical feature.
# Pairing is positional — assumes the encoder saw the columns in the
# order of `categorical_features` (TODO confirm in data_processing).
for feature, categories in zip(categorical_features, category_mapping):
    code_label_pairs = list(enumerate(categories))
    print(f"{feature}: {code_label_pairs}")

# Define Model Training Function

In [13]:
def perform_cross_validate(X, y, numerical_features, categorical_features, reordered_features, filename, random_state = 420, folds = 5, shuffle = False):
    """Cross-validate every configured regressor with K-fold and save the averaged MSE.

    The data is preprocessed once with ``pre_process_data_no_split``, split into
    ``folds`` folds, and ``train_evaluate_models`` is run on each fold with the
    module-level ``regressors`` and ``param_grids``. The per-fold ``'MSE'`` columns
    are averaged per row of the results table and written to a timestamped CSV
    under ``./results``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features, forwarded to ``pre_process_data_no_split``.
    y : pandas.Series
        Target values, forwarded to ``pre_process_data_no_split``.
    numerical_features : list of str
        Names of the numerical columns.
    categorical_features : list of str
        Names of the categorical columns.
    reordered_features : list of str
        Forwarded to the preprocessing helper as ``columns_to_use``.
    filename : str
        Tag inserted into the name of the output CSV.
    random_state : int or None, default 420
        Seed for shuffling the folds. Only used when ``shuffle`` is True;
        ``KFold`` raises ``ValueError`` if a seed is given without shuffling,
        so the seed is dropped in that case.
    folds : int, default 5
        Number of K-fold splits.
    shuffle : bool, default False
        Whether to shuffle the rows before splitting.

    Returns
    -------
    pandas.DataFrame
        The first column of the first fold's results table (presumably the
        model name — confirm against ``train_evaluate_models``), followed by
        ``Average_MSE`` and ``std`` (sample standard deviation across folds).
    """
    from pathlib import Path

    # NOTE(review): the preprocessor is fitted on the full dataset before the
    # folds are split, so validation rows take part in fitting the transformers.
    # Confirm this is acceptable for the reported scores.
    X_transformed, y_all, _ = pre_process_data_no_split(numerical_features = numerical_features,
                                                        categorical_features = categorical_features,
                                                        X = X,
                                                        y = y,
                                                        columns_to_use = reordered_features)

    # KFold rejects a random_state when shuffle is False, which made the
    # function's own defaults (shuffle=False, random_state=420) fail.
    kfold = KFold(n_splits=folds,
                  shuffle=shuffle,
                  random_state=random_state if shuffle else None)

    # Collect one results table per fold and concatenate once after the loop
    fold_results = []
    for train_idx, val_idx in tqdm(kfold.split(X_transformed), total=folds, desc = 'Fold'):
        # X_transformed and y_all are pandas objects, so index positionally
        X_fold_train, X_fold_val = X_transformed.iloc[train_idx], X_transformed.iloc[val_idx]
        y_fold_train, y_fold_val = y_all.iloc[train_idx], y_all.iloc[val_idx]

        # Train and evaluate all models on this fold
        results, _ = train_evaluate_models(regressors,
                                           param_grids,
                                           X_fold_train,
                                           y_fold_train,
                                           X_fold_val,
                                           y_fold_val)
        fold_results.append(results)

    # Side-by-side tables: selecting 'MSE' yields one column per fold
    all_folds = pd.concat(fold_results, axis = 1)
    mse_per_fold = all_folds['MSE']

    # Row-wise statistics across the folds
    average_mse_df = mse_per_fold.mean(axis=1).to_frame('Average_MSE').reset_index(drop=True)
    average_std_df = mse_per_fold.std(axis=1).to_frame('std').reset_index(drop=True)

    # First column of the first fold's table next to the aggregated scores
    result = pd.concat([
                        all_folds.iloc[:, 0:1],
                        average_mse_df,
                        average_std_df
                        ],
                        axis=1)

    # Timestamp keeps successive runs from overwriting each other
    date_str = datetime.now().strftime("%B_%d_%Y_%H_%M")

    # Make sure the output directory exists before writing
    Path('./results').mkdir(parents=True, exist_ok=True)
    result.to_csv(f'./results/model_average_performance_{filename}_{date_str}.csv')

    return result

In [None]:
# Cross-validate all regressors against relative density.
# NOTE(review): the shuffle seed here is 400, whereas the shrinkage run uses 420 —
# confirm the difference is intentional.
result_rel_den = perform_cross_validate(
    X = X,
    y = y_rel_den,
    numerical_features = numerical_features,
    categorical_features = categorical_features,
    reordered_features = reordered_features,
    filename = 'relative_density',
    folds = 5,
    shuffle = True,
    random_state = 400,  # seed for the shuffle; only meaningful because shuffle = True
)

In [None]:
# Rank the models for relative density, lowest average MSE first
result_rel_den.sort_values(by='Average_MSE', ascending=True)

In [None]:
# Cross-validate all regressors against sample shrinkage.
result_sam_shrink = perform_cross_validate(
    X = X,
    y = y_sam_shrink,
    numerical_features = numerical_features,
    categorical_features = categorical_features,
    reordered_features = reordered_features,
    filename = 'sample_shrinkage',
    folds = 5,
    shuffle = True,
    random_state = 420,  # seed for the shuffle; only meaningful because shuffle = True
)

In [None]:
# Rank the models for sample shrinkage, lowest average MSE first
result_sam_shrink.sort_values(by='Average_MSE', ascending=True)

# Model Explanation

### Model Explanation for Relative Density

In [15]:
# Train/test split (test_size = 0.3) used for the relative-density explanation.
# `reordered_features` (numerical first, then categorical) is passed as the column
# list, matching the cross-validation preprocessing calls. Presumably the helper
# labels the transformed columns in the order given, so the original-order
# `columns_to_use` could mislabel features in the SHAP plots — TODO confirm
# against data_processing.pre_process_data.
X_train_transformed_rel_den, X_test_transformed_rel_den, y_train_rel_den, y_test_rel_den, preprocessor_pipeline = pre_process_data(
    numerical_features,
    categorical_features,
    X,
    y_rel_den,
    reordered_features,
    random_state = 420,
    test_size = 0.3,
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import shap

# Fit the bagged decision-tree regressor whose predictions are explained below.
# NOTE(review): max_features = 1 is an int, which scikit-learn reads as "draw one
# feature per estimator" (1.0 would mean all features) — confirm this is intended.
base_estimator = DecisionTreeRegressor(max_depth=3)
model = BaggingRegressor(
    estimator=base_estimator,
    n_estimators = 100,
    max_samples = 0.5,
    max_features = 1,
    random_state = 420  # fixed seed: without it each run draws a different ensemble and the plots change
)
model.fit(X_train_transformed_rel_den, y_train_rel_den)

# The whole training set is used as the KernelExplainer background data
background_data = X_train_transformed_rel_den

# Model-agnostic explainer: it only needs the model's predict function
explainer = shap.KernelExplainer(model.predict, background_data)

# Explain every training row; calling the explainer returns an Explanation object
display_data = X_train_transformed_rel_den
shap_display = explainer(display_data)

# Visualizations
# Summary plot
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_display.values, display_data)

# Feature importance bar plot
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_display.values, display_data, plot_type="bar")

# Heatmap of SHAP values
plt.figure(figsize=(12, 8))
shap.plots.heatmap(shap_display)

# Dependence plot for most important feature
# feature_importance = np.abs(shap_display.values).mean(0)
# most_important_feature = np.argmax(feature_importance)
# plt.figure(figsize=(10, 6))
# shap.dependence_plot(most_important_feature, shap_display.values, display_data)

# Individual prediction explanation for the first row.
# With matplotlib=True, force_plot creates its own figure, so no plt.figure() call
# is made here (one would only be rendered as an extra empty figure). Pass
# figsize=(w, h) to force_plot to change the size.
shap.force_plot(
                explainer.expected_value,
                shap_display.values[0],
                display_data.iloc[0],
                matplotlib=True,
                show=False
                )
plt.tight_layout()
plt.show()

### Model Explanation for Sample Shrinkage

In [13]:
# Train/test split (test_size = 0.3) used for the sample-shrinkage explanation.
# `reordered_features` (numerical first, then categorical) is passed as the column
# list, matching the cross-validation preprocessing calls. Presumably the helper
# labels the transformed columns in the order given, so the original-order
# `columns_to_use` could mislabel features in the SHAP plots — TODO confirm
# against data_processing.pre_process_data.
X_train_transformed_sam_shrink, X_test_transformed_sam_shrink, y_train_sam_shrink, y_test_sam_shrink, preprocessor_pipeline = pre_process_data(
    numerical_features,
    categorical_features,
    X,
    y_sam_shrink,
    reordered_features,
    random_state = 420,
    test_size = 0.3,
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import shap

# Fit the linear model whose predictions are explained below
model = LinearRegression()
model.fit(X_train_transformed_sam_shrink, y_train_sam_shrink)

# LinearExplainer is used because the model is linear; the training set is its background data
explainer = shap.LinearExplainer(model, X_train_transformed_sam_shrink)

# Explain the first 50 training rows only (explicit positional slice)
display_data = X_train_transformed_sam_shrink.iloc[:50]
shap_display = explainer(display_data)

# Visualizations
# Summary plot
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_display.values, display_data)

# Feature importance bar plot
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_display.values, display_data, plot_type="bar")

# Heatmap of SHAP values
plt.figure(figsize=(12, 8))
shap.plots.heatmap(shap_display)

# # Dependence plot for most important feature
# feature_importance = np.abs(shap_display.values).mean(0)
# most_important_feature = np.argmax(feature_importance)
# plt.figure(figsize=(10, 6))
# shap.dependence_plot(most_important_feature, shap_display.values, display_data)

# Individual prediction explanation for the first displayed row.
# With matplotlib=True, force_plot creates its own figure, so no plt.figure() call
# is made here (one would only be rendered as an extra empty figure). Pass
# figsize=(w, h) to force_plot to change the size.
shap.force_plot(
    explainer.expected_value,
    shap_display.values[0],
    display_data.iloc[0],
    matplotlib=True,
    show=False
)
plt.tight_layout()
plt.show()

# Feature importance summary: mean absolute SHAP value per feature over the displayed rows
feature_importance_df = pd.DataFrame({
    'Feature': X_train_transformed_sam_shrink.columns,
    'Importance': np.abs(shap_display.values).mean(0)
})
print("\nFeature Importance Summary:")
print(feature_importance_df.sort_values('Importance', ascending=False))