#### **Modeling Interfacial Energy Transport in Dispersed Solid-Liquid Nanofluids - A Machine Learning Approach**

#### **Purpose:** This project develops a data-driven surrogate model to predict convective heat transfer enhancement in multiphase (solid-liquid nanofluid) systems. By leveraging features such as the Reynolds number ($Re$) and the Prandtl number ($Pr$), the study analyzes how the dispersed solid phase interacts with the continuous liquid phase, with the aim of optimizing energy transport across the solid-liquid interface.

##### **Author:** Bello Oluwatobi

##### **Last Updated:** January 14, 2026

### #1 Importing Libraries

In [None]:
# importing necessary libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [None]:
# choosing one matplotlib style sheet for every figure drawn after this cell
PLOT_STYLE = 'seaborn-v0_8-whitegrid'
plt.style.use(PLOT_STYLE)

### #2 Loading and Cleaning the Dataset

In [None]:
# reading the nanofluid CSV into a DataFrame
# (the path is relative to the notebook's working directory)
DATASET_PATH = '../dataset/nanofluid_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [None]:
# previewing the top five rows of the raw dataset with rich display
display(df.head(n=5))

In [None]:
# checking info on the dataset columns (dtypes, non-null counts, memory usage)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
df.info()

In [None]:
# computing describe() with its default settings and showing the resulting table
summary_stats = df.describe()
summary_stats

In [None]:
# counting the missing (NaN) entries in each column
missing_per_column = df.isna().sum()
missing_per_column

### #3 Feature Engineering

In [None]:
# characteristic length: an assumed pipe diameter of 0.01 meters (10 mm)
D = 0.01

# pulling out the property columns used by the two dimensionless groups
density = df['Density (kg/m³)']
velocity = df['Flow_Velocity (m/s)']
viscosity = df['Viscosity (Pa·s)']
specific_heat = df['Specific_Heat_Capacity (J/kgK)']
conductivity = df['Thermal_Conductivity (W/mK)']

# Reynolds number: Re = (density * velocity * D) / viscosity
df['Reynolds_Number'] = (density * velocity * D) / viscosity

# Prandtl number: Pr = (specific heat * viscosity) / thermal conductivity
df['Prandtl_Number'] = (specific_heat * viscosity) / conductivity

### #4 Data Exploration

In [None]:
# restricting the frame to numeric columns before computing correlations
numeric_df = df.select_dtypes(include='number')

# pairwise correlation matrix of the numeric columns (DataFrame.corr defaults)
corr_matrix = numeric_df.corr()

# drawing the annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix of System Properties")
plt.show()

In [None]:
# scatter of heat transfer coefficient against volume fraction,
# with one colour per nanoparticle type
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Volume_Fraction (%)',
    y='Heat_Transfer_Coefficient (W/m²K)',
    hue='Nanoparticle_Type',
    alpha=0.6,
    ax=ax,
)
ax.set_title("Effect of Nanoparticle Concentration on Heat Transfer Coefficient")
ax.set_xlabel("Volume Fraction (%)")
ax.set_ylabel("HTC (W/m²K)")
ax.legend(title='Particle Type')
plt.show()

### #5 Data Preprocessing

In [None]:
# one-hot encoding the categorical columns; drop_first=True drops the first
# level of each column so the dummy columns are not perfectly collinear
categorical_columns = ['Nanoparticle_Type', 'Base_Fluid']
df_final = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [None]:
# the heat transfer coefficient is the prediction target (y);
# every other column of the encoded frame is a model input (X)
target_column = 'Heat_Transfer_Coefficient (W/m²K)'
y = df_final[target_column]
X = df_final.drop(columns=[target_column])

In [None]:
# holding out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# reporting the (rows, columns) shape of each feature matrix
for label, features in (("Training data shape:", X_train), ("Testing data shape:", X_test)):
    print(label, features.shape)

### #6 Model training and evaluation

In [None]:
# configuring a 100-tree random forest with a fixed seed, then fitting it on the training split
rf_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
# scoring the random forest on the held-out test split
rf_preds = rf_model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is in the target's units
rf_mse = mean_squared_error(y_test, rf_preds)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, rf_preds)

print("Random Forest Results:")
print(f"R2 Score: {rf_r2:.4f}")
print(f"RMSE: {rf_rmse:.4f}")

In [None]:
# configuring the gradient-boosted regressor (500 boosting rounds, learning rate 0.05,
# n_jobs=-1, fixed seed), then fitting it on the training split
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'n_jobs': -1,
    'random_state': 42,
}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

In [None]:
# scoring the XGBoost model on the held-out test split
xgb_preds = xgb_model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is in the target's units
xgb_mse = mean_squared_error(y_test, xgb_preds)
xgb_rmse = np.sqrt(xgb_mse)
xgb_r2 = r2_score(y_test, xgb_preds)

print("XGBoost Results:")
print(f"R2 Score: {xgb_r2:.4f}")
print(f"RMSE: {xgb_rmse:.4f}")

### #7 Model performance comparison

In [None]:
# plotting the models r2 scores
models = ['Random Forest', 'XGBoost']
r2_scores = [rf_r2, xgb_r2]

# zoom in near the top of the R2 range to make small differences visible, but
# never start the axis above the lowest score: a fixed lower limit of 0.8 would
# clip any bar whose R2 is below 0.8 (R2 can even be negative) out of view
y_lower = min(0.8, min(r2_scores) - 0.05)

plt.figure(figsize=(8, 5))
plt.bar(models, r2_scores, color=['skyblue', 'orange'])
plt.ylim(y_lower, 1.0)
plt.title("Model Performance Comparison (R2 Score)")
plt.ylabel("R2 Score (Higher is Better)")
plt.show()

### #8 Feature importance

In [None]:
# pairing each XGBoost importance score with its feature (column) name
feat_importances = pd.Series(xgb_model.feature_importances_, index=X.columns)

# horizontal bar chart of the ten highest-scoring features
top_features = feat_importances.nlargest(10)
fig, ax = plt.subplots(figsize=(10, 6))
top_features.plot(kind='barh', ax=ax)
ax.set_title("Feature Importance Plot (Top 10 Features)")
ax.set_xlabel("Relative Importance")
plt.show()

### #9 Saving the model

In [None]:
# saving xgboost model
model_path = Path("../model_assets/htc_pred_xgb_model.json")

# create the output folder first: on a fresh checkout it may not exist yet,
# and save_model would fail instead of writing the file
model_path.parent.mkdir(parents=True, exist_ok=True)

xgb_model.save_model(str(model_path))