In [52]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
import joblib
import os

In [54]:
# Load the raw dataset from the Excel workbook (path is relative to the working directory)
file_path = r"CGL_Model2_Data.xlsx"
data = pd.read_excel(file_path)

# Strip whitespace from column names so later column lookups by name are exact
data.columns = data.columns.str.strip()

# Display the first few rows and data info
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
data.info()


     Width  Thickness  Speed  TPH  GSM-A    JCF  JCF1  JCF2   JCF3  JCF4  ...  \
0  1236.00       2.00     29   40  313.0  29.63    50    35  32.09    33  ...   
1    29.55       3.40     25   41  282.0  29.00    50    35  32.00    33  ...   
2  2851.00       2.50     29   40  282.0  29.00    50    35  32.00    33  ...   
3  2702.00       2.00     29   32  282.0  29.00    50    36  32.00    33  ...   
4  1004.00       0.98     65   30   85.0  23.00    50    28  26.00    27  ...   

   NOF3  NOF4  NOF5  RTF1  RTF2  RTF3  JCFEN_STRIP_C  JCFEX_STRIP_C  \
0  1200  1119  1152   769   777   685            768            443   
1  1200  1104  1137   768   773   684            471            510   
2  1200  1100  1133   768   790   717            668            620   
3  1200  1116  1149   771   770   700            698            618   
4  1200  1160  1190   714   710   644            808            605   

   Pot Temperature  Hardness  
0              467      79.0  
1              472      

In [56]:
# Count the missing 'Hardness' readings before imputing
missing_before = data['Hardness'].isna().sum()
print("Number of NaN values in Hardness column:", missing_before)

# Replace missing 'Hardness' readings with the column mean.
# NOTE(review): the mean is taken over every row, before the train/test split
# performed later in the notebook, and 'Hardness' is used as a model feature --
# so test rows influence the imputed training values. Consider imputing after
# the split using the training mean only.
hardness_mean = data['Hardness'].mean()
data['Hardness'] = data['Hardness'].fillna(hardness_mean)

# Confirm that no missing values remain
missing_after = data['Hardness'].isna().sum()
print("Number of NaN values in Hardness column after handling:", missing_after)

# Summary statistics of the imputed column
print(data['Hardness'].describe())


Number of NaN values in Hardness column: 47
Number of NaN values in Hardness column after handling: 0
count    754.000000
mean      70.074965
std       11.955708
min       55.000000
25%       60.000000
50%       65.000000
75%       78.000000
max       98.000000
Name: Hardness, dtype: float64


In [60]:
# Input columns fed to the models
features = ['Width', 'Thickness', 'GSM-A', 'TPH', 'Hardness']

# Output columns the models predict jointly (multi-output regression)
target = [
    'Speed',
    'NOF1', 'NOF2', 'NOF3', 'NOF4', 'NOF5',
    'RTF1', 'RTF2', 'RTF3',
    'JCF', 'JCF1', 'JCF2', 'JCF3', 'JCF4',
    'JCFEN_STRIP_C', 'JCFEX_STRIP_C',
    'Pot Temperature',
]

X, y = data[features], data[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: statistics are learned from the training rows only
# and then applied unchanged to the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [62]:
# Fit a Random Forest on all targets at once
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Fit XGBoost through MultiOutputRegressor, which wraps the single estimator
# so it can be trained against the multi-column target frame
xgb_model = MultiOutputRegressor(XGBRegressor(n_estimators=100, random_state=42))
xgb_model.fit(X_train_scaled, y_train)

# Predict on the held-out rows; column i of each prediction array lines up with target[i]
rf_pred = rf_model.predict(X_test_scaled)
xgb_pred = xgb_model.predict(X_test_scaled)

# Per-target R^2 for both models, keyed by target column name
rf_r2_scores = {
    col: r2_score(y_test.iloc[:, i], rf_pred[:, i])
    for i, col in enumerate(target)
}
xgb_r2_scores = {
    col: r2_score(y_test.iloc[:, i], xgb_pred[:, i])
    for i, col in enumerate(target)
}

# NOTE(review): in the recorded output JCF1 scores exactly 1.0000 for both
# models -- worth checking whether that column is (near-)constant, which would
# make its R^2 uninformative and inflate the averages below.
for col in target:
    print(f"R^2 score for {col}:")
    print(f"  Random Forest: {rf_r2_scores[col]:.4f}")
    print(f"  XGBoost: {xgb_r2_scores[col]:.4f}")
    print()

# Unweighted mean of the per-target R^2 scores
rf_avg_r2 = np.mean([rf_r2_scores[col] for col in target])
xgb_avg_r2 = np.mean([xgb_r2_scores[col] for col in target])

print("Average R^2 score:")
print(f"  Random Forest: {rf_avg_r2:.4f}")
print(f"  XGBoost: {xgb_avg_r2:.4f}")


R^2 score for Speed:
  Random Forest: 0.9752
  XGBoost: 0.9871

R^2 score for NOF1:
  Random Forest: 0.9280
  XGBoost: 0.8933

R^2 score for NOF2:
  Random Forest: 0.8952
  XGBoost: 0.8462

R^2 score for NOF3:
  Random Forest: 0.8149
  XGBoost: 0.7389

R^2 score for NOF4:
  Random Forest: 0.8677
  XGBoost: 0.8322

R^2 score for NOF5:
  Random Forest: 0.8345
  XGBoost: 0.8440

R^2 score for RTF1:
  Random Forest: 0.9111
  XGBoost: 0.9122

R^2 score for RTF2:
  Random Forest: 0.9049
  XGBoost: 0.8944

R^2 score for RTF3:
  Random Forest: 0.3042
  XGBoost: 0.3236

R^2 score for JCF:
  Random Forest: 0.7044
  XGBoost: 0.6550

R^2 score for JCF1:
  Random Forest: 1.0000
  XGBoost: 1.0000

R^2 score for JCF2:
  Random Forest: 0.8138
  XGBoost: 0.7708

R^2 score for JCF3:
  Random Forest: 0.7385
  XGBoost: 0.7082

R^2 score for JCF4:
  Random Forest: 0.7590
  XGBoost: 0.6652

R^2 score for JCFEN_STRIP_C:
  Random Forest: 0.6929
  XGBoost: 0.7161

R^2 score for JCFEX_STRIP_C:
  Random Forest: 

In [66]:
# Directory that receives the exported artifacts.
# The default is the original machine-specific location; set the CGL_MODEL_DIR
# environment variable to redirect the export when running on another machine,
# instead of editing this absolute path.
model_dir = os.environ.get(
    "CGL_MODEL_DIR",
    r"C:\Users\parim\OneDrive\Desktop\JSW_CGL_Project2",
)

# Create the directory (and any missing parents) if it doesn't exist
os.makedirs(model_dir, exist_ok=True)

# Function to save an artifact, skipping files that already exist unless told to overwrite
def save_model(model, filename, overwrite=False):
    """Serialize ``model`` with joblib to ``filename`` inside ``model_dir``.

    Parameters
    ----------
    model : object
        Any object joblib can serialize (estimator, scaler, list of names, ...).
    filename : str
        File name, joined onto the module-level ``model_dir``.
    overwrite : bool, default False
        When False (the original behaviour) an existing file is left untouched
        and a "Skipping" message is printed. Pass True to replace it, e.g.
        after retraining, so stale artifacts are not silently kept.

    Returns
    -------
    None
    """
    filepath = os.path.join(model_dir, filename)
    if os.path.exists(filepath) and not overwrite:
        print(f"{filename} already exists. Skipping.")
        return

    # Write to a sibling temp file and move it into place only once the dump
    # has finished. Otherwise an interrupted dump would leave a truncated file
    # that every later run skips as "already exists".
    tmp_path = filepath + ".tmp"
    try:
        joblib.dump(model, tmp_path)
        os.replace(tmp_path, filepath)
    finally:
        # After a successful replace the temp file is gone; this only cleans up
        # the leftover from a failed or interrupted dump.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Saved {filename}")

# Everything needed to reload and use the models later: both regressors, the
# fitted scaler, and the feature / target column name lists (in that order).
artifacts = [
    (rf_model, "rf_model.joblib"),
    (xgb_model, "xgb_model.joblib"),
    (scaler, "scaler.joblib"),
    (features, "features.joblib"),
    (target, "target.joblib"),
]

for artifact, artifact_filename in artifacts:
    save_model(artifact, artifact_filename)

print("Model export completed.")


Saved rf_model.joblib
Saved xgb_model.joblib
Saved scaler.joblib
Saved features.joblib
Saved target.joblib
Model export completed.
