In [None]:

# pipeline_knuckle_rf.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import joblib
import math

# ---------- 0. Config ----------
DATA_PATH = "C:\\Python\\knuckle-Joint-Dimensions-Predictor\\combined_fixed.csv"  # Change to your data file path
TEST_SIZE = 0.20
RANDOM_STATE = 42

# ---------- 1. Load data ----------
# Extensions that are handed to pandas' Excel reader; every other path is
# parsed as CSV.
_EXCEL_SUFFIXES = ('.xls', '.xlsx', '.xlsm', '.xlsb', '.ods')


def _load_table(path):
    """Read the dataset at *path* into a DataFrame.

    The reader is picked from the file extension instead of always trying
    Excel first: attempting ``read_excel`` on a CSV file prints a misleading
    "Error reading Excel file" message on every run and reports genuine
    problems (e.g. a missing file) as Excel errors.

    CSV files are decoded as UTF-8 first, falling back to latin1 when the
    bytes are not valid UTF-8. Any other read error propagates to the caller.
    """
    if path.lower().endswith(_EXCEL_SUFFIXES):
        return pd.read_excel(path)
    try:
        return pd.read_csv(path, encoding='utf-8')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='latin1')


df = _load_table(DATA_PATH)

# Fail early, with the offending names, if the file does not have the
# input and target columns the rest of the pipeline indexes by name.
expected_cols = ['load','fos(pin)','fos(eye)','d1','d2','d3','t','t1','t2']
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in dataset: {missing}. Rename or fix your file.")

print("Rows, Columns:", df.shape)
print(df.head())

# ---------- 2. Clean / Impute ----------
# Report how many nulls each column has before touching the data.
null_counts = df.isnull().sum()
print("Missing values per column:\n", null_counts)

# Only the columns the pipeline uses are imputed; each gap is replaced by
# that column's median.
needs_imputation = df[expected_cols].isnull().values.any()
if needs_imputation:
    imp = SimpleImputer(strategy='median')
    filled = imp.fit_transform(df[expected_cols])
    df[expected_cols] = filled
    print("Imputed missing values with median.")

# Remove exact duplicate rows, then renumber the index from zero.
df = df.drop_duplicates()
df = df.reset_index(drop=True)

# ---------- 3. Prepare X,y ----------
# Three input columns are used to predict the six dimension columns.
cols_y = ['d1','d2','d3','t','t1','t2']
feature_frame = df[['load','fos(pin)','fos(eye)']]
target_frame = df[cols_y]
X = feature_frame.values
y = target_frame.values

# ---------- 4. Train/test split ----------
# A fixed random_state keeps the held-out rows the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# ---------- 5. Train Random Forest ----------
# Baseline multi-output forest with fixed settings (no tuning yet).
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# ---------- 6. Evaluate ----------
y_pred = rf.predict(X_test)

# 'raw_values' yields one MSE per target column; the square root turns each
# into an RMSE.
mse_per_output = mean_squared_error(y_test, y_pred, multioutput='raw_values')
rmse_per_output = np.sqrt(mse_per_output)

# R2 is averaged uniformly across the target columns.
r2_avg = r2_score(y_test, y_pred, multioutput='uniform_average')

rmse_by_column = {name: err for name, err in zip(cols_y, rmse_per_output)}
print("RMSE per output:", rmse_by_column)
print("Average R2:", r2_avg)

# ---------- 7. Hyperparameter tuning ----------
# Exhaustive search over 3 * 3 * 3 * 3 = 81 parameter combinations.
param_grid = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}
gsearch = GridSearchCV(
    # The forest itself is left at the default n_jobs: the search below already
    # fans the candidate fits out over every core, and requesting all cores
    # again inside each fit oversubscribes the CPU. With a fixed random_state
    # the fitted trees do not depend on n_jobs.
    estimator=RandomForestRegressor(random_state=RANDOM_STATE),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=4,
    n_jobs=-1,
    verbose=1
)
gsearch.fit(X_train, y_train)
print("Best params:", gsearch.best_params_)
# With the default refit=True this is the best configuration refitted on the
# whole training split.
best_rf = gsearch.best_estimator_

# Evaluate best model on test set
y_pred_best = best_rf.predict(X_test)
print("Test RMSE (best model) per output:",
      np.sqrt(mean_squared_error(y_test, y_pred_best, multioutput='raw_values')))
print("Test R2 (best) avg:", r2_score(y_test, y_pred_best, multioutput='uniform_average'))

# ---------- 8. Save the trained model ----------
# The tuned forest is written to the current working directory.
model_filename = "rf_knuckle_joint_model.pkl"
joblib.dump(best_rf, model_filename)
print(f"Saved model to {model_filename}")

# ---------- 9. Prediction function ----------
def predict_dimensions(load, fos_pin, fos_eye, model=best_rf):
    """Predict the joint dimensions for one set of inputs.

    The three inputs are packed into a single-row array, passed to
    ``model.predict``, and each predicted value is rounded up with
    ``math.ceil``.

    Returns a dict mapping each name in ``cols_y`` to a string of the form
    ``"<int> mm"``.
    """
    features = np.array([[load, fos_pin, fos_eye]])
    # predict() returns one row per sample; only one sample was supplied.
    predicted_row = model.predict(features)[0]

    dimensions = {}
    for name, value in zip(cols_y, predicted_row):
        dimensions[name] = f"{math.ceil(value)} mm"
    return dimensions

# ---------- 10. Interactive Multiple User Inputs ----------
print("\n--- Knuckle Joint Dimension Predictor ---")
print("Type 'q' at any prompt to exit.\n")

# Asked in this order, which matches predict_dimensions' positional arguments.
_PROMPTS = ("Enter Load (N): ", "Enter fos(pin): ", "Enter fos(eye): ")


def _read_raw_inputs(prompts):
    """Ask each prompt in turn and return the stripped answers as a list.

    Returns ``None`` as soon as the user types 'q' (any case, surrounding
    whitespace ignored) or the input stream is exhausted. End-of-input has to
    be treated as a quit: otherwise ``input()`` raises ``EOFError`` on every
    iteration and the caller's loop never terminates when stdin is closed
    or piped.
    """
    answers = []
    for prompt in prompts:
        try:
            answer = input(prompt).strip()
        except EOFError:
            return None
        if answer.lower() == 'q':
            return None
        answers.append(answer)
    return answers


while True:
    raw_inputs = _read_raw_inputs(_PROMPTS)
    if raw_inputs is None:
        print("Exiting.")
        break

    try:
        # Conversion happens only after all three answers are collected, so a
        # bad value is reported once and the user starts a fresh round.
        load, fos_pin, fos_eye = (float(raw) for raw in raw_inputs)
        prediction = predict_dimensions(load, fos_pin, fos_eye)
    except Exception as e:
        # Deliberately broad: a bad number or a value the model rejects should
        # be reported and the session kept alive, not crash it.
        print("Error in input:", e)
        continue

    print("\nPredicted Dimensions (Ceil Rounded):")
    for k, v in prediction.items():
        print(f"{k} = {v}")
    print("-" * 40)




Error reading Excel file: Excel file format cannot be determined, you must specify an engine manually.
Rows, Columns: (600, 9)
   load  fos(pin)  fos(eye)    d1    d2    d3     t    t1    t2
0    10       2.0       3.0  0.06  0.12  0.09  0.08  0.05  0.03
1    20       2.5       3.5  0.09  0.19  0.13  0.10  0.07  0.04
2    30       2.0       3.0  0.11  0.22  0.16  0.13  0.09  0.05
3    40       2.0       3.0  0.13  0.26  0.18  0.15  0.10  0.06
4    50       2.0       3.5  0.14  0.28  0.20  0.17  0.12  0.07
Missing values per column:
 load        0
fos(pin)    0
fos(eye)    0
d1          0
d2          0
d3          0
t           0
t1          0
t2          0
dtype: int64
RMSE per output: {'d1': np.float64(0.8816122164345654), 'd2': np.float64(1.7566972175494089), 'd3': np.float64(1.441013157570626), 't': np.float64(1.4305178363460438), 't1': np.float64(0.8495369518768251), 't2': np.float64(0.6176960796985844)}
Average R2: 0.9973115566166472
Fitting 4 folds for each of 81 candidates, tota