In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from minires import minires

In [2]:
DATA_PATH = Path("3d_print_miniatures_data.csv")
ERROR_MARGINS = [1, 2, 3, 5]

In [3]:
def percent_within_margin(y_true, y_pred, margin):
    err = np.abs(y_true - y_pred)
    return (err <= margin).mean() * 100.0

def summarize_results(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n=== {name} ===")
    print(f"MAE:  {mae:.4f} g")
    print(f"RMSE: {rmse:.4f} g")
    print(f"R²:   {r2:.6f}")

    for m in ERROR_MARGINS:
        pct = percent_within_margin(y_true, y_pred, m)
        print(f"Within ±{m} g: {pct:.3f}%")

    err = y_pred - y_true
    print(f"Mean error (bias): {err.mean():.4f} g")
    print(f"Median abs error:  {np.median(np.abs(err)):.4f} g")

In [4]:
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)

In [5]:
def is_nan_columns(df):
    nan_rows = None
    if df.isnull().values.any():
        print("Dataset contains NaN values.")
    
        nan_counts = df.isnull().sum()
        print("NaN counts in each column:")
        print(nan_counts[nan_counts > 0])

        print()
        
        nan_rows = df[df.isnull().any(axis=1)]

    return nan_rows

df_nans = is_nan_columns(df)

Dataset contains NaN values.
NaN counts in each column:
weight        1
base_mm    1286
dtype: int64



In [6]:
df.dropna(subset=['weight'], inplace=True)
df["base_mm"] = df["base_mm"].fillna(0)

In [7]:
def remove_outliers(df, features, q1=0.25, q3=0.85, factor=1.5):
    if isinstance(features, str):
        features = [features]

    cols = df[features]

    Q1 = cols.quantile(q1)
    Q3 = cols.quantile(q3)
    IQR = Q3 - Q1

    upper_bounds = Q3 + factor * IQR

    outlier_flags = cols > upper_bounds
    row_outlier = outlier_flags.any(axis=1)

    mask = ~row_outlier

    print("Upper bounds:")
    print(upper_bounds)
    print(f"→ keeping {mask.sum()} / {len(df)} rows")

    return df[mask]

df = remove_outliers(df, features=['volume'])

Upper bounds:
volume    72100.325
dtype: float64
→ keeping 12479 / 12887 rows


In [8]:
df

Unnamed: 0,artist,mini,kb,volume,surface_area,bbox_x,bbox_y,bbox_z,bbox_area,mass,...,scale,weight,base_mm,volume_g,mass_g,volume_mass_interaction,surface_volume_ratio,bbox_volume_ratio,surface_mass_ratio,bbox_mass_ratio
0,1,1566,57773,26671.5,62721.738592,51.132734,63.831800,63.733878,208020.653476,26671.484162,...,103.687330,26.895716,50.0,29.33865,29.338633,860.755873,2.351639,7.799361,2.351640,7.799366
1,0,2114,3795,279.4,1028.552066,12.985000,10.738600,12.474300,1739.425358,279.380305,...,20.965112,0.241786,50.0,0.30734,0.307318,0.094451,3.681289,6.225574,3.681548,6.226013
2,0,2115,20591,12494.2,27990.725314,60.528799,23.654200,47.155201,67514.944409,12494.153312,...,80.292401,11.668830,50.0,13.74362,13.743569,188.886385,2.240298,5.403703,2.240306,5.403723
3,0,10,30708,6960.5,22507.570367,47.551899,35.269100,38.413200,64423.265760,6960.442344,...,70.573837,6.165829,30.0,7.65655,7.656487,58.622272,3.233614,9.255551,3.233641,9.255628
4,1,299,26519,3515.4,8609.005976,28.039989,26.600918,26.896539,20061.844546,3515.383904,...,47.087935,3.588529,25.0,3.86694,3.866922,14.953156,2.448941,5.706845,2.448952,5.706872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12883,0,1889,2967,406.0,1552.369311,12.480867,14.504034,22.503483,4073.646173,405.918764,...,29.538886,0.346615,30.0,0.44660,0.446511,0.199412,3.823570,10.033611,3.824335,10.035619
12884,0,184,40962,27699.4,65140.971405,52.433762,65.200005,78.152955,267180.061131,27699.385660,...,114.491154,26.727066,15.0,30.46934,30.469324,928.380199,2.351711,9.645699,2.351712,9.645703
12885,1,1007,95898,55019.5,124400.223716,74.497997,65.570930,78.978145,385800.575224,55019.459563,...,126.834719,55.388030,50.0,60.52145,60.521406,3662.843218,2.261021,7.012070,2.261022,7.012075
12886,1,1547,103978,9062.4,26305.016447,42.100874,48.573549,41.214568,84283.333096,9062.387362,...,76.357802,8.991272,25.0,9.96864,9.968626,99.373645,2.902655,9.300332,2.902659,9.300345


In [14]:
from minires import minires

model = minires()

minires.keras:   0%|          | 0.00/9.74M [00:00<?, ?B/s]

minires.joblib:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

minires.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

In [15]:
X = df.copy()
y = df["weight"].to_numpy(dtype=float)
y_pred = model.predict(X)

In [16]:
mask = ~np.isnan(y_pred)
if mask.sum() < len(y_pred):
    print(f"Dropped {len(y_pred) - mask.sum()} rows with NaN predictions.")
    y_pred = y_pred[mask]
    y = y[mask]

summarize_results("minires", y, y_pred)


=== minires ===
MAE:  0.2914 g
RMSE: 1.1020 g
R²:   0.995539
Within ±1 g: 94.751%
Within ±2 g: 98.373%
Within ±3 g: 99.030%
Within ±5 g: 99.591%
Mean error (bias): 0.0064 g
Median abs error:  0.1016 g


In [17]:
preview = df.loc[mask].copy()
preview["pred_weight"] = y_pred
preview["error"] = preview["pred_weight"] - preview["weight"]
preview["abs_error"] = preview["error"].abs()

print("\nSample rows:")
print(preview[["weight", "pred_weight", "error", "abs_error"]].head(10))


Sample rows:
      weight  pred_weight     error  abs_error
0  26.895716    26.447308 -0.448408   0.448408
1   0.241786     0.230499 -0.011287   0.011287
2  11.668830    11.774111  0.105281   0.105281
3   6.165829     5.981131 -0.184698   0.184698
4   3.588529     3.508126 -0.080403   0.080403
5  44.646130    44.562271 -0.083859   0.083859
6  39.587055    39.663486  0.076431   0.076431
7   5.751509     5.382207 -0.369302   0.369302
8   3.024541     2.850452 -0.174089   0.174089
9   6.977429     7.178446  0.201016   0.201016
