In [None]:
!pip install datasets
!pip install onnx skl2onnx

Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting skl2onnx
  Downloading skl2onnx-1.18.0-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting onnxconverter-common>=1.7.0 (from skl2onnx)
  Downloading onnxconverter_common-1.14.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting protobuf>=3.20.2 (from onnx)
  Downloading protobuf-3.20.2-py2.py3-none-any.whl.metadata (720 bytes)
Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
   16.0/16.0 MB 56.2 MB/s eta 0:00:00
Downloading skl2onnx-1.18.0-py2.py3-none-any.whl (300 kB)
   300.3/300.3 kB 20.7 MB/s eta 0:00:00
Downloading onnxconverter_common-1.14.0-py2.py3-none-any.whl (84 kB)
   84.5/84.5 k

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
import joblib
import onnx
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [None]:
# Read the uploaded insurance CSV into a DataFrame.
# The file carries an unnamed leading column (it shows up as "Unnamed: 0" in
# the preview); it is loaded as-is and simply not selected later on.
csv_path = '/content/medical_insurance.csv'
df = pd.read_csv(csv_path)

In [None]:
# Quick sanity check: render the top of the frame.
first_rows = df.head()
display(first_rows)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Preprocessing: discard incomplete rows, then declare which columns are
# categorical vs. numerical and pull out the feature frame and the target.
df = df.dropna()

categorical_cols = ['sex', 'smoker', 'region']
numerical_cols = ['age', 'bmi', 'children']

# Feature frame: categorical columns first, numerical columns after.
X = df.loc[:, [*categorical_cols, *numerical_cols]]
# Regression target.
y = df.loc[:, 'charges']

In [None]:
# One-hot encode the categorical columns (dense output), then append the raw
# numerical columns to the right so the final matrix is
# [one-hot columns | numerical columns].
encoder = OneHotEncoder(sparse_output=False)
category_block = encoder.fit_transform(df[categorical_cols])
numeric_block = df[numerical_cols].to_numpy()
X_encoded = np.concatenate([category_block, numeric_block], axis=1)


In [None]:
# Standardize every column of the combined feature matrix.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split that follows, so held-out rows contribute to the scaling statistics —
# confirm this is intended.
scaler = StandardScaler().fit(X_encoded)
X_scaled = scaler.transform(X_encoded)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# the same across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)


In [None]:
# Fit a 100-estimator random forest regressor on the training split, seeded
# for repeatable results.
forest_options = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**forest_options)
model.fit(X_train, y_train)


In [None]:
# Export the trained forest to ONNX.
# The graph takes one float tensor named "float_input" with a free batch
# dimension and as many columns as the training matrix. Only `model` is
# converted here; the encoder and scaler are not part of the exported graph.
n_features = X_train.shape[1]
input_spec = FloatTensorType([None, n_features])
initial_type = [("float_input", input_spec)]
onnx_model = convert_sklearn(model, initial_types=initial_type)

# Write the serialized model to disk.
with open("insurance_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())


In [None]:
# Persist the fitted preprocessing objects with joblib so they can be
# reloaded later.
encoder_path, scaler_path = "encoder.pkl", "scaler.pkl"
joblib.dump(encoder, encoder_path)
joblib.dump(scaler, scaler_path)


['scaler.pkl']