In [79]:
import pandas as pd
import numpy as np

**Importing dataset**

In [80]:
df=pd.read_csv("/content/insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [81]:
df.shape

(1338, 7)

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Categorical features:
- sex
- smoker
- region

In [83]:
df.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


There are no null values in the data

In [84]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


converting the categorical data into integers

In [88]:
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552
5,31,0,25.74,0,0,1,3756.6216
6,46,0,33.44,1,0,1,8240.5896
7,37,0,27.74,3,0,2,7281.5056
8,37,1,29.83,2,0,3,6406.4107
9,60,0,25.84,0,0,2,28923.13692


Separating Features and target

In [89]:
X=df.drop(columns='charges',axis=1)
Y=df['charges']
print(X)

      age  sex     bmi  children  smoker  region
0      19    0  27.900         0       1       0
1      18    1  33.770         1       0       1
2      28    1  33.000         3       0       1
3      33    1  22.705         0       0       2
4      32    1  28.880         0       0       2
...   ...  ...     ...       ...     ...     ...
1333   50    1  30.970         3       0       2
1334   18    0  31.920         0       0       3
1335   18    0  36.850         0       0       1
1336   21    0  25.800         0       0       0
1337   61    0  29.070         0       1       2

[1338 rows x 6 columns]


In [90]:
df.to_csv("processed_insurance_dataset.csv", index=False)

FEATURE SELECTION

In [105]:
bmi = df['bmi'].values
y = df['charges'].values


In [106]:
X = np.c_[bmi, bmi**2]


In [107]:
indices = np.arange(len(X))
np.random.shuffle(indices)

X = X[indices]
y = y[indices]


In [108]:
split = int(0.8 * len(X))

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]


In [109]:
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)

X_train = (X_train - X_mean) / X_std
X_test = (X_test - X_mean) / X_std


In [110]:
X_train = np.c_[np.ones(X_train.shape[0]), X_train]
X_test = np.c_[np.ones(X_test.shape[0]), X_test]


In [111]:
y_mean = y_train.mean()
y_std = y_train.std()

y_train_scaled = (y_train - y_mean) / y_std


In [112]:
weights = np.zeros(X_train.shape[1])
learning_rate = 0.01
epochs = 6000
n = len(y_train_scaled)


In [113]:
for i in range(epochs):
    y_pred = X_train.dot(weights)
    error = y_pred - y_train_scaled

    gradients = (2 / n) * X_train.T.dot(error)
    weights -= learning_rate * gradients

    if i % 1000 == 0:
        loss = np.mean(error ** 2)
        print(f"Epoch {i}, Training MSE: {loss:.4f}")


Epoch 0, Training MSE: 1.0000
Epoch 1000, Training MSE: 0.9572
Epoch 2000, Training MSE: 0.9570
Epoch 3000, Training MSE: 0.9569
Epoch 4000, Training MSE: 0.9569
Epoch 5000, Training MSE: 0.9568


In [114]:
y_test_pred_scaled = X_test.dot(weights)
y_test_pred = y_test_pred_scaled * y_std + y_mean


In [115]:
mse = np.mean((y_test - y_test_pred) ** 2)
rmse = np.sqrt(mse)

ss_total = np.sum((y_test - np.mean(y_test)) ** 2)
ss_res = np.sum((y_test - y_test_pred) ** 2)
r2 = 1 - (ss_res / ss_total)

print("MSE:", mse)
print("RMSE:", rmse)
print("R2 Score:", r2)


MSE: 120107767.17436293
RMSE: 10959.36892226751
R2 Score: 0.014578497570458593


In [116]:
print("\nPolynomial Model Coefficients")
print("Bias:", weights[0])
print("BMI:", weights[1])
print("BMI^2:", weights[2])



Polynomial Model Coefficients
Bias: -7.7083095877208e-16
BMI: 0.22783381933166036
BMI^2: -0.020841118050378922


In [119]:
def predict_insurance_poly(bmi):
    # Create polynomial feature
    X_new = np.array([[bmi, bmi**2]])

    # Normalize using training stats
    X_new = (X_new - X_mean) / X_std

    # Add bias
    X_new = np.c_[np.ones(X_new.shape[0]), X_new]

    # Predict (scaled)
    y_scaled = X_new.dot(weights)

    # Convert back to original scale
    y_pred = y_scaled * y_std + y_mean

    return y_pred[0]


In [120]:
print("BMI 22 → Predicted Charges:", predict_insurance_poly(22))
print("BMI 27 → Predicted Charges:", predict_insurance_poly(27))
print("BMI 32 → Predicted Charges:", predict_insurance_poly(32))


BMI 22 → Predicted Charges: 9689.852129879611
BMI 27 → Predicted Charges: 11854.573464570096
BMI 32 → Predicted Charges: 13985.716057907337


In [121]:
bmi_values = [20, 25, 30, 35]

for b in bmi_values:
    print(f"BMI {b} → Charges {predict_insurance_poly(b):.2f}")


BMI 20 → Charges 8814.56
BMI 25 → Charges 10992.71
BMI 30 → Charges 13137.29
BMI 35 → Charges 15248.28
