**Step 1: Data Collection**

In [4]:
#Import necessary libraries
import numpy as np
import pandas as pd

In [5]:
#Load the dataset
data=pd.read_csv("/content/insurance.csv")

In [6]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


**Step 2: Data Cleaning**

In [7]:
#convert categorical variables into numerical format - One Hot Encoding
data['sex']=data['sex'].apply({'female':1, 'male':0}.get)

In [8]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,1,27.9,0,yes,southwest,16884.92
1,18,0,33.8,1,no,southeast,1725.55
2,28,0,33.0,3,no,southeast,4449.46
3,33,0,22.7,0,no,northwest,21984.47
4,32,0,28.9,0,no,northwest,3866.86


In [9]:
data['smoker']=data['smoker'].apply({'yes':1, 'no':0}.get)
data['region']=data['region'].apply({'southeast':1,'southwest':2,'northeast':3,'northwest':4}.get)

In [10]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,1,27.9,0,1,2,16884.92
1,18,0,33.8,1,0,1,1725.55
2,28,0,33.0,3,0,1,4449.46
3,33,0,22.7,0,0,4,21984.47
4,32,0,28.9,0,0,4,3866.86


**Step 3 : Dividing the data into dependent and independent variables**

In [11]:
x=data[['age','sex','bmi','children','smoker','region']] #independent variables
y=data['expenses'] #dependent variable

**Step 4 : Splitting the data into training and testing set**

In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x,y,train_size=0.8)#80% data used for training(learning)

**Creating Simple Machine Learning Model using Linear Regression algorithm**

In [13]:
from sklearn.linear_model import LinearRegression
#Initialize the Linear Regression model
regression=LinearRegression()

In [14]:
#Train the model
regression.fit(x_train,y_train)

In [15]:
#Make predictions on test data
y_pred=regression.predict(x_test)

In [16]:
#Display predictions
print(y_pred)

[ 7732.99990166 36302.65106183 11180.50012253 12496.93125547
 11430.33177378  3960.78880405 35288.86773107  5261.12247742
  9861.10810508 35251.98376403 38545.9372252  15976.62924329
 13279.98457346  2002.92477974 11317.89516449 11633.76018217
 17577.61280429  4864.05832841 12350.46798241  5008.01065854
 13900.45486127 11760.22118273  5054.23850274  4561.95716998
  8129.81728239 10195.73588499  2941.16753518  9719.60363108
  2137.80513671  6857.13725974 12667.36594729  2646.71082246
  1853.8679259  34801.89973922  9155.84862113 15489.35194793
  7161.58782748 12085.34230532 15333.21105588 12278.37604087
  4625.07558519 10147.23752728  7181.14653934 11744.92022437
  9849.1669051  39623.81974751 38252.85233592  7778.96152784
 25807.86728364 10158.58250946  8511.62844582 11843.60185089
  4278.20353935 13117.34209724  3919.96068064  2171.15783212
  9507.15895355  1996.96201255  2314.60715557 13466.34576317
 15281.68553637 29511.90591063 16017.05335841 25624.91572693
 31561.1144753   7660.45

In [17]:
# Evaluating the model's accuracy using R^2 score
regression.score(x,y)

0.7502662599878964

**Adding new customer data to predict insurance eligibility**

In [18]:
new_data={'age':65,'sex':0,'bmi':25,'children':2,'smoker':1,'region':3}
index=[1] #serial number

In [19]:
# Convert the new data into a DataFrame
my_data=pd.DataFrame(new_data,index)

In [20]:
my_data

Unnamed: 0,age,sex,bmi,children,smoker,region
1,65,0,25,2,1,3


In [21]:
# Predict the insurance expenses for the new customer
new_predictions=regression.predict(my_data)
print("The estimated insurance cost for the new customer is:",new_predictions)

The estimated insurance cost for the new customer is: [37677.46542949]
