## Importing Different Libraries in Python

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Data Collection

In [None]:
# Read the insurance CSV into a pandas DataFrame.
# NOTE: the location below is an absolute, machine-specific path; adjust it
# when running the notebook on another computer.
DATASET_PATH = 'C:/Users/negip/insurance.csv'
insurance_data = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first 5 rows of the dataset

insurance_data.head(n=5)

In [None]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple

insurance_data.shape

In [None]:
# Print a summary of the dataset: column names, non-null counts and dtypes

insurance_data.info()

##### Categorical features in the dataset
 - sex
 - smoker
 - region
 

In [None]:
# Count the missing values in every column of the dataset

insurance_data.isna().sum()

## Data Analysis

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# dataset's columns

insurance_data.describe()

In [None]:
# Distribution of age value

# Apply seaborn's default theme to this and all later plots.
# `set_theme()` is the preferred name; `sns.set()` is only an alias for it.
sns.set_theme()
plt.figure(figsize=(6, 6))
sns.histplot(insurance_data['age'])
plt.title('Age Distribution')
plt.show()

In [None]:
# Bar chart of the number of records for each value of the 'sex' column

plt.figure(figsize=(6, 6))
sex_axes = sns.countplot(data=insurance_data, x='sex')
sex_axes.set_title('Sex Distribution')
plt.show()

In [None]:
# Number of records for each value of the 'sex' column
insurance_data['sex'].value_counts()

In [None]:
# Histogram of the 'bmi' column

plt.figure(figsize=(6, 6))
bmi_axes = sns.histplot(insurance_data['bmi'])
bmi_axes.set_title('Bmi Distribution')
plt.show()

##### Normal range for BMI is 18.5 to 24.9

- If your BMI is less than 18.5, it falls within the underweight range
- If your BMI is 25.0 to 29.9, it falls within the overweight range
- If your BMI is 30.0 or higher, it falls within the obese range

In [None]:
# Bar chart of the number of records for each value of the 'children' column

plt.figure(figsize=(6, 6))
children_axes = sns.countplot(data=insurance_data, x='children')
children_axes.set_title('Children')
plt.show()

In [None]:
# Number of records for each value of the 'children' column
insurance_data['children'].value_counts()

In [None]:
# Bar chart of the number of records for each value of the 'smoker' column

plt.figure(figsize=(6, 6))
smoker_axes = sns.countplot(data=insurance_data, x='smoker')
smoker_axes.set_title('Smokers')
plt.show()

In [None]:
# Number of records for each value of the 'smoker' column
insurance_data['smoker'].value_counts()

In [None]:
# Bar chart of the number of records for each value of the 'region' column

plt.figure(figsize=(6, 6))
region_axes = sns.countplot(data=insurance_data, x='region')
region_axes.set_title('Regions')
plt.show()

In [None]:
# Number of records for each value of the 'region' column
insurance_data['region'].value_counts()

In [None]:
# Histogram of the 'charges' column (the prediction target)

plt.figure(figsize=(6, 6))
charges_axes = sns.histplot(insurance_data['charges'])
charges_axes.set_title('Charge Distribution')
plt.show()

## Data Pre-Processing

##### Encoding the Categorical features

In [None]:
# Encode the categorical columns as integer codes.
#
# Each column is rebuilt with an explicit mapping instead of
# `DataFrame.replace(..., inplace=True)`: replacing strings with integers in
# an object column relies on pandas' silent dtype downcasting, which is
# deprecated. Values that are not in a mapping are left unchanged, so running
# this cell a second time is harmless.
category_codes = {
    # Encoding sex column
    'sex': {'male': 0, 'female': 1},
    # Encoding smoker column
    'smoker': {'yes': 0, 'no': 1},
    # Encoding region column
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}

for column_name, codes in category_codes.items():
    insurance_data[column_name] = [
        codes.get(value, value) for value in insurance_data[column_name]
    ]

##### Splitting the features and target

In [None]:
# Separate the model inputs (X) from the value to predict (Y)
target_column = 'charges'
X = insurance_data.drop(columns=[target_column])
Y = insurance_data[target_column]

In [None]:
# Display the feature columns (the dataset without 'charges')
print(X)

In [None]:
# Display the target column ('charges')
print(Y)

##### Splitting the data into training data and testing data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs
split_parts = train_test_split(X, Y, test_size=0.2, random_state=2)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Shapes of the full feature set, the training portion (80%) and the
# testing portion (20%)

partition_shapes = (X.shape, X_train.shape, X_test.shape)
print(*partition_shapes)

##  Model Training

##### Linear Regression

In [None]:
# Create an (unfitted) linear regression model with default settings

regressor = LinearRegression()

In [None]:
# Fit the linear regression model on the training portion of the data
regressor.fit(X=X_train, y=Y_train)

##### Model Evaluation

In [None]:
# Predict charges for the rows the model was trained on

training_data_prediction = regressor.predict(X=X_train)

In [None]:
# R squared value on the training data

r2_train = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_prediction,
)
print("R Squared Value : ", r2_train)

In [None]:
# Predict charges for the held-out test rows

testing_data_prediction = regressor.predict(X=X_test)

In [None]:
# R squared value on the test data

r2_test = metrics.r2_score(
    y_true=Y_test,
    y_pred=testing_data_prediction,
)
print("R Squared Value : ", r2_test)

## Building a Predictive System

In [None]:
# Predict the insurance cost for one person described interactively.
#
# The numeric codes requested below must match the encoding applied to the
# training data: sex (male=0, female=1), smoker (yes=0, no=1),
# region (southeast=0, southwest=1, northeast=2, northwest=3).

# taking input from user

age = int(input("Enter your age : "))
sex = int(input("Enter 0 if male and 1 if female : "))
bmi = float(input("Enter your Body Mass Index : "))
children = int(input("Enter the number of childrens : "))
smoker = int(input("Enter 0 if you smoke and 1 if you do not smoke : "))
# Code 3 stands for northwest (the prompt previously said southwest twice)
region = int(input("Enter 0 if southeast , 1 if southwest , 2 if northeast, 3 if northwest : "))

# Values must be in the same order as the feature columns of X
input_data = (age, sex, bmi, children, smoker, region)

input_data_numpy_array = np.asarray(input_data)

# One row with the training column names: the model was fitted on a
# DataFrame, and predicting on an unnamed array triggers scikit-learn's
# "X does not have valid feature names" warning.
input_data_reshaped = pd.DataFrame(
    input_data_numpy_array.reshape(1, -1),
    columns=X.columns,
)

prediction = regressor.predict(input_data_reshaped)

# predict() returns an array with one element per input row; show the value
# itself rather than the array representation
print("The Cost of the insurance is", prediction[0])