# Insurance Claims Charges Prediction

## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all warnings so the notebook output stays uncluttered.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

## Importing dataset

In [None]:
# Load the dataset from 'insurance.csv' (path relative to the working directory).
df = pd.read_csv('insurance.csv')
# Preview 10 randomly sampled rows (no seed is set, so the rows differ between runs).
df.sample(10)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Print a per-column summary of the frame (dtypes and non-null counts).
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) using describe()'s defaults.
df.describe()

In [None]:
# Frequency of each distinct value in the 'smoker' column.
df['smoker'].value_counts()

## EDA

In [None]:
# Draw one bar chart of 'charges' against every column except the last one,
# each in its own 12x8 figure.
for feature_name in df.columns[:-1]:
    plt.figure(figsize=(12, 8))
    sns.barplot(data=df, x=feature_name, y='charges')

In [None]:
# Histogram grid for the frame's columns on a 10x10 figure.
df.hist(figsize=(10,10))

In [None]:
# Correlate only the numeric columns. The categorical columns are not encoded
# until later cells, and pandas >= 2.0 raises when DataFrame.corr() meets
# non-numeric data instead of silently skipping it.
corr = df.select_dtypes(include='number').corr()
features = corr.index
plt.figure(figsize=(10,10))
# Plot the matrix computed above rather than recomputing the same correlation.
ax = sns.heatmap(corr, annot=True, cmap='RdYlGn')

## Encoding categorical features as numeric values

In [None]:
# Encode 'sex' numerically: 'female' -> 0, 'male' -> 1 (other values are left unchanged by replace).
df['sex'] = df['sex'].replace({'female' : 0, 'male' : 1})

In [None]:
# Encode 'smoker' numerically: 'no' -> 0, 'yes' -> 1 (other values are left unchanged by replace).
df['smoker'] = df['smoker'].replace({'no' : 0, 'yes' : 1})

In [None]:
# Remove the 'region' column; it is dropped rather than encoded, so it is not a model feature.
df=df.drop('region', axis=1)

In [None]:
# Display the current state of the frame.
df

In [None]:
# Frequency of each distinct value in the 'sex' column.
df['sex'].value_counts()

## Train Test Split

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='charges')
# Target vector: the 'charges' column.
y = df['charges']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffled
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Report the dimensions of each split.
for label, part in (
    ("X_train shape:", X_train),
    ("y_train shape: ", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training split only (fit_transform on X_train).
scaler = StandardScaler()
# Wrap the scaled values in a DataFrame that reuses the training column names.
# NOTE(review): no index= is passed, so the result presumably gets a fresh default
# index rather than X_train's — confirm nothing aligns it with y_train by label.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
X_train_scaled

In [None]:
# Scale the test split with the already-fitted scaler (transform only, no refit).
# NOTE(review): no index= is passed, so the result presumably does not keep X_test's index.
X_test_scaled = pd.DataFrame(scaler.transform(X_test),columns=X_test.columns)
X_test_scaled

## Model Training and Evaluation

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
from math import sqrt

In [None]:
# Fit a linear regression on the scaled training features.
model1 = LinearRegression()
model1.fit(X_train_scaled, y_train)
# For regressors, .score() returns the R^2 coefficient, not a classification accuracy.
print("Training accuracy: ", model1.score(X_train_scaled, y_train))

# Predict on the held-out test split and report error metrics.
y_pred = model1.predict(X_test_scaled)

MSE1 = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", MSE1)

RMSE1 = sqrt(MSE1)
print("Root Mean Squared Error: ", RMSE1)

R2_1 = r2_score(y_test, y_pred)
print("R2: ", R2_1)

### Support Vector Regression

In [None]:
from sklearn.svm import SVR

# Fit a support vector regressor with default hyper-parameters on the scaled
# training features (fit() returns the estimator itself).
model2 = SVR().fit(X_train_scaled, y_train)
# For regressors, .score() returns the R^2 coefficient.
print("Training accuracy: ",model2.score(X_train_scaled, y_train))

# Evaluate on the held-out test split.
y_predict = model2.predict(X_test_scaled)
MSE2 = mean_squared_error(y_test, y_predict)
RMSE2 = sqrt(MSE2)
R2_2 = r2_score(y_test, y_predict)

print("Mean Squared Error: ", MSE2)
print("Root Mean Squared Error: ", RMSE2)
print('R2 :', R2_2)

### Decision Tree

In [None]:
# DecisionTreeRegressor is provided by sklearn.tree (not sklearn.ensemble).
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree regressor on the scaled training features.
model3 = DecisionTreeRegressor()
model3.fit(X_train_scaled, y_train)
# For regressors, .score() returns the R^2 coefficient, not a classification accuracy.
print("Training accuracy: ", model3.score(X_train_scaled, y_train))

# Predict on the held-out test split and report error metrics.
y_predic = model3.predict(X_test_scaled)

MSE3 = mean_squared_error(y_test, y_predic)
print("Mean Squared Error: ", MSE3)

RMSE3 = sqrt(MSE3)
print("Root Mean Squared Error: ", RMSE3)

R2_3 = r2_score(y_test, y_predic)
print("R2 :", R2_3)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest regressor on the scaled training features.
model4 = RandomForestRegressor()
model4.fit(X_train_scaled, y_train)
# For regressors, .score() returns the R^2 coefficient, not a classification accuracy.
print("Training accuracy: ", model4.score(X_train_scaled, y_train))

# Predict on the held-out test split and report error metrics.
y_prediction = model4.predict(X_test_scaled)

MSE4 = mean_squared_error(y_test, y_prediction)
print("Mean Squared Error: ", MSE4)

# RMSE must be derived from this model's own MSE.
RMSE4 = sqrt(MSE4)
print("Root Mean Squared Error: ", RMSE4)

R2_4 = r2_score(y_test, y_prediction)
print("R2 :", R2_4)