**1. Load the Dataset:**

In [7]:
import pandas as pd

file_path='insurance.csv'
database = pd.read_csv(file_path)

**2. Prepare Train-Validation-Test Split:**

In [41]:
from sklearn.model_selection import train_test_split
X = database.drop('charges', axis=1)
Y = database['charges']

X_val, X_test, y_val, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_val, y_val, test_size=0.25, random_state=42)

**3. Preprocessing the Data**

---
  After Preprocessing the data will be ordered in this order

  ['age', 'bmi', 'children', 'sex_female', 'sex_male', 'smoker_no',
  'smoker_yes', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']

---






In [94]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

#StandarScaler() --> Takes the numerical values and standardize them all to have a value of Mean 0 and Standared Deviation 1
numerical_transformer = StandardScaler()
#OneHotEncode()  --> Takes the Categrocal values(string) and turn them to binary number based on how many option for the column
# Example. Sex column --> have only two options 1-Male 2 Female --> then it will turn it to (10) or (01)
categorical_transformer = OneHotEncoder()

#ColumnTransofrmer is benificial when you have both numerical and categorical values . It compines them
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)

target_preprocessor = StandardScaler()
y_train_preprocessed = target_preprocessor.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_val_preprocessed = target_preprocessor.transform(y_val.values.reshape(-1, 1)).flatten()
y_test_preprocessed = target_preprocessor.transform(y_test.values.reshape(-1, 1)).flatten()


**4. Model Comparision**

In [95]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

models = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor())
]
for name , model in models:
  model.fit(X_train_preprocessed, y_train_preprocessed)
  y_pred_val = model.predict(X_test_preprocessed)
  mse = mean_squared_error(y_test_preprocessed, y_pred_val)
  print ("-------",name,"---------")
  print("Mean Squared Error on Test Set: " ,mse)
  correct_indices = (y_pred_val == y_test_preprocessed)

  print("Correctly Classified Examples: " , (correct_indices.sum()))
  print("Wrongly Classified Examples:: " , len(correct_indices) - (correct_indices.sum()))



------- Linear Regression ---------
Mean Squared Error on Test Set:  0.24841936874983594
Correctly Classified Examples:  0
Wrongly Classified Examples::  268
------- Decision Tree ---------
Mean Squared Error on Test Set:  0.3173765664165159
Correctly Classified Examples:  1
Wrongly Classified Examples::  267
------- Random Forest ---------
Mean Squared Error on Test Set:  0.1594224602802288
Correctly Classified Examples:  0
Wrongly Classified Examples::  268
