**1. Load the Dataset:**

In [15]:
import pandas as pd

file_path='insurance.csv'
database = pd.read_csv(file_path)

**2. Prepare Train-Validation-Test Split:**

In [16]:
from sklearn.model_selection import train_test_split
X = database.drop('charges', axis=1)
Y = database['charges']

X_val, X_test, y_val, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_val, y_val, test_size=0.25, random_state=42)

**3. Preprocessing the Data**

---
  After Preprocessing the data will be ordered in this order

  ['age', 'bmi', 'children', 'sex_female', 'sex_male', 'smoker_no',
  'smoker_yes', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']

---






In [17]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestRegressor




numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

#StandarScaler() --> Takes the numerical values and standardize them all to have a value of Mean 0 and Standared Deviation 1
numerical_transformer = StandardScaler()
#OneHotEncode()  --> Takes the Categrocal values(string) and turn them to binary number based on how many option for the column
# Example. Sex column --> have only two options 1-Male 2 Female --> then it will turn it to (10) or (01)
categorical_transformer = OneHotEncoder()

#ColumnTransofrmer is benificial when you have both numerical and categorical values . It compines them
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)

target_preprocessor = StandardScaler()
y_train_preprocessed = target_preprocessor.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_val_preprocessed = target_preprocessor.transform(y_val.values.reshape(-1, 1)).flatten()
y_test_preprocessed = target_preprocessor.transform(y_test.values.reshape(-1, 1)).flatten()


**4. Model Comparision**

In [18]:

models = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor())
]
for name , model in models:
  model.fit(X_train_preprocessed, y_train_preprocessed)
  y_pred_val = model.predict(X_test_preprocessed)
  mse = mean_squared_error(y_test_preprocessed, y_pred_val)
  print ("-------",name,"---------")
  print("Mean Squared Error on Test Set: " ,mse)
  correct_indices = (y_pred_val == y_test_preprocessed)

  print("Correctly Classified Examples: " , (correct_indices.sum()))
  print("Wrongly Classified Examples:: " , len(correct_indices) - (correct_indices.sum()))



------- Linear Regression ---------
Mean Squared Error on Test Set:  0.24841936874983575
Correctly Classified Examples:  0
Wrongly Classified Examples::  268
------- Decision Tree ---------
Mean Squared Error on Test Set:  0.33066351961573487
Correctly Classified Examples:  1
Wrongly Classified Examples::  267


------- Random Forest ---------
Mean Squared Error on Test Set:  0.14995277266033005
Correctly Classified Examples:  0
Wrongly Classified Examples::  268


**5. Different hyper-parameters.**

In [19]:


# Define the models with their respective hyperparameters for tuning
models = [
    ('Linear Regression', LinearRegression(), {}),
    ('Decision Tree', DecisionTreeRegressor(), {'max_depth': [None, 5, 10, 15]}),
    ('Random Forest', RandomForestRegressor(), {'n_estimators': [50, 100, 150]})
]

for name, model, params in models:
    grid = GridSearchCV(model, params, scoring='neg_mean_squared_error', cv=5)
    grid.fit(X_train_preprocessed, y_train_preprocessed)
    
    best_model = grid.best_estimator_
    best_params = grid.best_params_
    
    y_pred_val = best_model.predict(X_test_preprocessed)
    mse = mean_squared_error(y_test_preprocessed, y_pred_val)
    
    print("-------", name, "---------")
    print("Best Parameters:", best_params)
    print("Mean Squared Error on Test Set:", mse)
    correct_indices = (y_pred_val == y_test_preprocessed)
    print("Correctly Classified Examples:", correct_indices.sum())
    print("Wrongly Classified Examples:", len(correct_indices) - correct_indices.sum())


------- Linear Regression ---------
Best Parameters: {}
Mean Squared Error on Test Set: 0.24841936874983575
Correctly Classified Examples: 0
Wrongly Classified Examples: 268
------- Decision Tree ---------
Best Parameters: {'max_depth': 5}
Mean Squared Error on Test Set: 0.16230625441672308
Correctly Classified Examples: 0
Wrongly Classified Examples: 268
------- Random Forest ---------
Best Parameters: {'n_estimators': 150}
Mean Squared Error on Test Set: 0.1513500844285254
Correctly Classified Examples: 0
Wrongly Classified Examples: 268


**6. calculate precision_score, recall_score, f1_score.**

In [20]:
# Define a threshold for classification
threshold = 0.5  

for name, model, params in models:
    
    y_pred_val = best_model.predict(X_test_preprocessed)
    
    # Convert regression outputs into binary classes using threshold
    y_pred_classes = (y_pred_val > threshold).astype(int)
    y_test_classes = (y_test_preprocessed > threshold).astype(int)
    
    precision = precision_score(y_test_classes, y_pred_classes)
    recall = recall_score(y_test_classes, y_pred_classes)
    f1 = f1_score(y_test_classes, y_pred_classes)
    
    print("-------", name, "---------")
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)


------- Linear Regression ---------
Best Parameters: {'n_estimators': 150}
Precision: 0.8363636363636363
Recall: 0.7931034482758621
F1 Score: 0.8141592920353982
------- Decision Tree ---------
Best Parameters: {'n_estimators': 150}
Precision: 0.8363636363636363
Recall: 0.7931034482758621
F1 Score: 0.8141592920353982
------- Random Forest ---------
Best Parameters: {'n_estimators': 150}
Precision: 0.8363636363636363
Recall: 0.7931034482758621
F1 Score: 0.8141592920353982
