In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the medical-premium dataset on the mounted Google Drive;
# edit this path if the file is stored somewhere else.
csv_file_path = '/content/drive/MyDrive/Medicalpremium.csv'

# Load the dataset into a DataFrame.
data = pd.read_csv(csv_file_path)

# Preview the first rows (rendered as this cell's output).
data.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,45,0,0,0,0,155,57,0,0,0,25000
1,60,1,0,0,0,180,73,0,0,0,29000
2,36,1,1,0,0,158,59,0,0,1,23000
3,52,1,1,0,1,183,93,0,0,2,28000
4,38,0,0,0,1,166,88,0,0,1,23000


In [None]:
# Separate the regression target from the feature columns.
y = data['PremiumPrice']
X = data.drop(columns='PremiumPrice')

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=69
)

# Learn the scaling statistics from the training rows only, then apply that same
# transform to the training split, the test split and the full feature matrix.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_scaled = scaler.transform(X)

In [None]:
# Hyperparameters for the gradient-boosted regressor, collected in one place.
xgb_params = dict(
    objective='reg:squarederror',
    tree_method='hist',  # histogram-based tree builder; runs on CPU by default, no GPU needed
    max_depth=7,
    gamma=0,
    learning_rate=0.05,
    n_estimators=100,
    subsample=0.6,       # each tree is grown on a 60% row subsample
    random_state=69,
)
model = xgb.XGBRegressor(**xgb_params)

# Train on the scaled training split.
model.fit(X_train_scaled, y_train)

In [None]:
# Predict premium prices for the scaled test split and print the raw predictions.
y_pred = model.predict(X_test_scaled)
print(y_pred)

[23028.96  31596.842 23272.256 16639.611 27858.807 23428.87  23193.293
 23398.047 28022.242 23830.715 23381.383 22151.842 14946.392 22661.53
 22583.543 27768.984 27439.562 27810.598 28068.182 15108.855 28824.85
 22986.879 33136.14  28864.992 15177.461 31719.764 28945.441 16047.364
 15442.913 23002.855 17938.723 28501.068 23287.336 27749.184 27928.87
 22727.715 20996.6   14919.991 31477.55  27163.787 21418.441 35809.902
 15359.184 16382.786 27654.91  16156.015 15775.154 35867.703 23059.176
 30643.78  18286.77  22869.037 37596.176 30651.756 14973.706 28966.
 22828.514 23739.83  25913.541 22972.816 23184.285 23522.55  32632.531
 30121.564 27808.9   28766.594 33055.344 31784.346 22916.316 28054.727
 24560.178 28415.34  19287.85  15340.928 27823.021 23501.074 23335.154
 25762.37  28071.87  23055.44  14957.085 28760.912 14870.683 27858.89
 23578.926 27666.445 23356.643 16588.56  22952.52  21305.273 28622.936
 36972.297 29165.326 25138.588 22894.623 32640.566 28141.701 28118.8
 15106.832 2540

In [None]:
import math
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score the single XGBoost model on the held-out split.
# Output: MAE on the first line, RMSE (square root of the MSE) on the second.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(mae, math.sqrt(mse), sep='\n')

1288.58824645749
2875.5670114054833


In [None]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble built from copies of the XGBoost regressor defined above.
#
# The wrapped estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, so the positional form works on both old and new
# releases, whereas `base_estimator=` raises TypeError on current ones.
bagging_model = BaggingRegressor(
    model,
    n_estimators=10,    # number of bagged copies, each fit on its own bootstrap sample
    bootstrap=True,     # draw the training rows with replacement
    n_jobs=-1,          # use all available cores
    random_state=69,    # reproducible bootstrap draws
)

In [None]:
# Fit the bagging ensemble on the scaled training split.
bagging_model.fit(X_train_scaled, y_train)



In [None]:
# Predict on the scaled test split with the fitted bagging ensemble.
y_pred2 = bagging_model.predict(X_test_scaled)

In [None]:
# Score the bagging ensemble on the held-out split.
# Output: MAE on the first line, RMSE (square root of the MSE) on the second.
mae = mean_absolute_error(y_test, y_pred2)
mse = mean_squared_error(y_test, y_pred2)
print(mae, math.sqrt(mse), sep='\n')

1382.0306727606276
2870.824517779309


In [None]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import cross_val_score, KFold

# 10-fold cross-validation of the bagging ensemble over the full scaled
# feature matrix. Shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=69)
cv_mae = []
cv_mse = []

for train_index, test_index in kf.split(X_scaled):
    # Fold-local names: reusing X_train / X_test / y_train / y_test / y_pred here
    # would silently overwrite the hold-out split and predictions that the
    # earlier cells depend on.
    X_fold_train, X_fold_test = X_scaled[train_index], X_scaled[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # Fit a fresh, unfitted copy (same hyperparameters and seed) so the
    # notebook-level `bagging_model` stays the one trained on the hold-out
    # training split instead of ending up fitted on the last fold.
    fold_model = clone(bagging_model)
    fold_model.fit(X_fold_train, y_fold_train)

    # Predictions on this fold's validation rows
    y_fold_pred = fold_model.predict(X_fold_test)

    # Compute metrics for this fold
    fold_mae = mean_absolute_error(y_fold_test, y_fold_pred)
    fold_mse = mean_squared_error(y_fold_test, y_fold_pred)

    cv_mae.append(fold_mae)
    cv_mse.append(fold_mse)

# Average the per-fold metrics; the reported RMSE is the square root of the
# mean fold MSE.
mean_cv_mae = np.mean(cv_mae)
mean_cv_mse = np.mean(cv_mse)

print('Mean Cross-validated MAE:', mean_cv_mae)
print('Mean Cross-validated RMSE:', np.sqrt(mean_cv_mse))



Mean Cross-validated MAE: 1272.4791049042722
Mean Cross-validated RMSE: 2704.132479594752
