<a href="https://colab.research.google.com/github/RafaelAnga/MachineLearning_Bootcamp/blob/main/Insurance_charges_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost Regressor

## Part 1 - Data Preprocessing

### Importing the dataset

In [281]:
# Used to connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [282]:
 # Library necesary to access the folder route
import os
os.chdir('/content/drive/MyDrive/Machine Learning/Regression Templates/DataSets')

#Lists the available directories
os.listdir()

['Data.csv',
 'Salary_Data.csv',
 '50_Startups.csv',
 'Position_Salaries.csv',
 'insurance.csv']

In [283]:
import pandas as pd
dataset = pd.read_csv('insurance.csv')

In [284]:
!pip show scikit-learn


Name: scikit-learn
Version: 1.3.1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: bigframes, fastai, imbalanced-learn, librosa, mlxtend, sentence-transformers, shap, sklearn-pandas, yellowbrick


In [285]:
!pip install scikit-learn==1.3.1



In [286]:
dataset.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Checking missing data

In [287]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [288]:
dataset.isnull().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


### Handling categorical variables

Sex column

In [289]:
dataset['sex'].unique()

array(['female', 'male'], dtype=object)

In [290]:
dataset['sex'] = dataset['sex'].apply(lambda x: 0 if x == 'female' else 1)

In [291]:
dataset.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


Smoker column

In [292]:
dataset['smoker'] = dataset['smoker'].apply(lambda x: 0 if x == 'no' else 1)

In [293]:
dataset.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


Region column

In [294]:
dataset['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [295]:
region_dummies = pd.get_dummies(dataset['region'], drop_first = True)

In [296]:
region_dummies

Unnamed: 0,northwest,southeast,southwest
0,False,False,True
1,False,True,False
2,False,True,False
3,True,False,False
4,True,False,False
...,...,...,...
1333,True,False,False
1334,False,False,False
1335,False,True,False
1336,False,False,True


In [297]:
dataset = pd.concat([region_dummies, dataset], axis=1)

In [298]:
dataset.head(5)

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,region,charges
0,False,False,True,19,0,27.9,0,1,southwest,16884.924
1,False,True,False,18,1,33.77,1,0,southeast,1725.5523
2,False,True,False,28,1,33.0,3,0,southeast,4449.462
3,True,False,False,33,1,22.705,0,0,northwest,21984.47061
4,True,False,False,32,1,28.88,0,0,northwest,3866.8552


In [299]:
dataset.drop(['region'], axis=1, inplace=True)

In [300]:
dataset.head(5)

Unnamed: 0,northwest,southeast,southwest,age,sex,bmi,children,smoker,charges
0,False,False,True,19,0,27.9,0,1,16884.924
1,False,True,False,18,1,33.77,1,0,1725.5523
2,False,True,False,28,1,33.0,3,0,4449.462
3,True,False,False,33,1,22.705,0,0,21984.47061
4,True,False,False,32,1,28.88,0,0,3866.8552


### Creating the Training Set and the Test Set

Getting the inputs and output

In [301]:
X = dataset.iloc[:, :-1].values

In [302]:
X

array([[False, False, True, ..., 27.9, 0, 1],
       [False, True, False, ..., 33.77, 1, 0],
       [False, True, False, ..., 33.0, 3, 0],
       ...,
       [False, True, False, ..., 36.85, 0, 0],
       [False, False, True, ..., 25.8, 0, 0],
       [True, False, False, ..., 29.07, 0, 1]], dtype=object)

In [303]:
y = dataset.iloc[:, -1].values

In [304]:
y

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

Getting the Training Set and the Test Set

In [305]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Part 2 - Building and training the model

### Building the model

In [306]:
import xgboost
model = xgboost.XGBRegressor(max_depth = 2, learning_rate = 0.15, n_estimators = 100)

### Training the model

In [307]:
model.fit(X_train, y_train)

### Inference

In [308]:
y_pred = model.predict(X_test)

In [309]:
y_pred

array([10692.113 ,  6598.8257, 27070.088 , 10263.456 , 35161.81  ,
        5920.7134,  2769.0156, 15700.436 ,  4032.804 , 11295.097 ,
       19065.703 ,  7946.679 ,  5309.679 , 45443.367 , 47595.21  ,
       45572.035 , 11192.861 , 44524.387 ,  9629.097 , 22989.125 ,
        6011.0527,  9066.8545,  1487.3982,  3806.2268, 11748.269 ,
       12840.985 , 14452.423 ,  6903.086 , 11930.598 ,  2077.7434,
        7893.8247, 12610.939 ,  3248.8123,  5147.643 ,  4658.1406,
        9736.747 ,  4197.0396,  8978.448 , 25479.475 , 39203.81  ,
        5754.3926,  4336.654 , 13050.681 , 13861.13  ,  7600.435 ,
       16195.428 ,  5966.2056,  6066.678 , 43344.48  ,  6687.096 ,
       15030.593 ,  2875.126 ,  7694.3877,  2321.0713, 12472.366 ,
       11738.109 ,  4528.5195, 39877.52  , 12457.397 , 12840.985 ,
       14375.656 ,  5977.974 , 16354.046 ,  8095.7153, 12495.244 ,
        6128.3066, 19318.9   , 12858.137 ,  5922.0903,  1865.8363,
        8174.2417, 10887.226 , 10743.884 ,  7492.379 , 10383.2

In [310]:
y_test

array([ 9095.06825 ,  5272.1758  , 29330.98315 ,  9301.89355 ,
       33750.2918  ,  4536.259   ,  2117.33885 , 14210.53595 ,
        3732.6251  , 10264.4421  , 18259.216   ,  7256.7231  ,
        3947.4131  , 46151.1245  , 48673.5588  , 44202.6536  ,
        9800.8882  , 42969.8527  ,  8233.0975  , 21774.32215 ,
        5080.096   ,  7441.501   ,  1256.299   ,  2755.02095 ,
       11085.5868  , 10923.9332  , 12644.589   , 18804.7524  ,
        9715.841   ,  1131.5066  , 15828.82173 , 11842.62375 ,
        2020.5523  ,  5693.4305  ,  2904.088   ,  7448.40395 ,
        2597.779   ,  7337.748   , 23887.6627  , 38709.176   ,
        4687.797   ,  2643.2685  , 11674.13    , 12124.9924  ,
        4889.9995  , 12333.828   ,  3579.8287  ,  4391.652   ,
       42124.5153  ,  4463.2051  , 13887.204   ,  1719.4363  ,
       28476.73499 ,  1708.92575 , 10594.2257  , 25333.33284 ,
        3645.0894  , 38746.3551  , 11848.141   , 10564.8845  ,
       13880.949   ,  4753.6368  , 27941.28758 ,  8017.

## Part 3: Evaluating the model

### R-Squared

In [311]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)

In [312]:
r2

0.8834680522871015

### Adjusted R-Squared

In [313]:
k = X_test.shape[1]
n = X_test.shape[0]
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))

In [314]:
adj_r2

0.879868609886703

### k-Fold Cross Validation

In [315]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
r2_scores = cross_val_score(estimator=model,
                            X=X,
                            y=y,
                            scoring='r2',
                            cv=10)

# Calculate average R^2 and standard deviation
average_r2 = r2_scores.mean()
std_r2 = r2_scores.std()


print("Cross-Validation Results:")
print("Average R-Squared: {:.3f}".format(average_r2))
print("Standard Deviation of R-Squared: {:.3f}".format(std_r2))

Cross-Validation Results:
Average R-Squared: 0.860
Standard Deviation of R-Squared: 0.044


Grid Search for optimal Parameters

In [317]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score

# Define the model
model = xgboost.XGBRegressor()

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],       # Number of trees
    'max_depth': [3, 5, 7],              # Maximum depth of trees
    'learning_rate': [0.01, 0.1, 0.2],   # Step size shrinkage
    'subsample': [0.8, 1.0],             # Fraction of samples used for fitting
    'colsample_bytree': [0.8, 1.0],      # Fraction of features used for fitting
}

# Define the scoring metric
scorer = make_scorer(r2_score)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,          # Number of cross-validation folds
    verbose=2,      # For detailed output
    n_jobs=-1       # Use all available CPUs
)

# Fit the grid search to the data
grid_search.fit(X, y)

# Print the best parameters and the best R2 score
print("Best Parameters:", grid_search.best_params_)
print("Best R-Squared Score:", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
Best R-Squared Score: 0.8626829811315135
