In [1]:
import math
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Location of the training CSV on the author's machine.
path = "C:/Users/tokud/Projects/Insurance Linear Regression/training.csv"

# Load the file, report its (rows, columns), then preview the first rows
# as the cell's displayed value.
df = pd.read_csv(filepath_or_buffer=path)
print(df.shape)
df.head(n=5)

(100000, 11)


Unnamed: 0,transaction_id,amount,transaction_date,month_index,product_Flood,product_Homeowners,product_Renters,region_Midwest,region_Northeast,region_South,region_West
0,1,21.52,2024-01-21,1,0,0,1,0,0,0,1
1,2,12.76,2024-03-28,3,0,0,1,0,1,0,0
2,3,20.46,2024-11-04,11,0,0,1,0,1,0,0
3,4,91.27,2024-07-10,7,1,0,0,0,0,1,0
4,5,197.09,2024-09-09,9,1,0,0,0,0,1,0


In [3]:
# Predictor columns: the month index plus the product_* and region_* columns.
features = [
    'month_index',
    'product_Flood', 'product_Homeowners', 'product_Renters',
    'region_Midwest', 'region_Northeast', 'region_South', 'region_West',
]

# Design matrix and regression target.
X, y = df[features], df['amount']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print each partition's shape; every shape except the last is followed by a blank line.
for part in (X_train, X_test, y_train):
    print(part.shape, "\n")
print(y_test.shape)

(80000, 8) 

(20000, 8) 

(80000,) 

(20000,)


In [4]:
# Candidate regressors for the randomized hyper-parameter search.
# Each entry pairs an unfitted estimator ('model') with the search space that
# is handed to RandomizedSearchCV as `param_distributions` ('params').
# Estimators with internal randomness get a fixed `random_state` so repeated
# runs of the notebook produce the same tuning results.
model_dict = [
    {
        'model': LinearRegression(),
        'params': {
            'copy_X': [True, False],
            'fit_intercept': [True, False],
            'positive': [True, False],
            'tol': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
        }
    },
    {
        'model': xgb.XGBRegressor(
            objective='reg:squarederror',
            tree_method='hist',
            random_state=42  # explicit seed for row/column subsampling
            ),
        'params': {
            'max_depth': randint(3, 15),
            'learning_rate': np.logspace(-3, 0, 100),
            'n_estimators': randint(100, 1000),
            # scipy's uniform(loc, scale) spans [loc, loc + scale], i.e. 0.6 to 1.0
            'subsample': uniform(0.6, 0.4),
            'colsample_bytree': uniform(0.6, 0.4)
        }
    },
    {

        'model': Lasso(),
        'params': {
            'alpha': np.logspace(-4, 2, 50),
            'fit_intercept': [True, False],
            'tol': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
        }
    },
    {
        'model': Ridge(),
        'params': {
            'alpha': np.logspace(-4, 3, 50),
            'fit_intercept': [True, False],
            'tol': [1e-2, 1e-3, 1e-4, 1e-5],
            'solver': ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        }
    },
    {
        # Seeded: feature sub-sampling and the 'random' splitter are stochastic.
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'max_depth': randint(3, 30),
            'min_samples_split': randint(2, 20),
            'min_samples_leaf': randint(1, 10),
            'max_features': ['sqrt', 'log2', None, 0.3, 0.5, 0.8],
            'max_leaf_nodes': [None] + list(range(10, 200, 20)),
            'splitter': ['best', 'random']
        }
    },
    {
        # Seeded: bootstrap sampling and feature sub-sampling are stochastic.
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'max_depth': randint(3, 25),
            'min_samples_leaf': randint(1, 10),
            'min_samples_split': randint(2, 20),
            # uniform(0.3, 0.7) spans 0.3 to 1.0 (fraction of features per split)
            'max_features': uniform(0.3, 0.7),
            'n_estimators': randint(100, 500)
        }
    }
]

In [5]:
def prGreen(s):
    """Print ``s`` preceded by the ANSI ``92`` colour code and a space, then reset the colour."""
    print(f"\033[92m {s}\033[00m")
def prCyan(s):
    """Print ``s`` preceded by the ANSI ``96`` colour code and a space, then reset the colour."""
    print(f"\033[96m {s}\033[00m")

# Tuning results keyed by estimator class name; each value holds the search's
# best estimator, best score and best parameters.
best_estimators = {}

for model_type in model_dict:
    model_name = model_type['model'].__class__.__name__
    print(f"---Tuning {model_name} with RandomizedSearchCV")

    # Up to 50 sampled candidates, 5-fold CV, all cores; the seed fixes which
    # candidates are drawn from the search space.
    search_kwargs = dict(cv=5, n_iter=50, random_state=42, n_jobs=-1)
    randomized_search = RandomizedSearchCV(
        model_type['model'], model_type['params'], **search_kwargs
    )
    randomized_search.fit(X_train, y_train)

    # Copy the three fitted attributes (best_estimator_, best_score_, best_params_).
    best_estimators[model_name] = {
        field: getattr(randomized_search, f"{field}_")
        for field in ('best_estimator', 'best_score', 'best_params')
    }
    tuned = best_estimators[model_name]

    prGreen(f"Best score for {model_name}: {tuned['best_score']:.4f}")
    prCyan(f"Best parameters for {model_name}: {tuned['best_params']}\n")
    
    
# Score every tuned estimator from the randomized search on the held-out test split.
for name, info in best_estimators.items():
    print(f"Predicting with best {name}")
    y_pred = info['best_estimator'].predict(X_test)
    mse_new = mean_squared_error(y_test, y_pred)
    r2_new = r2_score(y_test, y_pred)

    # The printed value is sqrt(MSE), i.e. the RMSE, so it is labelled as such
    # (the previous label called it "Mean Squared Error").
    prGreen(f"Root Mean Squared Error: {math.sqrt(mse_new)}")
    # R^2 is multiplied by 100, so the printed number is a percentage.
    prCyan(f"R-Squared: {(r2_new)*100}\n")


---Tuning LinearRegression with RandomizedSearchCV




[92m Best score for LinearRegression: 0.9188[00m
[96m Best parameters for LinearRegression: {'tol': 0.01, 'positive': True, 'fit_intercept': False, 'copy_X': True}
[00m
---Tuning XGBRegressor with RandomizedSearchCV
[92m Best score for XGBRegressor: 0.9187[00m
[96m Best parameters for XGBRegressor: {'colsample_bytree': np.float64(0.9268888800804863), 'learning_rate': np.float64(0.06135907273413173), 'max_depth': 3, 'n_estimators': 101, 'subsample': np.float64(0.9985014799031697)}
[00m
---Tuning Lasso with RandomizedSearchCV
[92m Best score for Lasso: 0.9188[00m
[96m Best parameters for Lasso: {'tol': 0.01, 'fit_intercept': True, 'alpha': np.float64(0.0007196856730011522)}
[00m
---Tuning Ridge with RandomizedSearchCV
[92m Best score for Ridge: 0.9188[00m
[96m Best parameters for Ridge: {'tol': 0.001, 'solver': 'svd', 'fit_intercept': True, 'alpha': np.float64(0.3727593720314938)}
[00m
---Tuning DecisionTreeRegressor with RandomizedSearchCV
[92m Best score for DecisionTr

In [6]:
# Print every randomized-search entry as "<model name>: <result dict>".
for key in best_estimators:
    value = best_estimators[key]
    print("{}: {}".format(key, value))

LinearRegression: {'best_estimator': LinearRegression(fit_intercept=False, positive=True, tol=0.01), 'best_score': np.float64(0.9188155224594288), 'best_params': {'tol': 0.01, 'positive': True, 'fit_intercept': False, 'copy_X': True}}
XGBRegressor: {'best_estimator': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=np.float64(0.9268888800804863), device=None,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, feature_weights=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None,
             learning_rate=np.float64(0.06135907273413173), max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             m

In [12]:
# Narrow grids for the exhaustive GridSearchCV pass. Where the randomized
# search output is visible (XGBoost, Lasso, Ridge, LinearRegression) the values
# sit around the best parameters it reported; the tree/forest grids presumably
# follow the same idea -- TODO confirm against the full randomized-search output.
# NOTE: this rebinds `model_dict`, replacing the randomized-search spaces.
# Estimators with internal randomness get a fixed `random_state` so repeated
# runs of the notebook produce the same tuning results.
model_dict = [
    {
        'model': LinearRegression(),
        'params': {
            'copy_X': [True],
            'fit_intercept': [True, False],
            'positive': [True],
            'tol': [1e-2]
        }
    },
    {
        'model': xgb.XGBRegressor(
            objective='reg:squarederror',
            tree_method='hist',
            random_state=42  # explicit seed for row/column subsampling
            ),
        'params': {
            'max_depth': [3, 4],
            'learning_rate': [0.05, 0.06],
            'n_estimators': [75, 100, 125],
            'subsample': [0.98, 0.99],
            'colsample_bytree': [0.92, 0.95]
        }
    },
    {

        'model': Lasso(),
        'params': {
            'alpha': [0.0007, 0.001],
            'fit_intercept': [True],
            'tol': [1e-2, 1e-3]
        }
    },
    {
        'model': Ridge(),
        'params': {
            'alpha': [0.37, 0.5],
            'fit_intercept': [True],
            'tol': [1e-3, 1e-4],
            'solver': ["svd"]
        }
    },
    {
        # Seeded so tie-breaking between equally good splits is repeatable.
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'max_depth': [3, 4],
            'min_samples_split': [19, 20],
            'min_samples_leaf': [8, 9, 10],
            'max_features': [None],
            'max_leaf_nodes': [140, 150, 160],
            'splitter': ['best']
        }
    },
    {
        # Seeded: bootstrap sampling and feature sub-sampling are stochastic.
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'max_depth': [3, 4],
            'min_samples_leaf': [2, 3],
            'min_samples_split': [13, 14],
            'max_features': [0.74, 0.75, 0.76],
            'n_estimators': [317, 325]
        }
    }
]

In [13]:
# Grid-search results keyed by estimator class name; each value holds the
# search's best estimator, best score and best parameters.
best_estimators_grid = {}

for model_type in model_dict:
    model_name = model_type['model'].__class__.__name__
    print(f"---Tuning {model_name} with GridSearchCV")

    # Exhaustive search over the grid with 5-fold CV on all cores.
    grid_search = GridSearchCV(
        model_type['model'], model_type['params'], cv=5, n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # Copy the three fitted attributes (best_estimator_, best_score_, best_params_).
    best_estimators_grid[model_name] = {
        field: getattr(grid_search, f"{field}_")
        for field in ('best_estimator', 'best_score', 'best_params')
    }
    tuned = best_estimators_grid[model_name]

    prGreen(f"Best score for {model_name}: {tuned['best_score']:.4f}")
    prCyan(f"Best parameters for {model_name}: {tuned['best_params']}\n")
    
    
# Score every tuned estimator from the grid search on the held-out test split.
for name, info in best_estimators_grid.items():
    print(f"Predicting with best {name}")
    y_pred = info['best_estimator'].predict(X_test)
    mse_new = mean_squared_error(y_test, y_pred)
    r2_new = r2_score(y_test, y_pred)

    # The printed value is sqrt(MSE), i.e. the RMSE, so it is labelled as such
    # (the previous label called it "Mean Squared Error").
    prGreen(f"Root Mean Squared Error: {math.sqrt(mse_new)}")
    # R^2 is multiplied by 100, so the printed number is a percentage.
    prCyan(f"R-Squared: {(r2_new)*100}\n")

---Tuning LinearRegression with GridSearchCV
[92m Best score for LinearRegression: 0.9188[00m
[96m Best parameters for LinearRegression: {'copy_X': True, 'fit_intercept': False, 'positive': True, 'tol': 0.01}
[00m
---Tuning XGBRegressor with GridSearchCV
[92m Best score for XGBRegressor: 0.9187[00m
[96m Best parameters for XGBRegressor: {'colsample_bytree': 0.92, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 125, 'subsample': 0.99}
[00m
---Tuning Lasso with GridSearchCV
[92m Best score for Lasso: 0.9188[00m
[96m Best parameters for Lasso: {'alpha': 0.001, 'fit_intercept': True, 'tol': 0.01}
[00m
---Tuning Ridge with GridSearchCV
[92m Best score for Ridge: 0.9188[00m
[96m Best parameters for Ridge: {'alpha': 0.37, 'fit_intercept': True, 'solver': 'svd', 'tol': 0.001}
[00m
---Tuning DecisionTreeRegressor with GridSearchCV
[92m Best score for DecisionTreeRegressor: 0.9187[00m
[96m Best parameters for DecisionTreeRegressor: {'max_depth': 3, 'max_features': None,

In [14]:
# Print every grid-search entry as "<model name> <result dict>".
for key in best_estimators_grid:
    value = best_estimators_grid[key]
    print(f"{key} {value}")

LinearRegression {'best_estimator': LinearRegression(fit_intercept=False, positive=True, tol=0.01), 'best_score': np.float64(0.9188155224594288), 'best_params': {'copy_X': True, 'fit_intercept': False, 'positive': True, 'tol': 0.01}}
XGBRegressor {'best_estimator': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.92, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.05, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=125,
             n_jobs=None, num_parallel_tree

In [19]:
# Model chosen for export: the grid search's best Lasso estimator.
lasso_result = best_estimators_grid['Lasso']
best_model = lasso_result['best_estimator']

In [20]:
import joblib

# Destination of the serialized model. Despite its name, `directory` holds the
# full path of the .pkl file, not a folder.
directory = "C:/Users/tokud/Projects/Insurance Linear Regression/best_model.pkl"

# Persist the selected estimator; the returned list of written file names is
# shown as the cell's output.
joblib.dump(value=best_model, filename=directory)

['C:/Users/tokud/Projects/Insurance Linear Regression/best_model.pkl']