In [432]:
# Import Modules

from sklearn.datasets import make_classification, make_moons
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport

import plots as pl
import metrics as met


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [433]:
# Dataset loading and cleaning helper

def load_clean_dataset(filename, data_columns, target_column):
    """Read a CSV file, drop duplicate rows and split it into features and target.

    Parameters
    ----------
    filename : path or buffer accepted by ``pd.read_csv``
    data_columns : list of column names used as features
    target_column : name of the column used as target

    Returns
    -------
    (X, y) : tuple of numpy arrays
        ``X`` holds the values of ``data_columns`` (one row per sample) and
        ``y`` holds the target values reshaped into a single column.
    """
    # Author's note: dropping duplicate rows increased r2_score by 0.02.
    frame = pd.read_csv(filename).drop_duplicates()

    features = frame[data_columns].values
    target = frame[target_column].values
    return features, target.reshape(-1, 1)

In [434]:
# Replace nan values with random values based on mean and std of the column

def replace_nan_with_random(df, column, random_state = 42, **type):
    """Fill the missing values of ``df[column]`` in place with random draws.

    Values are drawn uniformly from ``[0, mean + 3 * std)`` of the column,
    where mean and std are computed by pandas on the non-missing entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    column : label
        Column whose missing values are replaced.
    random_state : int, default 42
        Seed passed to ``np.random.seed`` (this reseeds NumPy's global RNG).
    **type
        Optional keyword ``type='float'`` (or ``type=float``) requests
        continuous draws; anything else, or no keyword, draws integers.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If values are missing but the upper bound is not finite (for example
        when fewer than two non-missing values exist), or if NumPy rejects the
        bounds of the integer draw.
    """
    np.random.seed(random_state)

    # `**type` collects keyword arguments into a dict, so the requested kind
    # lives under the key 'type'. Comparing the dict itself to 'float' (as the
    # code did before) is always False and made the float branch unreachable.
    value_kind = type.get('type')

    # A boolean mask stays correct even when the index has duplicate labels.
    nan_mask = df[column].isnull()
    n_missing = int(nan_mask.sum())
    if n_missing == 0:
        return

    upper_bound = df[column].mean() + 3 * df[column].std()
    if not np.isfinite(upper_bound):
        raise ValueError(
            f"Cannot derive a finite upper bound for column {column!r}"
        )

    if value_kind == 'float' or value_kind is float:
        random_values = np.random.uniform(low=0, high=upper_bound, size=n_missing)
    else:
        random_values = np.random.randint(low=0, high=int(upper_bound), size=n_missing)

    df.loc[nan_mask, column] = random_values

In [435]:
# Base classes that make the custom model usable with scikit-learn tools (e.g. hyperparameter search)

from sklearn.base import BaseEstimator, RegressorMixin

# Custom Linear Regression Model

class CustomRegression(BaseEstimator, RegressorMixin):
    """Linear regression fitted with batch gradient descent.

    Parameters
    ----------
    standardize : bool, default True
        If True, features are centred and scaled with the statistics of the
        training data before fitting, and the same statistics are reused in
        ``predict``.
    learning_rate : float, default 0.01
        Step size of each gradient descent update.
    max_iter : int, default 1000
        Upper bound of the iteration index; the loop runs for indices
        ``0..max_iter`` inclusive, i.e. at most ``max_iter + 1`` updates.
    tol : float, default 1e-4
        Training stops once the absolute change of the cost between two
        consecutive iterations drops below this value.
    verbose : bool, default False
        If True, prints the cost every 100 iterations and the iteration at
        which convergence was detected.

    Attributes set by ``fit``
    -------------------------
    W : ndarray of shape (n_features, 1) -- learned weights
    b : ndarray of shape (1, 1) -- learned bias
    costs : list of float -- cost after each update
    mean, std : per-feature statistics (only when ``standardize`` is True)
    """

    def __init__(self, standardize=True,
                 learning_rate=0.01,
                 max_iter=1000,
                 tol=1e-4,
                 verbose=False):
        self.standardize = standardize
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose

    def get_coefficients(self):
        """Return the fitted ``(W, b)`` pair; requires a prior call to ``fit``."""
        return self.W, self.b

    def normalize(self, X):
        """Standardize ``X`` column-wise.

        Returns the scaled data together with the mean and the scale that
        were used, so the same transformation can be applied later.
        """
        mean = X.mean(axis=0)
        std = X.std(axis=0)
        # A constant feature has zero spread; dividing by it would fill the
        # column with NaN/inf. Using 1 instead maps such a column to zeros.
        std = np.where(std == 0, 1.0, std)
        X_new = (X - mean) / std
        return X_new, mean, std

    def linear(self, z):
        """Identity activation (kept as a hook for the hypothesis)."""
        return z

    def hypothesis(self, X, W, b):
        """Model output ``X @ W + b`` passed through ``linear``."""
        z = np.dot(X, W) + b
        return self.linear(z)

    def cost_function(self, X, y, W, b):
        """Half mean squared error of the model on ``(X, y)``."""
        m = X.shape[0]
        cost = (1 / (2 * m)) * np.sum(np.square(self.hypothesis(X, W, b) - y))
        return cost

    def gradient(self, X, y, W, b):
        """Gradients of the cost with respect to ``W`` and ``b``."""
        m = X.shape[0]
        # Evaluate the residual once and reuse it for both gradients.
        residual = self.hypothesis(X, W, b) - y
        W_grad = (1.0 / m) * np.sum(residual * X, axis=0).reshape(-1, 1)
        b_grad = (1.0 / m) * np.sum(residual, axis=0).reshape(-1, 1)
        return W_grad, b_grad

    def gradient_descent(self, X, y, W, b):
        """Run gradient descent from ``(W, b)``.

        Returns the final weights, the final bias and the list of costs
        recorded after every update.
        """
        costs = []

        for i in range(self.max_iter + 1):
            W_grad, b_grad = self.gradient(X, y, W, b)
            W = W - self.learning_rate * W_grad
            b = b - self.learning_rate * b_grad
            cost = self.cost_function(X, y, W, b)
            costs.append(cost)

            if self.verbose and i % 100 == 0:
                print(f"Iteration {i} Cost: {cost}")

            # Stop as soon as the cost has stopped changing by more than tol.
            if i > 0 and np.abs(costs[i] - costs[i - 1]) < self.tol:
                if self.verbose:
                    print(f"Converged at iteration {i}")
                break

        return W, b, costs

    def fit(self, X, y):
        """Fit the model on ``X`` (n_samples, n_features) and ``y``.

        ``y`` may be given as a 1-D array of length n_samples or as a column
        of shape (n_samples, 1). Returns ``self``.
        """
        X_new = X.copy()
        if self.standardize:
            X_new, self.mean, self.std = self.normalize(X_new)

        # A 1-D target would broadcast against the (n_samples, 1) predictions
        # into an (n_samples, n_samples) residual and silently corrupt the
        # gradients, so it is turned into a column first.
        y = np.asarray(y, dtype=float)
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        self.W = np.zeros((X_new.shape[1], 1))
        self.b = np.zeros((1, 1))
        self.W, self.b, self.costs = self.gradient_descent(X_new, y, self.W, self.b)
        return self

    def predict(self, X):
        """Predict targets for ``X``; the result has shape (n_samples, 1)."""
        X_new = X.copy()
        if self.standardize:
            # Reuse the training statistics so train and test share one scale.
            X_new = (X_new - self.mean) / self.std

        y_pred = self.hypothesis(X_new, self.W, self.b)
        return y_pred

In [436]:
# Load the raw data
# NOTE(review): the path below is absolute and machine-specific; consider a
# path relative to the notebook.

data_columns = ["age", "sex", "bmi", "children", "smoker", "region"]
target_column = "charges"
X, y = load_clean_dataset(
    r'C:\Users\Артем\vscode_source\MLIntro\LinearRegression\IW1\insurance.csv',
    data_columns,
    target_column,
)

# Encode the categorical columns as numbers for the Linear Regression model
# NOTE(review): region becomes an integer code 0..3, which imposes an order on
# the regions; confirm this is intended.

sex_idx = data_columns.index("sex")
smoker_idx = data_columns.index("smoker")
region_idx = data_columns.index("region")

is_female = X[:, sex_idx] == 'female'
X[:, sex_idx] = np.where(is_female, 0, 1)

is_non_smoker = X[:, smoker_idx] == 'no'
X[:, smoker_idx] = np.where(is_non_smoker, 0, 1)

class_mapping = {'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3}
X[:, region_idx] = list(map(class_mapping.__getitem__, X[:, region_idx]))

X = X.astype(float)
X

array([[19.  ,  0.  , 27.9 ,  0.  ,  1.  ,  0.  ],
       [18.  ,  1.  , 33.77,  1.  ,  0.  ,  1.  ],
       [28.  ,  1.  , 33.  ,  3.  ,  0.  ,  1.  ],
       ...,
       [18.  ,  0.  , 36.85,  0.  ,  0.  ,  1.  ],
       [21.  ,  0.  , 25.8 ,  0.  ,  0.  ,  0.  ],
       [61.  ,  0.  , 29.07,  0.  ,  1.  ,  2.  ]])

In [437]:
# Hold out 20% of the samples as a test set (fixed seed for reproducibility)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(1069, 6) (268, 6) (1069, 1) (268, 1)


In [438]:
# Custom gradient-descent model (verbose, so the training log is printed)

custom_model = CustomRegression(
    standardize=True,
    learning_rate=0.01,
    max_iter=10000,
    tol=1e-4,
    verbose=True,
)

In [439]:
# Fit the custom model on the training split.
# CustomRegression.fit returns the estimator itself, so `fitted_model` and
# `custom_model` refer to the same object.

fitted_model = custom_model.fit(X_train, y_train)

Iteration 0 Cost: 150671930.72471312
Iteration 100 Cost: 36390956.357724585
Iteration 200 Cost: 20954464.03460396
Iteration 300 Cost: 18843055.25342011
Iteration 400 Cost: 18550602.738812055
Iteration 500 Cost: 18509578.021963354
Iteration 600 Cost: 18503748.74495675
Iteration 700 Cost: 18502909.57989984
Iteration 800 Cost: 18502787.164823245
Iteration 900 Cost: 18502769.06503486
Iteration 1000 Cost: 18502766.351919178
Iteration 1100 Cost: 18502765.939497102
Iteration 1200 Cost: 18502765.875898797
Converged at iteration 1243


In [440]:
# Predict with the fitted custom model on both splits.
# Predictions are column vectors of shape (n_samples, 1).

y_train_pred_custom = custom_model.predict(X_train)

y_test_pred_custom = custom_model.predict(X_test)

In [441]:
# Evaluate the custom model on the train and test splits

metrics_custom_train = met.evaluate_regression(y_train, y_train_pred_custom)

# One print call per split; sep="\n" reproduces the line breaks that separate
# print calls would emit.
print(
    "Train scores:\n",
    f"MAE Score: \n{metrics_custom_train['mae']}",
    f"MSE Score: \n{metrics_custom_train['mse']}",
    f"RMSE Score: \n{metrics_custom_train['rmse']}",
    f"R^2 Score: \n{metrics_custom_train['r2']}\n",
    sep="\n",
)

metrics_custom_test = met.evaluate_regression(y_test, y_test_pred_custom)

print(
    "Test scores:\n",
    f"MAE Score: \n{metrics_custom_test['mae']}",
    f"MSE Score: \n{metrics_custom_test['mse']}",
    f"RMSE Score: \n{metrics_custom_test['rmse']}",
    f"R^2 Score: \n{metrics_custom_test['r2']}\n",
    sep="\n",
)

Train scores:

MAE Score: 
4181.313028225283
MSE Score: 
37005531.738847174
RMSE Score: 
6083.217219436371
R^2 Score: 
0.7297182858021146

Test scores:

MAE Score: 
4182.360866551045
MSE Score: 
35493445.98690371
RMSE Score: 
5957.637617957617
R^2 Score: 
0.8068447636157186





In [442]:
# Import sklearn Linear Regression model

from sklearn.linear_model import LinearRegression

In [443]:
# Reference model: scikit-learn's LinearRegression with default settings,
# used as the baseline the custom model is compared against.

sklearn_model = LinearRegression()

In [444]:
# Fit the scikit-learn model on the same training split as the custom model.
# Unlike the custom model, the features are passed in unscaled.

sklearn_model.fit(X_train, y_train)

In [445]:
# Predict with the fitted scikit-learn model on both splits

y_train_pred_sklearn = sklearn_model.predict(X_train)

y_test_pred_sklearn = sklearn_model.predict(X_test)

In [446]:
# Evaluate the scikit-learn model on the train and test splits

metrics_sklearn_train = met.evaluate_regression(y_train, y_train_pred_sklearn)

# One print call per split; sep="\n" reproduces the line breaks that separate
# print calls would emit.
print(
    "Train scores:\n",
    f"MAE Score: \n{metrics_sklearn_train['mae']}",
    f"MSE Score: \n{metrics_sklearn_train['mse']}",
    f"RMSE Score: \n{metrics_sklearn_train['rmse']}",
    f"R^2 Score: \n{metrics_sklearn_train['r2']}\n",
    sep="\n",
)

metrics_sklearn_test = met.evaluate_regression(y_test, y_test_pred_sklearn)

print(
    "Test scores:\n",
    f"MAE Score: \n{metrics_sklearn_test['mae']}",
    f"MSE Score: \n{metrics_sklearn_test['mse']}",
    f"RMSE Score: \n{metrics_sklearn_test['rmse']}",
    f"R^2 Score: \n{metrics_sklearn_test['r2']}\n",
    sep="\n",
)

Train scores:

MAE Score: 
4181.321578104919
MSE Score: 
37005531.72811554
RMSE Score: 
6083.217218554302
R^2 Score: 
0.7297182858804965

Test scores:

MAE Score: 
4182.353155288298
MSE Score: 
35493102.61165051
RMSE Score: 
5957.608799816459
R^2 Score: 
0.8068466322629112





In [447]:
# Import modules for Feature selection for sklearn model

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [448]:
# Feature selection
# NOTE: the data has 6 features, so k=6 keeps every column and this model is
# equivalent to the plain LinearRegression fitted on all features.

selector = SelectKBest(score_func=f_regression, k=6)
# f_regression expects a 1-D target; passing the (n_samples, 1) column raised a
# DataConversionWarning, so the target is flattened for the selector only.
X_train_selected = selector.fit_transform(X_train, y_train.ravel())
X_test_selected = selector.transform(X_test)

feature_model = LinearRegression()
feature_model.fit(X_train_selected, y_train)

train_score = feature_model.score(X_train_selected, y_train)
test_score = feature_model.score(X_test_selected, y_test)

print("Train R^2 score:", train_score)
print("Test R^2 score:", test_score)

Train R^2 score: 0.7297182858804965
Test R^2 score: 0.8068466322629111


  y = column_or_1d(y, warn=True)


In [449]:
# Import modules for Hyperparameter tuning

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Unfitted estimator that is handed to the grid and random searches below
selection_model = LinearRegression()

In [450]:
# Hyperparameter tuning - Grid Search (scikit-learn model)

# The only option explored is whether an intercept is fitted.
param_grid_grid = {'fit_intercept': [True, False]}

grid_search_sklearn = GridSearchCV(
    estimator=selection_model,
    param_grid=param_grid_grid,
    scoring='r2',
    cv=5,
)
grid_search_sklearn.fit(X_train, y_train)

# Best parameters and best score found by the search.
# NOTE: best_score_ is the mean cross-validated R^2 on the training data.
best_params_grid = grid_search_sklearn.best_params_
best_score_grid_sklearn = grid_search_sklearn.best_score_

y_pred_grid_sklearn = grid_search_sklearn.predict(X_test)

print("Best Parameters (Grid Search):", best_params_grid)
print("R^2 Score (Grid Search) Train data:", best_score_grid_sklearn)
print("R^2 Score (Grid Search) Test data:", r2_score(y_test, y_pred_grid_sklearn))

Best Parameters (Grid Search): {'fit_intercept': True}
R^2 Score (Grid Search) Train data: 0.7258515547048175
R^2 Score (Grid Search) Test data: 0.8068466322629112


In [451]:
# Hyperparameter tuning - Random Search (scikit-learn model)

# Define the hyperparameters grid for Random Search
param_grid_random = {
    'fit_intercept': [True, False],
}

# Perform Random Search.
# The search space holds only 2 candidates; asking for more iterations than
# that (previously n_iter=4) makes scikit-learn warn and evaluate 2 anyway, so
# n_iter is set to the size of the space.
random_search_sklearn = RandomizedSearchCV(
    estimator=selection_model,
    param_distributions=param_grid_random,
    n_iter=2,
    cv=5,
    random_state=42,
    scoring='r2',
)
random_search_sklearn.fit(X_train, y_train)

y_pred_random_sklearn = random_search_sklearn.predict(X_test)

# Get the best parameters and the best score
# NOTE: best_score_ is the mean cross-validated R^2 on the training data.
best_params_random = random_search_sklearn.best_params_
best_score_random_sklearn = random_search_sklearn.best_score_

print("Best Parameters (Random Search):", best_params_random)
print("R^2 Score (Random Search) Train data:", best_score_random_sklearn)
print("R^2 Score (Random Search) Test data:", r2_score(y_test, y_pred_random_sklearn))

Best Parameters (Random Search): {'fit_intercept': True}
R^2 Score (Random Search) Train data: 0.7258515547048175
R^2 Score (Random Search) Test data: 0.8068466322629112




In [452]:
# Create a quiet (verbose=False) CustomRegression instance to use as the base
# estimator of the hyperparameter searches below

custom_reg = CustomRegression(standardize=True, verbose=False)

In [453]:
# Hyperparameter tuning for the custom model - Grid Search

param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'max_iter': [1000, 2000, 3000],
    'tol': [1e-4, 1e-5, 1e-6],
}

# Exhaustive search over every combination, scored by 5-fold R^2
grid_search_custom = GridSearchCV(
    estimator=custom_reg,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
grid_search_custom.fit(X_train, y_train)

# Best parameters and best score found by the search.
# NOTE: best_score_ is the mean cross-validated R^2 on the training data.
best_params = grid_search_custom.best_params_
best_score_grid_custom = grid_search_custom.best_score_

y_pred_grid_custom = grid_search_custom.predict(X_test)

print("Best Parameters:", best_params)
print("R^2 Score (Grid Search) Train data:", best_score_grid_custom)
print("R^2 Score (Grid Search) Test data:", r2_score(y_test, y_pred_grid_custom))

Best Parameters: {'learning_rate': 0.01, 'max_iter': 1000, 'tol': 0.0001}
R^2 Score (Grid Search) Train data: 0.7258547372000121
R^2 Score (Grid Search) Test data: 0.8068290810790547


In [454]:
# Hyperparameter tuning for the custom model - Random Search

param_random = {
    'learning_rate': [0.00001, 0.0001, 0.001, 0.01, 0.1],
    'max_iter': [1000, 2000, 3000, 4000, 5000],
    'tol': [1e-4, 1e-5, 1e-6, 1e-7, 1e-8],
}

# Sample 10 of the possible combinations, scored by 5-fold R^2
random_search_custom = RandomizedSearchCV(
    estimator=custom_reg,
    param_distributions=param_random,
    n_iter=10,
    scoring='r2',
    cv=5,
    random_state=42,
)
random_search_custom.fit(X_train, y_train)

# Best parameters and best score found by the search.
# NOTE: best_score_ is the mean cross-validated R^2 on the training data.
best_params = random_search_custom.best_params_
best_score_random_custom = random_search_custom.best_score_

y_pred_random_custom = random_search_custom.predict(X_test)

print("Best Parameters:", best_params)
print("R^2 Score (Random Search) Train data:", best_score_random_custom)
print("R^2 Score (Random Search) Test data:", r2_score(y_test, y_pred_random_custom))

Best Parameters: {'tol': 1e-05, 'max_iter': 1000, 'learning_rate': 0.01}
R^2 Score (Random Search) Train data: 0.7258547372000121
R^2 Score (Random Search) Test data: 0.8068290810790547


In [455]:
# Feature columns, in the same order as the columns of X
# (same names and order as `data_columns` used when loading the data)

feature_names = ["age", "sex", "bmi", "children", "smoker", "region"]

In [463]:
# Feature importance for custom model
# NOTE: custom_model was created with standardize=True, so these weights refer
# to standardized features and are not on the same scale as the scikit-learn
# coefficients.

coefficients_custom, bias_custom = custom_model.get_coefficients()

# Print one weight per feature, then the shared bias term
print("Feature importances for custom model (W):")
for feature_name, weight in zip(feature_names, coefficients_custom[:, 0]):
    print(f"{feature_name}: {weight}")
print("Bias for all of them:")
print("       ", bias_custom[0][0])

Feature importances for custom model (W):
age: 3480.699336540979
sex: -49.77606211376367
bmi: 1890.9508397183404
children: 637.850049948175
smoker: 9224.025301807917
region: 262.26367812136704
Bias for all of them:
        13030.15493719205


In [459]:
# Feature importance for sklearn model
# NOTE: sklearn_model was fitted on unscaled features, so these coefficients
# are expressed per raw unit of each feature.

coefficients_sklearn = sklearn_model.coef_

# coef_ is indexed as [0][feature] here; print one coefficient per feature
print("Feature importances for sklearn model:")
for feature_name, coefficient in zip(feature_names, coefficients_sklearn[0]):
    print(f"{feature_name}: {coefficient}")

Feature importances for sklearn model:
age: 248.76407133644233
sex: -99.6953941696379
bmi: 312.60904469262135
children: 534.1208765412489
smoker: 23052.15275172916
region: 237.62514748442686


Random search appeared to be a better choice than grid search in both cases, although the R^2 scores reported above are identical for the two search methods.

In [460]:
# Conclusion
# Final comparison between the custom and the scikit-learn model.
# Every difference is computed as (scikit-learn score) - (custom score).

grid_train_gap = best_score_grid_sklearn - best_score_grid_custom
grid_test_gap = r2_score(y_test, y_pred_grid_sklearn) - r2_score(y_test, y_pred_grid_custom)
print("R^2 Score difference (Grid Search) Train data:", grid_train_gap)
print("R^2 Score difference (Grid Search) Test data:", grid_test_gap)
print()
random_train_gap = best_score_random_sklearn - best_score_random_custom
random_test_gap = r2_score(y_test, y_pred_random_sklearn) - r2_score(y_test, y_pred_random_custom)
print("R^2 Score difference (Random Search) Train data:", random_train_gap)
print("R^2 Score difference (Random Search) Test data:", random_test_gap)

R^2 Score difference (Grid Search) Train data: -3.1824951945980118e-06
R^2 Score difference (Grid Search) Test data: 1.7551183856490127e-05

R^2 Score difference (Random Search) Train data: -3.1824951945980118e-06
R^2 Score difference (Random Search) Test data: 1.7551183856490127e-05
