In [1]:
import pandas as pd
from sqlalchemy import create_engine
import urllib.parse

def load_data_from_postgres(
    table_name,
    db_user='postgres',
    db_password='your_password',
    db_host='localhost',
    db_port='5432',
    db_name='your_db'
):
    """
    Load the dataset from a PostgreSQL table and validate its structure.

    Args:
        table_name (str): Name of the table in PostgreSQL.
        db_user (str): Database username.
        db_password (str): Database password.
        db_host (str): Hostname of the database server.
        db_port (str): Port number of the database server.
        db_name (str): Name of the PostgreSQL database.

    Returns:
        pandas.DataFrame: Loaded dataset.

    Raises:
        ValueError: If required columns are missing or the dataset is empty.
    """

    # Encode password to be URL-safe
    encoded_password = urllib.parse.quote_plus(db_password)

    # Create database connection string
    connection_str = f'postgresql+psycopg2://{db_user}:{encoded_password}@{db_host}:{db_port}/{db_name}'
    engine = create_engine(connection_str)

    # Load data from PostgreSQL table
    df = pd.read_sql_table(table_name, con=engine)

    # Check if dataset is empty
    if df.empty:
        raise ValueError("Dataset is empty")

    # Define required columns
    required_columns = [
        'job_title', 'experience_level', 'employment_type', 'company_size',
        'company_location', 'remote_ratio', 'salary_currency', 'years_experience', 'base_salary',
        'bonus', 'stock_options', 'total_salary', 'salary_in_usd', 'currency',
        'conversion_rate', 'adjusted_total_usd'
    ]

    # Check for missing columns
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")

    return df


if __name__ == "__main__":
    try:
        df = load_data_from_postgres(
            table_name='salary_data',
            db_user='postgres',
            db_password='1{Rithwik}',
            db_host='localhost',
            db_port='5432',
            db_name='postgres'
        )
        print("Dataset loaded successfully with shape:", df.shape)
        print(df.head())
    except Exception as e:
        print(f"Error loading data: {e}")

Dataset loaded successfully with shape: (100000, 18)
            job_title experience_level employment_type company_size  \
0        Data Analyst              Mid        Contract       Medium   
1     DevOps Engineer              Mid        Contract        Small   
2  Research Scientist             Lead            None       Medium   
3       Software Engr             Lead       Full-time        Large   
4       Software Engr             Lead          Intern        Large   

  company_location  remote_ratio salary_currency  years_experience  \
0          Germany             0             INR                13   
1            India           100             GBP                 9   
2          Germany             0             EUR                19   
3            India            50             INR                 7   
4          Germany           100             INR                10   

     base_salary  bonus  stock_options   total_salary  salary_in_usd currency  \
0   68407.451747  

In [2]:
def clean_data(df):
    # Clean salary columns by removing leading single quotes and converting to numeric
    salary_columns = ['base_salary', 'total_salary', 'salary_in_usd', 'adjusted_total_usd']
    for col in salary_columns:
        try:
            # Remove leading single quote and convert to float
            df[col] = df[col].astype(str).str.lstrip("'").astype(float)
        except ValueError as e:
            raise ValueError(f"Invalid values in {col}: unable to convert to numeric after removing single quotes. Error: {e}")
        
    # Remove education and skills columns if they exist
    df = df.drop(columns=['education', 'skills'], errors='ignore')

    # many duplicate rows found so dropping
    df.drop_duplicates()

    # Compute modes
    exp_mode = df['experience_level'].mode().iloc[0]
    emp_mode = df['employment_type'].mode().iloc[0]

    # Fill missing/unknown values with mode
    df['experience_level'].fillna(exp_mode, inplace=True)
    df['employment_type'].fillna(emp_mode, inplace=True)

    print(f" experience_level mode used: {exp_mode}")
    print(f" employment_type mode used: {emp_mode}")


    # Mapping inconsistent job titles to standard ones
    job_title_mapping = {
        'Software Engr': 'Software Engineer',
        'Sofware Engneer': 'Software Engineer',
        'Softwre Engineer': 'Software Engineer',
        
        'Data Scienist': 'Data Scientist',
        'Data Scntist': 'Data Scientist',
        'Dt Scientist': 'Data Scientist',
        
        'ML Engr': 'Machine Learning Engineer',
        'Machine Learning Engr': 'Machine Learning Engineer',
        'ML Enginer': 'Machine Learning Engineer',
        'ML Engineer': 'Machine Learning Engineer'
    }

    # Apply the mapping
    df['job_title'] = df['job_title'].replace(job_title_mapping)

    return df


if __name__ == "__main__":
    df_cleaned = clean_data(df)

 experience_level mode used: Mid
 employment_type mode used: Part-time


In [3]:

import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    PowerTransformer, StandardScaler, OneHotEncoder, OrdinalEncoder
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats.mstats import winsorize

# Winsorization utility
def winsorize_columns(df, columns, limits=(0.01, 0.01)):
    for col in columns:
        try:
            df[col] = winsorize(df[col], limits=limits)
        except Exception as e:
            print(f"Could not winsorize column '{col}': {e}")
    return df

# Custom Yeo-Johnson transformer for target
class YeoJohnsonTargetTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pt = PowerTransformer(method='yeo-johnson')

    def fit(self, y):
        y = np.array(y).reshape(-1, 1)
        self.pt.fit(y)
        return self

    def transform(self, y):
        y = np.array(y).reshape(-1, 1)
        return self.pt.transform(y).flatten()

    def inverse_transform(self, y_transformed):
        y_transformed = np.array(y_transformed).reshape(-1, 1)
        return self.pt.inverse_transform(y_transformed).flatten()

    def save(self, path):
        joblib.dump(self.pt, path)

    def load(self, path):
        self.pt = joblib.load(path)

# Main preprocessing function
def preprocess_data(df, save_dir="pkl_joblib_files"):
    os.makedirs(save_dir, exist_ok=True)

    target_col = 'adjusted_total_usd'
    numeric_cols = ['years_experience', 'base_salary', 'bonus', 'stock_options', 'total_salary', 'salary_in_usd']
    categorical_cols = ['salary_currency', 'currency']
    ordinal_cols = ['experience_level', 'company_size']
    ordinal_map = [
        ['Junior', 'Mid', 'Senior', 'Lead'],
        ['Small', 'Medium', 'Large']
    ]

    # Step 1: Split data
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 2: Winsorize
    X_train = winsorize_columns(X_train.copy(), numeric_cols)
    X_test = winsorize_columns(X_test.copy(), numeric_cols)

    # Step 3: Target transformation (Yeo-Johnson)
    y_transformer = YeoJohnsonTargetTransformer()
    y_transformer.fit(y_train)
    y_train_trans = y_transformer.transform(y_train)
    y_test_trans = y_transformer.transform(y_test)

    # Save the target transformer
    y_transformer.save(os.path.join(save_dir, "yeojohnson_target_transformer.pkl"))

    # Step 4: Column setup
    ordinal_features = [col for col in ordinal_cols if col in X.columns]
    ordinal_ordering = [ordering for col, ordering in zip(ordinal_cols, ordinal_map) if col in X.columns]
    nominal_features = [col for col in categorical_cols if col not in ordinal_features]

    # Step 5: Build transformers
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('yeojohnson', PowerTransformer(method='yeo-johnson')),
        ('scaler', StandardScaler())
    ])

    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(categories=ordinal_ordering))
    ])

    nominal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])

    # Step 6: Combine all
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('ord', ordinal_transformer, ordinal_features),
        ('nom', nominal_transformer, nominal_features)
    ])

    # Step 7: Transform X
    X_train_trans = preprocessor.fit_transform(X_train)
    X_test_trans = preprocessor.transform(X_test)

    #  Save the full preprocessor
    joblib.dump(preprocessor, os.path.join(save_dir, "preprocessor.pkl"))

    # Step 8: Rebuild DataFrames
    encoded_nominal_cols = preprocessor.named_transformers_['nom']['onehot'].get_feature_names_out(nominal_features)
    feature_names = numeric_cols + ordinal_features + list(encoded_nominal_cols)

    X_train_df = pd.DataFrame(X_train_trans, columns=feature_names, index=X_train.index)
    X_test_df = pd.DataFrame(X_test_trans, columns=feature_names, index=X_test.index)

    print(" Preprocessing completed and saved:")
    print(f" Preprocessor: {save_dir}/preprocessor.pkl")
    print(f" Yeo-Johnson for y: {save_dir}/yeojohnson_target_transformer.pkl")
    print(" X_train shape:", X_train_df.shape)
    print(" y_train (transformed) shape:", y_train_trans.shape)

    return X_train_df, X_test_df, y_train_trans, y_test_trans, y_transformer


if __name__=="__main__":
    X_train, X_test, y_train, y_test, y_transformer = preprocess_data(df_cleaned)


 Preprocessing completed and saved:
 Preprocessor: pkl_joblib_files/preprocessor.pkl
 Yeo-Johnson for y: pkl_joblib_files/yeojohnson_target_transformer.pkl
 X_train shape: (80000, 18)
 y_train (transformed) shape: (80000,)




In [None]:
import os
import joblib
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score

def train_evaluate_and_select_model(X_train, y_train, X_test, y_test, save_dir="pkl_joblib_files"):
    os.makedirs(save_dir, exist_ok=True)

    # Model grid
    models = {
        'ridge': {
            'model': Ridge(),
            'params': {'alpha': [0.1, 1.0, 10.0]}
        },
        'lasso': {
            'model': Lasso(),
            'params': {'alpha': [0.001, 0.01, 0.1, 1.0]}
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5]
            }
        },
        'xgboost': {
            'model': XGBRegressor(random_state=42),
            'params': {
                'n_estimators': [100],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5]
            }
        },
        'lightgbm': {
            'model': LGBMRegressor(random_state=42),
            'params': {
                'n_estimators': [100],
                'learning_rate': [0.05, 0.1],
                'max_depth': [-1, 5]
            }
        }
    }

    best_model = None
    best_score = float('inf')
    best_name = None

    for name, config in models.items():
        print(f"Training {name}...")

        grid = GridSearchCV(config['model'], config['params'],
                            cv=5, scoring='neg_root_mean_squared_error',
                            n_jobs=-1, verbose=0)
        grid.fit(X_train, y_train)

        rmse = -grid.best_score_
        print(f"{name} best RMSE: {rmse:.4f} | Best Params: {grid.best_params_}")

        if rmse < best_score:
            best_score = rmse
            best_model = grid.best_estimator_
            best_name = name

    # Evaluate on test set
    y_pred_test = best_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_r2 = r2_score(y_test, y_pred_test)

    print(f"\nBest Model: {best_name}")
    print(f"Test RMSE: {test_rmse:.4f}")
    print(f"Test R² Score: {test_r2:.4f}")

    # Save best model
    model_path = os.path.join(save_dir, "model.pkl")
    joblib.dump(best_model, model_path)
    print(f"Model saved to: {model_path}")

    return best_model, y_pred_test


if __name__=="__main__":
    best_model, y_pred_test = train_evaluate_and_select_model(X_train, y_train, X_test, y_test, save_dir="pkl_joblib_files")

Training ridge...
ridge best RMSE: 0.2400 | Best Params: {'alpha': 1.0}
Training lasso...
lasso best RMSE: 0.2404 | Best Params: {'alpha': 0.001}
Training random_forest...
random_forest best RMSE: 0.0809 | Best Params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Training xgboost...
xgboost best RMSE: 0.0831 | Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Training lightgbm...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002850 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1324
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 18
[LightGBM] [Info] Start training from score -0.000000
lightgbm best RMSE: 0.0800 | Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}

Best Model: lightgbm
Test RMSE: 0.6375
Test R² Score: 0.9452
Mod

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

def save_train_test_for_evidently(df, target_column="adjusted_total_usd", test_size=0.2, random_state=42, output_dir="data"):
    """
    Splits a DataFrame into train/test and saves both to CSV including X and y.
    This is used for Evidently drift detection.
    
    Args:
        df (pd.DataFrame): Full dataset with features + target column.
        target_column (str): Name of the target column.
        test_size (float): Fraction of test data.
        random_state (int): Seed for reproducibility.
        output_dir (str): Folder where train/test files are saved.
    """
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame.")

    os.makedirs(output_dir, exist_ok=True)

    # Split
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Save
    train_path = os.path.join(output_dir, "train.csv")
    test_path = os.path.join(output_dir, "test.csv")

    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print(f"✅ Train shape: {train_df.shape}, saved to {train_path}")
    print(f"✅ Test shape: {test_df.shape}, saved to {test_path}")

if __name__=="__main__":
    save_train_test_for_evidently(df, target_column="adjusted_total_usd")

✅ Train shape: (80000, 18), saved to data\train.csv
✅ Test shape: (20000, 18), saved to data\test.csv


In [11]:
import os
import joblib
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.utils.validation import check_is_fitted
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, RegressorMixin

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# with SHAP
def train_and_evaluate_models(models, X_train, y_train, X_test, y_test, save_dir="saved_models",shap_dir="shap_outputs"):
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(shap_dir, exist_ok=True)

    mlflow.set_tracking_uri("http://localhost:5000")
    mlflow.set_experiment("Instilit Salary Prediction")

    results = []
    best_estimators = {}

    for name, mp in models.items():
        print(f"\nTraining {name}...")
        grid = GridSearchCV(mp['model'], mp['params'], cv=3, scoring='r2', n_jobs=-1)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)

        # Metrics
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)

        metrics = {
        "mae": mae,
        "rmse": rmse,
        "r2": r2,
        "mape": mape
    }

        print(
            f"{name} | MAE: {mae:.2f} | RMSE: {rmse:.2f} | R2: {r2:.3f} | MAPE: {mape:.2f}% | Best Params: {grid.best_params_}")

        results.append({
            "model": name,
            "best_params": grid.best_params_,
            "mae": mae,
            "rmse": rmse,
            "r2": r2,
            "mape": mape
        })

        # Save the best estimator for this model
        model_filename = f"{save_dir}/{name}_best.pkl"
        joblib.dump(grid.best_estimator_, model_filename)
        print(f"Saved {name} model to {model_filename}")

        # Store best estimator in dictionary
        best_estimators[name] = grid.best_estimator_

        # 🧪 MLflow logging
        with mlflow.start_run(run_name=name) as run:
            mlflow.log_params(grid.best_params_)
            mlflow.log_metrics(metrics)

            input_example = X_test.iloc[:3] if hasattr(X_test, "iloc") else X_test[:3]
            signature = infer_signature(X_test, y_pred)

            #mlflow.sklearn.log_model(grid.best_estimator_, "model")

            mlflow.sklearn.log_model(
                grid.best_estimator_,
                artifact_path="model",  # use 'name' instead of 'artifact_path'
                input_example=input_example,
                signature=signature
                )

            # ✅ SHAP Explanation
            try:
                explainer = shap.Explainer(grid.best_estimator_, X_val)
                shap_values = explainer(X_val)

                # Plot and save SHAP summary
                shap_path = os.path.join(shap_dir, f"{name}_shap_summary.png")
                plt.figure()
                shap.summary_plot(shap_values, X_val, show=False)
                plt.savefig(shap_path, bbox_inches='tight')
                plt.close()
                mlflow.log_artifact(shap_path, artifact_path="shap_plots")
                print(f"✅ SHAP saved & logged: {shap_path}")

            except Exception as e:
                print(f"⚠️ SHAP failed for {name}: {e}")

    results_df = pd.DataFrame(results)
    print("\n📊 All Model Validation Metrics:")
    print(results_df[["model", "mae", "rmse", "r2", "mape"]].to_string(index=False))


    return results_df, best_estimators

if __name__=="__main__":
    # Model grid
    models = {
        'ridge': {
            'model': Ridge(),
            'params': {'alpha': [0.1, 1.0, 10.0]}
        },
        'lasso': {
            'model': Lasso(),
            'params': {'alpha': [0.001, 0.01, 0.1, 1.0]}
        },
        'random_forest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5]
            }
        },
        'xgboost': {
            'model': XGBRegressor(random_state=42),
            'params': {
                'n_estimators': [100],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5]
            }
        },
        'lightgbm': {
            'model': LGBMRegressor(random_state=42),
            'params': {
                'n_estimators': [100],
                'learning_rate': [0.05, 0.1],
                'max_depth': [-1, 5]
            }
        }
    }
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
    train_and_evaluate_models(models, X_train, y_train, X_test, y_test)


Training ridge...
ridge | MAE: 22.24 | RMSE: 33.89 | R2: -153.979 | MAPE: 11.75% | Best Params: {'alpha': 1.0}
Saved ridge model to saved_models/ridge_best.pkl


Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  3.22it/s]
2025/07/07 07:24:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run ridge at: http://localhost:5000/#/experiments/576117582094812384/runs/7c3e6bd243ff409e993a16067a105dca.
2025/07/07 07:24:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/576117582094812384.


✅ SHAP saved & logged: shap_outputs\ridge_shap_summary.png

Training lasso...
lasso | MAE: 21.35 | RMSE: 32.53 | R2: -141.861 | MAPE: 11.32% | Best Params: {'alpha': 0.001}
Saved lasso model to saved_models/lasso_best.pkl


Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  3.28it/s]
2025/07/07 07:24:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run lasso at: http://localhost:5000/#/experiments/576117582094812384/runs/54105ef40c4b49259ce378f83c7bcbf6.
2025/07/07 07:24:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/576117582094812384.


✅ SHAP saved & logged: shap_outputs\lasso_shap_summary.png

Training random_forest...
random_forest | MAE: 0.35 | RMSE: 0.65 | R2: 0.942 | MAPE: 0.10% | Best Params: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Saved random_forest model to saved_models/random_forest_best.pkl


Downloading artifacts: 100%|██████████| 7/7 [00:08<00:00,  1.20s/it]
2025/07/07 07:43:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/576117582094812384.


✅ SHAP saved & logged: shap_outputs\random_forest_shap_summary.png

Training xgboost...
xgboost | MAE: 0.36 | RMSE: 0.66 | R2: 0.942 | MAPE: 0.15% | Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Saved xgboost model to saved_models/xgboost_best.pkl


Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  2.90it/s]
2025/07/07 07:45:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/576117582094812384.


✅ SHAP saved & logged: shap_outputs\xgboost_shap_summary.png

Training lightgbm...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1324
[LightGBM] [Info] Number of data points in the train set: 45000, number of used features: 18
[LightGBM] [Info] Start training from score -0.003240
lightgbm | MAE: 0.35 | RMSE: 0.64 | R2: 0.945 | MAPE: 0.14% | Best Params: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Saved lightgbm model to saved_models/lightgbm_best.pkl


Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  3.11it/s]

⚠️ SHAP failed for lightgbm: Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the explainer is the same shape that the model was trained on. If your data shape is correct then please report this on GitHub. This check failed because for one of the samples the sum of the SHAP values was -0.075366, while the model output was -0.079867. If this difference is acceptable you can set check_additivity=False to disable this check.


2025/07/07 07:46:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run lightgbm at: http://localhost:5000/#/experiments/576117582094812384/runs/94823a2bb8c84e6d83db1d94798235e9.
2025/07/07 07:46:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/576117582094812384.



📊 All Model Validation Metrics:
        model       mae      rmse          r2      mape
        ridge 22.239504 33.886179 -153.978895 11.754137
        lasso 21.353578 32.534461 -141.861294 11.319156
random_forest  0.350962  0.652971    0.942454  0.103456
      xgboost  0.362972  0.655032    0.942090  0.151570
     lightgbm  0.353197  0.637282    0.945186  0.141313
