### Outlier Robust Regressors:

1. HuberRegressor
2. QuantileRegressor
3. RANSACRegressor
4. TheilSenRegressor

### HuberRegressor


In [None]:
# Definition:
"""
HuberRegressor:
A linear model from sklearn.linear_model that is fitted with the Huber loss.
The loss is squared for samples treated as inliers and absolute for samples
treated as outliers, which limits the influence of outliers on the fit.
"""

# Code Example:
from sklearn.linear_model import HuberRegressor

# Estimator with every tunable constructor argument spelled out explicitly.
huber_regressor = HuberRegressor(
    epsilon=1.35,        # Outlier threshold; the smaller it is, the more robust the fit
    max_iter=100,        # Upper bound on optimizer iterations
    alpha=0.0001,        # Strength of the L2 penalty
    warm_start=False,    # Start each fit from scratch rather than from earlier coefficients
    fit_intercept=True,  # Estimate an intercept term
    tol=1e-5,            # Stopping tolerance of the optimizer
)

# Search space: candidate values per constructor argument, keyed by argument name.
huber_regressor_hyperparameters = dict(
    epsilon=[1.0, 1.35, 1.5, 2.0],        # Robustness / outlier threshold
    alpha=[0.0001, 0.001, 0.01, 0.1],     # L2 penalty strength
    fit_intercept=[True, False],          # With or without an intercept
    max_iter=[100, 500, 1000, 5000],      # Iteration budget
    tol=[1e-5, 1e-4, 1e-3, 1e-2],         # Convergence tolerance
    warm_start=[True, False],             # Reuse the previous solution or not
)


### QuantileRegressor

In [None]:
# Definition:
"""
QuantileRegressor:
A linear regression model from sklearn.linear_model that optimizes the pinball loss for a specified quantile,
making it robust to outliers. This model uses L1 regularization similar to Lasso regression.
"""

from sklearn.linear_model import QuantileRegressor

quantile_regressor = QuantileRegressor(
    quantile=0.5,            # Predicts the specified quantile (default is 0.5, i.e., median)
    alpha=1.0,               # L1 regularization strength
    fit_intercept=True,      # Whether to calculate the intercept for this model
    solver="highs",          # Linear-programming method handed to scipy.optimize.linprog
    solver_options=None,     # Extra options forwarded to the solver
)

quantile_regressor_hyperparameters = {
    "quantile": [0.1, 0.25, 0.5, 0.75, 0.9],   # Quantiles to predict
    "alpha": [0.1, 0.5, 1.0, 5.0],             # Regularization strength
    "fit_intercept": [True, False],            # Whether to fit the intercept
    # Only the HiGHS methods are searched. The legacy "revised simplex" method
    # was removed from scipy.optimize.linprog in SciPy 1.11, so keeping it in
    # the grid makes every fit for that candidate fail on current SciPy.
    "solver": ["highs", "highs-ds", "highs-ipm"],                     # Optimization solver
    "solver_options": [None, {"maxiter": 1000}, {"disp": True}],      # Solver-specific options
}


# "highs"       : Lets HiGHS choose between its dual simplex and interior-point methods. Recommended for most cases.
# "highs-ds"    : HiGHS dual revised simplex method.
# "highs-ipm"   : HiGHS interior-point method.
# All three HiGHS methods accept sparse input.
# "revised simplex" : Legacy SciPy method; unavailable from SciPy 1.11 onwards, hence not part of the grid above.

# solver_options : A dictionary of additional parameters passed to scipy.optimize.linprog as `options`.

# None            : Uses default solver options.
# {"maxiter": 1000}  : Sets the maximum number of iterations for the solver to 1000.
# {"disp": True}     : Asks the solver to print convergence messages during optimization.



### RANSACRegressor

In [None]:
# Definition:
"""
RANSACRegressor:
An iterative algorithm for robustly estimating parameters from a subset of inliers from the complete dataset.
RANSAC (Random Sample Consensus) works by repeatedly selecting a random subset of the data, fitting a model,
and evaluating the number of inliers, which helps reject outliers.
"""


# numpy supplies np.inf, used below for the "no limit" stopping criteria.
# Without this import the cell fails with NameError on the first np.inf.
import numpy as np
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression

# Initialize RANSACRegressor with default or custom parameters
ransac_regressor = RANSACRegressor(
    estimator=LinearRegression(),   # Base estimator for model fitting, defaults to LinearRegression
    min_samples=0.5,                # Minimum samples chosen randomly for fitting (as fraction of data)
    residual_threshold=5.0,         # Maximum residual to be considered as an inlier
    max_trials=100,                 # Maximum iterations for random sample selection
    max_skips=np.inf,               # Maximum skipped iterations due to invalid samples
    stop_n_inliers=np.inf,          # Stop if this many inliers are found
    stop_score=np.inf,              # Stop if score exceeds this threshold
    stop_probability=0.99,          # Confidence level for stopping the RANSAC iterations
    loss='absolute_error',          # Loss function for evaluating the error per sample
    random_state=42,                # Random seed for reproducibility
)


# Hyperparameters:
ransac_regressor_hyperparameters = {
    "estimator": [LinearRegression(), None],   # Base estimator; None falls back to LinearRegression
    # NOTE(review): values below 1 are fractions of the data, but the integer 1
    # is read as an absolute count (a single sample), not as 100% - confirm
    # that this is the intended candidate.
    "min_samples": [0.1, 0.5, 1],              # Minimum samples required for fitting
    "residual_threshold": [1.0, 5.0, 10.0],    # Threshold for classifying inliers
    "max_trials": [50, 100, 500],              # Maximum number of iterations
    "max_skips": [np.inf, 10, 50],             # Max skipped iterations due to invalid data
    "stop_n_inliers": [5, 10, np.inf],         # Stop when this many inliers are found
    "stop_score": [0.5, 1.0, np.inf],          # Stop if score exceeds this value
    "stop_probability": [0.95, 0.99, 0.999],   # Confidence for stopping iteration
    "loss": ["absolute_error", "squared_error"],  # Loss function for error calculation
    "random_state": [None, 42, 0],             # Random seed for reproducibility
}


### TheilSenRegressor

In [1]:
"""
Theil-Sen Estimator:
A robust multivariate regression model that calculates least squares solutions from subsets of samples.
It can handle outliers and provides robust parameter estimation using the spatial median.
The algorithm computes least squares on a set of subsets of size 'n_subsamples' from the input data.
"""

from sklearn.linear_model import TheilSenRegressor

# copy_X is deliberately not passed: scikit-learn 1.6 deprecated it for
# TheilSenRegressor (the input is always copied, so the flag has no effect)
# and schedules its removal for 1.8. Leaving it out behaves the same on older
# releases and keeps this cell working on newer ones.
theil_sen_regressor_model = TheilSenRegressor(
    fit_intercept=True,           # Whether to calculate the intercept
    max_subpopulation=10000,      # Limit the size of subsets if n choose k is too large
    n_subsamples=None,            # Number of subsamples to compute parameters
    max_iter=300,                 # Maximum iterations for calculating spatial median
    tol=1e-3,                     # Tolerance when calculating the spatial median
    random_state=42,              # Random seed for reproducibility
    n_jobs=None,                  # Number of parallel jobs for computation
    verbose=False,                # Whether to print detailed output
)

# Search space keyed by constructor argument name. copy_X is omitted for the
# reason given above: it is deprecated and does not change the fitted model.
theil_sen_regressor_hyperparameters = {
    "fit_intercept": [True, False],                 # Whether to calculate intercept
    "max_subpopulation": [10000, 5000, 20000],      # Max size of subsets considered
    # NOTE(review): the valid range of n_subsamples depends on the number of
    # features and samples in the data - confirm 10 and 20 fit the dataset.
    "n_subsamples": [None, 10, 20],                 # Number of subsamples to compute parameters
    "max_iter": [100, 300, 500],                    # Maximum iterations for median calculation
    "tol": [1e-4, 1e-3, 1e-2],                      # Tolerance for spatial median calculation
    "random_state": [None, 42, 0],                  # Random seed for reproducibility
    "n_jobs": [None, 1, -1],                        # Number of parallel jobs
    "verbose": [False, True],                       # Whether to print detailed output
}
