In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def generate_elastic_net_data(n_samples=100, n_features=50, n_informative=10, n_targets=1,
                              noise=0.1, effective_rank=None, tail_strength=0.5, random_state=42,
                              corr_strength=0.8, sparsity=0.8):
    """
    Generates a synthetic dataset suitable for Elastic Net regularization.

    The design matrix and an initial coefficient vector come from
    sklearn's make_regression. Shared random latent factors are then mixed
    into every feature column, a random subset of coefficients is zeroed,
    and the target is recomputed from the modified X and coefficients.

    Parameters:
    - n_samples: int, default=100
        The number of samples.
    - n_features: int, default=50
        The total number of features.
    - n_informative: int, default=10
        The number of informative features requested from make_regression.
    - n_targets: int, default=1
        The number of regression targets, i.e., the dimension of the Y output.
    - noise: float, default=0.1
        The standard deviation of the gaussian noise added to the output.
    - effective_rank: int or None, default=None
        Forwarded to make_regression. If not None, the approximate number of
        singular vectors required to explain most of the data.
    - tail_strength: float, default=0.5
        Forwarded to make_regression. The relative importance of the fat noisy
        tail of the singular values profile if effective_rank is not None.
    - random_state: int or None, default=42
        Seed for both make_regression and the extra random draws made here.
        With None the result is not reproducible.
    - corr_strength: float, default=0.8
        Controls how many shared latent factors, int(n_features * corr_strength),
        are randomly mixed into ALL feature columns. Values <= 0 skip this step.
    - sparsity: float, default=0.8
        Fraction of feature indices, drawn at random from all features, whose
        coefficients are forced to zero. Indices that make_regression already
        left at zero can be drawn too, so the final fraction of zero
        coefficients is at least this value. Should be between 0 and 1.

    Returns:
    - X: numpy array of shape (n_samples, n_features)
        The input samples.
    - y: numpy array of shape (n_samples,) or (n_samples, n_targets)
        The output values.
    - coef: numpy array of shape (n_features,) or (n_features, n_targets)
        The true coefficients used to generate the data.
    """

    # Private generator seeded exactly like the former np.random.seed(random_state)
    # call: it yields the same draws but leaves NumPy's global RNG untouched.
    rng = np.random.RandomState(random_state)

    # Only X and coef are kept; the y returned here is rebuilt below because
    # both X and coef are modified afterwards.
    X, _, coef = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative,
                                 n_targets=n_targets, noise=noise, effective_rank=effective_rank,
                                 tail_strength=tail_strength, coef=True, random_state=random_state)

    # Add correlation between features by mixing a few shared latent factors
    # into every column of X.
    if corr_strength > 0:
        n_correlated_features = int(n_features * corr_strength)
        base = rng.randn(n_samples, n_correlated_features)
        mixing = rng.randn(n_correlated_features, n_features)
        X += np.dot(base, mixing)

    # Introduce sparsity by zeroing out some coefficients (whole rows when
    # coef is 2-D, i.e. the feature is dropped for every target).
    n_zeroed = int(sparsity * n_features)
    zero_indices = rng.choice(np.arange(n_features), size=n_zeroed, replace=False)
    coef[zero_indices] = 0

    # Rebuild the output from the final X and sparse coefficients. The noise is
    # drawn with the signal's own shape so multi-target outputs work as well.
    signal = np.dot(X, coef)
    y = signal + noise * rng.randn(*signal.shape)

    return X, y, coef

# Example usage:

# Dataset dimensions
number_of_samples  = 10_000
number_of_features = 100

# Remaining characteristics are derived from the dimensions above
fraction_of_informative_features = 0.1
number_of_informative_features   = int(fraction_of_informative_features * number_of_features)
effective_matrix_rank            = int(0.5 * number_of_features)

# Build the design matrix, the target and the ground-truth coefficients
X, y, coef = generate_elastic_net_data(
    n_samples=number_of_samples,
    n_features=number_of_features,
    n_informative=number_of_informative_features,
    noise=0.5,
    effective_rank=effective_matrix_rank,
    tail_strength=0.5,
    random_state=42,
    corr_strength=0.7,
    sparsity=0.7,
)

# Wrap the arrays in pandas objects for easier handling downstream (optional)
df_X = pd.DataFrame(
    X,
    columns=[f"feature_{i}" for i in range(X.shape[1])],
)
df_y = pd.Series(y, name="target")

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

def split_dataset(df_X, df_y, num_test=10000, validation_split=0.2, random_state=123):
    """
    Splits the dataset into training, validation, and test sets.

    The split happens in two stages, both seeded with the same random_state:
    the test set is held out first, then the remainder is divided into
    training and validation data.

    Parameters:
    - df_X: features, passed straight to train_test_split.
    - df_y: targets aligned with df_X.
    - num_test: forwarded as test_size for the first split (default 10000).
    - validation_split: forwarded as test_size for the second split, so it
        applies to the data left after the test set is removed, not to the
        full dataset (default 0.2).
    - random_state: seed used for both splits (default 123).

    Returns:
    - (X_train, X_val, X_test, y_train, y_val, y_test)
    """

    # Stage 1: peel off the held-out test partition
    holdout_parts = train_test_split(
        df_X, df_y, test_size=num_test, random_state=random_state)
    X_rest, X_test, y_rest, y_test = holdout_parts

    # Stage 2: divide what is left between training and validation
    fit_parts = train_test_split(
        X_rest, y_rest, test_size=validation_split, random_state=random_state)
    X_train, X_val, y_train, y_val = fit_parts

    return X_train, X_val, X_test, y_train, y_val, y_test

# Example usage: hold out 2000 rows for testing, then reserve 20% of the
# remaining rows for validation.
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(
    df_X,
    df_y,
    num_test=2000,
    validation_split=0.2,
    random_state=123,
)

# Report how many rows ended up in each partition
print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 6400 samples
Validation set: 1600 samples
Test set: 2000 samples


In [None]:
import pickle

# Quick look at every partition's shape before anything is written to disk
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape, X_test.shape, y_test.shape)

# Bundle the six partitions into one dictionary so they are stored together
full_data = {
    "x_train": X_train,
    "x_val": X_val,
    "x_test": X_test,
    "y_train": y_train,
    "y_val": y_val,
    "y_test": y_test,
}

# NOTE(review): hard-coded absolute path, presumably a mounted Google Drive
# folder in Colab; the directory must already exist. Confirm before re-running.
with open('/content/drive/MyDrive/HyperLocal_Tuning/Regression_HLS/Simulation_Dataset/simulation_dataset.pickle', 'wb') as handle:
    pickle.dump(full_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

(6400, 100) (1600, 100) (6400,) (1600,) (2000, 100) (2000,)
