# K-Nearest Neighbors (KNN) Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Function to load the dataset
def load_data(file_path, target_column):
    """
    Read a CSV file and split it into a feature table and a target column.

    Parameters:
        file_path (str): Path to the CSV file.
        target_column (str): Name of the column to use as the target.

    Returns:
        X (DataFrame): Every column except ``target_column``.
        y (Series): The values of ``target_column``.
    """
    frame = pd.read_csv(file_path)
    features = frame.drop(columns=[target_column])
    return features, frame[target_column]

# Function to set up and perform grid search for KNN
def perform_grid_search(X_train, y_train):
    """
    Tune a KNeighborsRegressor with an exhaustive, cross-validated grid search.

    Parameters:
        X_train (DataFrame): Training features.
        y_train (Series): Training target.

    Returns:
        tuple: ``(best_estimator, best_params)`` - the best
        KNeighborsRegressor found by the search and the parameter dict
        that produced it.
    """
    search_space = dict(
        n_neighbors=[3, 5, 7, 9, 11],
        weights=['uniform', 'distance'],
        algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
        p=[1, 2],
    )

    # 10-fold CV; candidates are ranked by negated MSE (higher is better).
    searcher = GridSearchCV(
        estimator=KNeighborsRegressor(),
        param_grid=search_space,
        cv=10,
        scoring='neg_mean_squared_error',
        verbose=2,
    )
    searcher.fit(X_train, y_train)
    return searcher.best_estimator_, searcher.best_params_

# Function to evaluate the model performance
def evaluate_model(model, X_test, y_test):
    """
    Score a fitted model on held-out data.

    Parameters:
        model (KNeighborsRegressor): Trained model exposing ``predict``.
        X_test (DataFrame): Test features.
        y_test (Series): Test target.

    Returns:
        metrics (dict): Keys "R²", "RMSE" and "Standard Deviation".
    """
    predictions = model.predict(X_test)

    return {
        "R²": r2_score(y_test, predictions),
        "RMSE": np.sqrt(mean_squared_error(y_test, predictions)),
        # Spread of the predicted values themselves, not of the residuals.
        "Standard Deviation": np.std(predictions),
    }

# Function to print results
def print_results(metrics):
    """
    Write each metric to stdout as a ``name: value`` line.

    Parameters:
        metrics (dict): Mapping of metric name to value; printed in the
            dict's iteration order.
    """
    for name in metrics:
        print(f"{name}: {metrics[name]}")

# Main function to execute the full process
def main():
    """Run the KNN workflow: load, split, tune, evaluate and report."""
    features, target = load_data('bert_test_2.csv', 'output')

    # Hold out 20% of the rows for the final evaluation.
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    # Model selection uses the training rows only.
    tuned_knn, tuned_params = perform_grid_search(X_train, y_train)

    # Final scores come from the held-out rows.
    scores = evaluate_model(tuned_knn, X_test, y_test)

    print(f"Best Parameters: {tuned_params}")
    print_results(scores)


if __name__ == "__main__":
    main()


Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.5s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=distance; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, 

# Polynomial Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Load and split the dataset
def load_and_split_data(file_path):
    """
    Read the CSV at ``file_path`` and return an 80/20 train/test split.

    The ``output`` column is the target; every other column is a feature.

    Returns:
        The ``train_test_split`` result, unpacked by the caller as
        (X_train, X_test, y_train, y_test).
    """
    frame = pd.read_csv(file_path)
    features = frame.drop(columns=['output'])
    target = frame['output']
    # Fixed seed keeps the partition reproducible between runs.
    return train_test_split(features, target, test_size=0.2, random_state=42)

# Create and configure the pipeline
def create_pipeline():
    """
    Build the scale -> polynomial expansion -> linear regression pipeline.

    The step names ('scaler', 'poly', 'linear') are the prefixes used when
    addressing step parameters in a parameter grid.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures()),
        ('linear', LinearRegression()),
    ]
    return Pipeline(steps)

# Perform grid search with cross-validation
def grid_search_cv(pipeline, X_train, y_train, param_grid=None):
    """
    Tune ``pipeline`` with a 10-fold, shuffled, cross-validated grid search.

    Parameters:
        pipeline (Pipeline): Pipeline whose steps are named 'poly' and
            'linear' when the default grid is used.
        X_train: Training features.
        y_train: Training target.
        param_grid (dict, optional): Grid to search. Defaults to polynomial
            degrees 2-4 crossed with ``fit_intercept`` True/False. Pass a
            narrower grid when the high-degree candidates are not feasible
            for the data: the recorded run of this cell reports
            "40 fits failed out of a total of 60" with the default grid
            (presumably the degree 3 and 4 candidates - confirm by running
            with ``error_score='raise'``).

    Returns:
        grid_search (GridSearchCV): The fitted search object.
    """
    if param_grid is None:
        param_grid = {'poly__degree': [2, 3, 4], 'linear__fit_intercept': [True, False]}

    # Shuffled folds with a fixed seed so repeated runs see the same splits.
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=2)
    grid_search.fit(X_train, y_train)
    return grid_search

# Evaluate the model on the test set
def evaluate_model(best_model, X_test, y_test):
    """
    Score a fitted model on the held-out set.

    Returns:
        tuple: ``(rmse, r2, std_dev)`` where ``std_dev`` is the standard
        deviation of the predicted values.
    """
    predicted = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predicted))
    return rmse, r2_score(y_test, predicted), np.std(predicted)

# Main function
def main():
    """Run the polynomial-regression workflow and print test-set metrics."""
    X_train, X_test, y_train, y_test = load_and_split_data('bert_test_2.csv')

    # Tune the pipeline with the cross-validated grid search.
    search = grid_search_cv(create_pipeline(), X_train, y_train)

    # Score the best pipeline on the held-out rows.
    winner = search.best_estimator_
    rmse, r2, std_dev = evaluate_model(winner, X_test, y_test)

    report = (
        f'Best Parameters: {search.best_params_}',
        f'Root Mean Squared Error (RMSE) on Test Set: {rmse}',
        f'R² value on Test Set: {r2}',
        f'Standard Deviation of Predictions: {std_dev}',
    )
    for line in report:
        print(line)


if __name__ == "__main__":
    main()


Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  47.0s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  51.9s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  49.3s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  51.8s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  42.2s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  41.1s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  41.7s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  46.5s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  43.7s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  45.3s
[CV] END .........linear__fit_intercept=True, poly__degree=3; total time=   0.0s
[CV] END .........linear__fit_intercept=True, po

40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklea

Best Parameters: {'linear__fit_intercept': False, 'poly__degree': 2}
Root Mean Squared Error (RMSE) on Test Set: 2600922748.5203533
R² value on Test Set: -4.415806999659239e+18
Standard Deviation of Predictions: 2599536849.8859825


# Logistic Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Function to load the dataset
def load_data(file_path, target_column):
    """
    Load a CSV dataset and separate the target column from the features.

    Parameters:
        file_path (str): Location of the CSV file.
        target_column (str): Column to treat as the target.

    Returns:
        X (DataFrame): The remaining feature columns.
        y (Series): The target column.
    """
    dataset = pd.read_csv(file_path)
    feature_frame = dataset.drop(columns=[target_column])
    target_series = dataset[target_column]
    return feature_frame, target_series

# Function to standardize features
def standardize_features(X):
    """
    Rescale every feature to zero mean and unit variance.

    The scaler is fitted on ``X`` itself and then discarded, so the same
    transformation cannot be re-applied to other data afterwards.

    Parameters:
        X (DataFrame): Features to be standardized.

    Returns:
        X_scaled (ndarray): Scaled features.
    """
    return StandardScaler().fit_transform(X)

# Function to perform Grid Search for hyperparameter tuning
def perform_grid_search(X_train, y_train):
    """
    Grid-search the regularisation strength and solver of LogisticRegression.

    Parameters:
        X_train (DataFrame): Training features.
        y_train (Series): Training target.

    Returns:
        best_model (LogisticRegression): Best estimator found by the search.
        best_params (dict): Parameter combination that produced it.
    """
    search = GridSearchCV(
        estimator=LogisticRegression(max_iter=1000),
        param_grid={
            'C': [0.01, 0.1, 1, 10, 100],
            'solver': ['newton-cg', 'lbfgs', 'liblinear'],
            'penalty': ['l2'],
        },
        cv=10,
        # Candidates are ranked by negated MSE of the predicted labels.
        scoring='neg_mean_squared_error',
    )
    search.fit(X_train, y_train)

    return search.best_estimator_, search.best_params_

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """
    Compute R², RMSE and the spread of the predictions on the test set.

    Parameters:
        model (LogisticRegression): Trained model.
        X_test (DataFrame): Test features.
        y_test (Series): Test target.

    Returns:
        metrics (dict): Keys "R²", "RMSE" and "Standard Deviation".
    """
    predicted = model.predict(X_test)

    return {
        "R²": r2_score(y_test, predicted),
        "RMSE": np.sqrt(mean_squared_error(y_test, predicted)),
        # Standard deviation of the predicted values, not of the errors.
        "Standard Deviation": np.std(predicted),
    }

# Function to perform cross-validation and calculate RMSE
def cross_validate_model(model, X_train, y_train):
    """
    Estimate RMSE with 10-fold cross-validation on the training data.

    Parameters:
        model (LogisticRegression): Estimator to cross-validate.
        X_train (DataFrame): Training features.
        y_train (Series): Training target.

    Returns:
        cv_rmse (float): Mean of the per-fold RMSE values.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train, cv=10, scoring='neg_mean_squared_error'
    )
    # Scores are negated MSE; flip the sign before taking the root.
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return np.mean(rmse_per_fold)

# Function to print evaluation results
def print_results(best_params, metrics, cv_rmse):
    """
    Print the tuning outcome and evaluation metrics, one per line.

    Parameters:
        best_params (dict): Best parameters found by Grid Search.
        metrics (dict): Must contain "R²", "RMSE" and "Standard Deviation".
        cv_rmse (float): Cross-validated RMSE.
    """
    # (printed label, key in ``metrics``)
    metric_lines = (
        ("R² Score", "R²"),
        ("RMSE", "RMSE"),
        ("Standard Deviation of Predictions", "Standard Deviation"),
    )

    print(f"Best Parameters: {best_params}")
    for label, key in metric_lines:
        print(f"{label}: {metrics[key]}")
    print(f"Cross-validated RMSE: {cv_rmse}")

# Main function to run the workflow
def main():
    """
    Run the logistic-regression workflow end to end and print the results.

    The data is split before scaling and the scaler is fitted on the
    training rows only, so no statistics from the held-out test rows leak
    into training or model selection.
    """
    # Load data
    X, y = load_data('bert_test_2.csv', 'output')

    # Split the data first; the fixed seed keeps the partition reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize using statistics learned from the training rows only,
    # then apply that same transformation to the test rows
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Perform Grid Search
    best_model, best_params = perform_grid_search(X_train, y_train)

    # Evaluate the model on the held-out rows
    metrics = evaluate_model(best_model, X_test, y_test)

    # Perform cross-validation on the training rows
    cv_rmse = cross_validate_model(best_model, X_train, y_train)

    # Print results
    print_results(best_params, metrics, cv_rmse)


if __name__ == "__main__":
    main()


Best parameters found:  {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
R^2 Score:  0.4363600810609023
RMSE:  0.929230130100368
Cross-validated RMSE:  0.9688596222791311
Standard Deviation of Predictions: 1.1057434316551316


# Linear Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

# Separate features and target
def separate_features_target(df):
    """
    Split a DataFrame into a feature array and a target array.

    The ``output`` column is the target; all other columns are features.
    Both pieces are returned as NumPy arrays (``.values``).
    """
    feature_matrix = df.drop(columns=['output']).values
    return feature_matrix, df['output'].values

# Normalize data
def normalize_data(X):
    """Return ``X`` standardized to zero mean and unit variance per column."""
    return StandardScaler().fit_transform(X)

# Perform GridSearchCV for model fitting
def perform_grid_search(model, X, y, cv):
    """
    Fit ``model`` through GridSearchCV and return the fitted best estimator.

    The parameter grid is empty, so there is a single candidate: the model
    with the parameters it was constructed with.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid={},
        scoring='neg_mean_squared_error',
        cv=cv,
    )
    return search.fit(X, y).best_estimator_

# Perform cross-validation and calculate metrics
def evaluate_model(model, X, y, cv):
    """
    Cross-validate ``model`` and summarise MSE, RMSE and R².

    Both metrics are collected in a single cross-validation pass, so the
    model is fitted once per fold and the two metrics are always computed
    on the same folds.

    Parameters:
        model: Estimator to evaluate.
        X: Features.
        y: Target.
        cv: Cross-validation splitter (or anything ``cross_validate``
            accepts for ``cv``).

    Returns:
        tuple: ``(mse, rmse, r2, std_dev)`` - mean MSE over folds, its
        square root, mean R² over folds, and the standard deviation of
        the per-fold MSE.
    """
    # Imported locally because this cell's import list does not include it.
    from sklearn.model_selection import cross_validate

    results = cross_validate(model, X, y, cv=cv, scoring=('neg_mean_squared_error', 'r2'))
    neg_mse_scores = results['test_neg_mean_squared_error']

    mse = -neg_mse_scores.mean()  # Convert from negative MSE
    rmse = np.sqrt(mse)
    r2 = results['test_r2'].mean()
    # The sign flip does not change the spread, so this is the std of MSE.
    std_dev = np.std(neg_mse_scores)
    return mse, rmse, r2, std_dev

# Main function
def main():
    """
    Cross-validate a linear regression on a 500-row sample and print metrics.

    Only the sampled rows are fitted and evaluated; every reported number
    comes from that sample.
    """
    # Load data
    df = load_data('bert_test_2.csv')

    # Initialize model and KFold
    model = LinearRegression()
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # Sample up to 500 rows; clamping avoids the ValueError that
    # DataFrame.sample raises when asked for more rows than exist
    sample_size = min(500, len(df))
    df_sample = df.sample(n=sample_size, random_state=42)
    X_sample, y_sample = separate_features_target(df_sample)
    X_sample_scaled = normalize_data(X_sample)

    # Perform Grid Search for the sample dataset
    best_model_sample = perform_grid_search(model, X_sample_scaled, y_sample, kf)

    # Evaluate the model on the sample data
    mse_sample, rmse_sample, r2_sample, std_dev_sample = evaluate_model(best_model_sample, X_sample_scaled, y_sample, kf)

    # Output the results
    print(f"MSE: {mse_sample}")
    print(f"RMSE: {rmse_sample}")
    print(f"R²: {r2_sample}")
    print(f"Standard Deviation of MSE: {std_dev_sample}")


if __name__ == "__main__":
    main()


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0  0.445589  0.415333  0.118074  0.202017  0.293542 -0.090634 -0.329375   
1  0.076474 -0.200395  0.185220  0.198175 -0.013897 -0.115137 -0.008887   
2  0.018093 -0.260901  0.498325  0.019231 -0.152991 -0.289522 -0.052139   
3 -0.036732 -0.256312  0.839563  0.082782 -0.107343 -0.260510  0.153343   
4  0.301357  0.222360  0.552620 -0.363340 -0.269042 -0.081171  0.439332   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.540401  0.128381 -0.090013  ...  -0.294877   0.174255  -0.349183   
1  0.263286 -0.105154 -0.296139  ...  -0.068586  -0.040534  -0.186894   
2 -0.038137 -0.218562  0.039249  ...   0.276947  -0.006225  -0.283879   
3 -0.442882 -0.200667  0.136391  ...   0.111054  -0.022571  -0.307118   
4 -0.398349  0.077302  0.175215  ...   0.302719  -0.347563  -0.256054   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0  -0.066742   0.027607  -0.0626

# Lasso Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Function to load the dataset
def load_data(file_path, target_column):
    """
    Load a CSV dataset and return features and target as NumPy arrays.

    Parameters:
        file_path (str): Path to the CSV file.
        target_column (str): Column to use as the target.

    Returns:
        X (ndarray): Values of every column except ``target_column``.
        y (ndarray): Values of ``target_column``.
    """
    table = pd.read_csv(file_path)
    feature_matrix = table.drop(columns=[target_column]).values
    return feature_matrix, table[target_column].values

# Function to standardize features
def standardize_features(X):
    """
    Scale each feature column to zero mean and unit variance.

    Parameters:
        X (ndarray): Features to be standardized.

    Returns:
        X_scaled (ndarray): Scaled features.
    """
    return StandardScaler().fit_transform(X)

# Function to perform Grid Search for hyperparameter tuning
def perform_grid_search(model, X, y, param_grid, cv_splits=10):
    """
    Tune ``model`` over ``param_grid`` with shuffled K-fold cross-validation.

    Parameters:
        model: Model to be tuned.
        X (ndarray): Features.
        y (ndarray): Target.
        param_grid (dict): Grid of parameters for tuning.
        cv_splits (int): Number of cross-validation folds.

    Returns:
        best_model (object): Best estimator found by the search.
        best_params (dict): Parameter combination that produced it.
    """
    # Fixed seed so the folds are the same on every run.
    folds = KFold(n_splits=cv_splits, shuffle=True, random_state=42)
    search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=folds)
    search.fit(X, y)

    return search.best_estimator_, search.best_params_

# Function to perform cross-validation and calculate performance metrics
def evaluate_model(model, X, y, cv_splits=10):
    """
    Cross-validate ``model`` and report MSE, RMSE, R² and the spread of MSE.

    Both metrics are collected in a single cross-validation pass, so the
    model is fitted once per fold instead of once per fold per metric.

    Parameters:
        model: Estimator to evaluate.
        X (ndarray): Features.
        y (ndarray): Target.
        cv_splits (int): Number of splits for cross-validation.

    Returns:
        metrics (dict): Keys "MSE", "RMSE", "R²" and
            "Standard Deviation of MSE".
    """
    # Imported locally because this cell's import list does not include it.
    from sklearn.model_selection import cross_validate

    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=42)

    results = cross_validate(model, X, y, cv=kf, scoring=('neg_mean_squared_error', 'r2'))
    neg_mse_scores = results['test_neg_mean_squared_error']

    mse = -neg_mse_scores.mean()  # MSE (convert from negative MSE)

    metrics = {
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R²": results['test_r2'].mean(),
        # The sign flip does not change the spread, so this is the std of MSE.
        "Standard Deviation of MSE": np.std(neg_mse_scores),
    }

    return metrics

# Function to print evaluation results
def print_results(best_params, metrics):
    """
    Print the selected alpha followed by each evaluation metric.

    Parameters:
        best_params (dict): Must contain the key 'alpha'.
        metrics (dict): Must contain "MSE", "RMSE", "R²" and
            "Standard Deviation of MSE".
    """
    print(f"Best alpha (regularization strength): {best_params['alpha']}")
    # Each metric's printed label is identical to its dictionary key.
    for key in ("MSE", "RMSE", "R²", "Standard Deviation of MSE"):
        print(f"{key}: {metrics[key]}")

# Main function to run the workflow
def main():
    """
    Tune and cross-validate a Lasso regression, then print the results.
    """
    # Load data
    X, y = load_data('bert_test_2.csv', 'output')

    # Standardize features
    X_scaled = standardize_features(X)

    # Initialize Lasso Regression model with a raised iteration cap.
    # NOTE(review): the recorded output of this cell is dominated by repeated
    # `cd_fast.enet_coordinate_descent` warning lines, which presumably are
    # ConvergenceWarnings from the coordinate-descent solver stopping at the
    # default cap - confirm the warnings are gone after this change.
    model = Lasso(max_iter=10000)

    # Perform Grid Search for hyperparameter tuning
    grid_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}
    best_model, best_params = perform_grid_search(model, X_scaled, y, grid_params)

    # Perform evaluation using cross-validation
    metrics = evaluate_model(best_model, X_scaled, y)

    # Print the results
    print_results(best_params, metrics)


if __name__ == "__main__":
    main()


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0  0.445589  0.415333  0.118074  0.202017  0.293542 -0.090634 -0.329375   
1  0.076474 -0.200395  0.185220  0.198175 -0.013897 -0.115137 -0.008887   
2  0.018093 -0.260901  0.498325  0.019231 -0.152991 -0.289522 -0.052139   
3 -0.036732 -0.256312  0.839563  0.082782 -0.107343 -0.260510  0.153343   
4  0.301357  0.222360  0.552620 -0.363340 -0.269042 -0.081171  0.439332   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.540401  0.128381 -0.090013  ...  -0.294877   0.174255  -0.349183   
1  0.263286 -0.105154 -0.296139  ...  -0.068586  -0.040534  -0.186894   
2 -0.038137 -0.218562  0.039249  ...   0.276947  -0.006225  -0.283879   
3 -0.442882 -0.200667  0.136391  ...   0.111054  -0.022571  -0.307118   
4 -0.398349  0.077302  0.175215  ...   0.302719  -0.347563  -0.256054   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0  -0.066742   0.027607  -0.0626

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best alpha (regularization strength): 0.01
MSE: 0.973829476591962
RMSE: 0.9868279873371864
R²: 0.3103738152553472
Standard Deviation of MSE: 0.22928273282252234


  model = cd_fast.enet_coordinate_descent(


# Decision Tree Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Load and prepare data
def load_and_prepare_data(file_path):
    """
    Read the CSV at ``file_path`` and split off the ``output`` column.

    Returns:
        X (DataFrame): Every column except ``output``.
        y (Series): The ``output`` column.
    """
    frame = pd.read_csv(file_path)
    features = frame.drop(columns=['output'])
    return features, frame['output']

# Standardize features
def standardize_features(X):
    """Return ``X`` scaled to zero mean and unit variance per feature."""
    return StandardScaler().fit_transform(X)

# Apply PCA for dimensionality reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Fit a PCA on ``X_scaled`` and return the transformed data.

    ``variance_retained`` is passed to PCA as ``n_components``.
    """
    return PCA(n_components=variance_retained).fit_transform(X_scaled)

# Train-test split
def split_data(X_pca, y, test_size=0.2, random_state=42):
    """
    Split features and target into train and test partitions.

    Returns the ``train_test_split`` result, unpacked by the caller as
    (X_train, X_test, y_train, y_test).
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X_pca, y, **split_options)

# RandomizedSearchCV for hyperparameter tuning
def perform_random_search(model, X_train, y_train):
    """
    Tune a decision-tree regressor with a randomized hyperparameter search.

    Ten parameter combinations are sampled and each is scored with 5-fold
    cross-validation using the estimator's default scorer.

    Parameters:
        model: Estimator to tune (a DecisionTreeRegressor in this cell).
        X_train: Training features.
        y_train: Training target.

    Returns:
        random_search (RandomizedSearchCV): The fitted search object.
    """
    param_dist = {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': np.arange(2, 10),
        'min_samples_leaf': np.arange(1, 5),
        # 'auto' is deliberately not offered: scikit-learn 1.3 removed it for
        # DecisionTreeRegressor (it meant "all features", i.e. the same as
        # None), and every sampled candidate that draws it fails parameter
        # validation instead of being scored.
        'max_features': ['sqrt', 'log2', None],
    }
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
    random_search.fit(X_train, y_train)
    return random_search

# Cross-validation scores
def cross_validate_model(model, X_train, y_train, cv=5):
    """
    Return the per-fold cross-validation scores of ``model``.

    Folds are shuffled with a fixed seed; scoring is the estimator's
    default scorer because no ``scoring`` argument is passed.
    """
    folds = KFold(n_splits=cv, shuffle=True, random_state=42)
    fold_scores = cross_val_score(model, X_train, y_train, cv=folds)
    return fold_scores

# Calculate performance metrics
def calculate_metrics(y_test, y_pred):
    """
    Compute test-set metrics plus summary statistics of the true target.

    Returns:
        tuple: ``(r2, rmse, mae, mean_value, std_dev)`` where the last two
        are the mean and standard deviation of ``y_test`` (not of the
        predictions).
    """
    return (
        r2_score(y_test, y_pred),
        np.sqrt(mean_squared_error(y_test, y_pred)),
        mean_absolute_error(y_test, y_pred),
        np.mean(y_test),
        np.std(y_test),
    )

# Main function
def main():
    """
    Run the decision-tree workflow end to end and print the results.

    The data is split before any preprocessing, and both the scaler and the
    PCA are fitted on the training rows only, so the held-out test rows do
    not influence the learned transformation or the model selection.
    """
    # Load and prepare data
    X, y = load_and_prepare_data('bert_test_2.csv')

    # Split raw data into training and test sets first
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Standardize with statistics from the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA keeping 95% of the variance, fitted on the training rows only
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Initialize and tune model
    model = DecisionTreeRegressor()
    random_search = perform_random_search(model, X_train_pca, y_train)
    best_tree = random_search.best_estimator_

    # Cross-validation on the training rows
    cv_scores = cross_validate_model(best_tree, X_train_pca, y_train)

    # Predictions on the held-out rows
    y_pred = best_tree.predict(X_test_pca)

    # Calculate performance metrics
    r2, rmse, mae, mean_value, std_dev = calculate_metrics(y_test, y_pred)

    # Print results
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"R^2 Score: {r2}")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"Mean of Output: {mean_value}")
    print(f"Standard Deviation of Output: {std_dev}")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")


if __name__ == "__main__":
    main()


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Best Parameters: {'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': None, 'max_depth': 40}
R^2 Score: 0.15949204352875646
RMSE: 1.1347320613997438
MAE: 0.7592250922509225
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [ 0.22924306  0.16139765 -0.03558627 -0.03124128  0.10248387]
Cross-Validation Mean: 0.08525940514233835
Cross-Validation Std Dev: 0.10488192806411245


# Bayesian Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold

# Function to load dataset
def load_data(file_path, target_column):
    """
    Read a CSV dataset and split it into features and target.

    Parameters:
        file_path (str): Path to the CSV file.
        target_column (str): Column name of the target.

    Returns:
        X (DataFrame): Every column except ``target_column``.
        y (Series): The ``target_column`` values.
    """
    raw = pd.read_csv(file_path)
    predictors = raw.drop(columns=[target_column])
    response = raw[target_column]
    return predictors, response

# Function to standardize the features
def standardize_features(X):
    """
    Standardize each feature to zero mean and unit variance.

    Parameters:
        X: Features to be standardized (anything StandardScaler accepts).

    Returns:
        X_scaled (ndarray): Scaled features.
    """
    return StandardScaler().fit_transform(X)

# Function to apply PCA for dimension reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Reduce dimensionality with PCA.

    Parameters:
        X_scaled (ndarray): Scaled features.
        variance_retained (float): Passed to PCA as ``n_components``;
            the proportion of variance to retain.

    Returns:
        X_pca (ndarray): Transformed features after PCA.
    """
    return PCA(n_components=variance_retained).fit_transform(X_scaled)

# Function to perform RandomizedSearchCV for hyperparameter tuning
def perform_random_search(model, X_train, y_train, param_dist, n_iter=10, cv_splits=5):
    """
    Run a randomized hyperparameter search and return the fitted search.

    Parameters:
        model: Model to be tuned.
        X_train (ndarray): Training features.
        y_train (ndarray): Training target.
        param_dist (dict): Parameter values or distributions to sample from.
        n_iter (int): Number of parameter settings sampled.
        cv_splits (int): Number of cross-validation folds.

    Returns:
        random_search (RandomizedSearchCV): The fitted search object; the
        winning model is available as ``best_estimator_``.
    """
    searcher = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv_splits,
        random_state=42,  # fixed seed so the same candidates are sampled
    )
    searcher.fit(X_train, y_train)
    return searcher

# Function to perform cross-validation
def perform_cross_validation(model, X_train, y_train, cv_splits=5):
    """
    Score ``model`` with shuffled K-fold cross-validation.

    Parameters:
        model: Model to be evaluated.
        X_train (ndarray): Training features.
        y_train (ndarray): Training target.
        cv_splits (int): Number of folds.

    Returns:
        cv_scores (ndarray): One score per fold, from the estimator's
        default scorer.
    """
    folds = KFold(n_splits=cv_splits, shuffle=True, random_state=42)
    return cross_val_score(model, X_train, y_train, cv=folds)

# Function to calculate performance metrics
def calculate_performance_metrics(y_true, y_pred):
    """
    Compute error metrics plus summary statistics of the true target.

    Parameters:
        y_true (ndarray): True target values.
        y_pred (ndarray): Predicted target values.

    Returns:
        metrics (dict): Keys "R²", "RMSE", "MAE", "Mean of Output" and
            "Standard Deviation of Output"; the last two describe
            ``y_true``, not the predictions.
    """
    return {
        "R²": r2_score(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
        "Mean of Output": np.mean(y_true),
        "Standard Deviation of Output": np.std(y_true),
    }

# Function to print the evaluation results
def print_results(best_params, metrics, cv_scores):
    """
    Print the tuned hyperparameters, test-set metrics and CV summary.

    Parameters:
        best_params (dict): Best parameters found by the hyperparameter search.
        metrics (dict): Must contain the keys "R²", "RMSE", "MAE",
            "Mean of Output" and "Standard Deviation of Output".
        cv_scores (ndarray): Cross-validation scores; its mean() and std()
            are printed alongside the raw values.
    """
    print(f"Best Parameters: {best_params}")

    # (printed label, key in `metrics`) pairs, in output order.
    metric_rows = (
        ("R² Score", "R²"),
        ("RMSE", "RMSE"),
        ("MAE", "MAE"),
        ("Mean of Output", "Mean of Output"),
        ("Standard Deviation of Output", "Standard Deviation of Output"),
    )
    for label, key in metric_rows:
        print(f"{label}: {metrics[key]}")

    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")

# Main function to run the workflow
def main():
    """
    Run the Bayesian Ridge workflow end to end.

    Loads 'bert_test_2.csv' (target column 'output'), standardizes and
    PCA-reduces the features, holds out 20% of the rows for testing, tunes
    BayesianRidge with a randomized search, cross-validates the best
    estimator on the training split and prints the results.
    """
    features, target = load_data('bert_test_2.csv', 'output')

    # NOTE(review): scaling and PCA are fitted on the full dataset before the
    # train/test split, so test rows influence the preprocessing. Consider
    # splitting first and fitting the transformers on the training rows only.
    reduced = apply_pca(standardize_features(features))

    X_train, X_test, y_train, y_test = train_test_split(
        reduced, target, test_size=0.2, random_state=42
    )

    # Each prior hyperparameter gets its own array of six log-spaced
    # candidates from 1e-6 to 1e-1.
    search_space = {
        name: np.logspace(-6, -1, 6)
        for name in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')
    }
    search = perform_random_search(BayesianRidge(), X_train, y_train, search_space)
    best_model = search.best_estimator_

    # Cross-validate the tuned estimator on the training split only.
    fold_scores = perform_cross_validation(best_model, X_train, y_train)

    # Held-out evaluation on the test split.
    report = calculate_performance_metrics(y_test, best_model.predict(X_test))

    print_results(search.best_params_, report, fold_scores)

if __name__ == "__main__":
    main()


Best Parameters: {'lambda_2': 0.0001, 'lambda_1': 0.1, 'alpha_2': 0.1, 'alpha_1': 0.001}
R^2 Score: 0.46877192408957424
RMSE: 0.9021171128277098
MAE: 0.7487594495353221
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.4049201  0.38162817 0.39868496 0.45921714 0.41678268]
Cross-Validation Mean: 0.4122466075674997
Cross-Validation Std Dev: 0.026084062382817932


# Random Forest Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Function to load dataset
def load_data(file_path, target_column):
    """
    Read a CSV file and split it into a feature table and a target column.

    Parameters:
        file_path (str): Path to the CSV file.
        target_column (str): Name of the column to use as the target.

    Returns:
        X (DataFrame): Every column except the target.
        y (Series): The target column.
    """
    frame = pd.read_csv(file_path)
    return frame.drop(columns=[target_column]), frame[target_column]

# Function to standardize the features
def standardize_features(X):
    """
    Fit a StandardScaler on X and return the scaled features.

    The scaler is created and fitted inside this call and is not returned,
    so the same scaling cannot be re-applied to other data afterwards.

    Parameters:
        X (array-like): Features to be standardized.

    Returns:
        X_scaled (ndarray): Features scaled to zero mean and unit variance.
    """
    return StandardScaler().fit_transform(X)

# Function to apply PCA for dimension reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Fit PCA on the scaled features and return the projected data.

    Parameters:
        X_scaled (ndarray): Scaled features.
        variance_retained (float): Passed to PCA as `n_components`; a value
            between 0 and 1 is the proportion of variance to retain.

    Returns:
        X_pca (ndarray): Features projected onto the retained components.
    """
    reducer = PCA(n_components=variance_retained)
    return reducer.fit_transform(X_scaled)

# Function to perform RandomizedSearchCV for hyperparameter tuning
def perform_random_search(model, X_train, y_train, param_dist, n_iter=5, cv_splits=3):
    """
    Tune a model's hyperparameters with RandomizedSearchCV.

    Parameters:
        model: Estimator to be tuned.
        X_train (array-like): Training features.
        y_train (array-like): Training target.
        param_dist (dict): Parameter names mapped to the values or
            distributions to sample from.
        n_iter (int): Number of parameter settings to sample.
        cv_splits (int): Number of cross-validation folds per setting.

    Returns:
        random_search (RandomizedSearchCV): The fitted search object; the
            caller reads `best_estimator_` and `best_params_` from it.
    """
    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv_splits,
        random_state=42,  # fixed seed so the sampled settings are repeatable
        n_jobs=-1,  # use all available CPU cores
    )
    search.fit(X_train, y_train)
    return search

# Function to perform cross-validation
def perform_cross_validation(model, X_train, y_train, cv_splits=3):
    """
    Evaluate a model with shuffled K-fold cross-validation.

    The estimator's default scorer is used because no `scoring` argument is
    passed to cross_val_score.

    Parameters:
        model: Estimator to evaluate.
        X_train (array-like): Training features.
        y_train (array-like): Training target.
        cv_splits (int): Number of folds.

    Returns:
        cv_scores (ndarray): One score per fold.
    """
    # Seeded shuffle: fold membership is randomised but repeatable.
    splitter = KFold(n_splits=cv_splits, shuffle=True, random_state=42)
    return cross_val_score(model, X_train, y_train, cv=splitter)

# Function to calculate performance metrics
def calculate_performance_metrics(y_true, y_pred):
    """
    Compute regression metrics plus summary statistics of the true target.

    Parameters:
        y_true (array-like): True target values.
        y_pred (array-like): Predicted target values.

    Returns:
        metrics (dict): Keys "R²", "RMSE", "MAE", "Mean of Output" and
            "Standard Deviation of Output". The mean and standard deviation
            are taken over y_true, not over the predictions.
    """
    return {
        "R²": r2_score(y_true, y_pred),
        # Square root of the MSE puts the error on the target's own scale.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
        "Mean of Output": np.mean(y_true),
        "Standard Deviation of Output": np.std(y_true),
    }

# Function to print the evaluation results
def print_results(best_params, metrics, cv_scores):
    """
    Print the best hyperparameters, the test metrics and the CV summary.

    Parameters:
        best_params (dict): Best parameters found by the hyperparameter search.
        metrics (dict): Must contain the keys "R²", "RMSE", "MAE",
            "Mean of Output" and "Standard Deviation of Output".
        cv_scores (ndarray): Cross-validation scores; the raw values, their
            mean() and their std() are printed.
    """
    print(f"Best Parameters: {best_params}")

    # Printed label -> key in `metrics`; dict order is the output order.
    labelled_keys = {
        "R² Score": "R²",
        "RMSE": "RMSE",
        "MAE": "MAE",
        "Mean of Output": "Mean of Output",
        "Standard Deviation of Output": "Standard Deviation of Output",
    }
    for label, key in labelled_keys.items():
        print(f"{label}: {metrics[key]}")

    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")

# Main function to run the workflow
def main():
    """
    Run the Random Forest workflow end to end.

    Loads 'bert_test_2.csv' (target column 'output'), holds out 20% of the
    rows for testing, fits standardization and PCA on the training rows
    only, tunes a RandomForestRegressor with a randomized search,
    cross-validates the best estimator on the training split and prints the
    results.
    """
    # Load data
    X, y = load_data('bert_test_2.csv', 'output')

    # Split FIRST: fitting the scaler/PCA on all rows before splitting would
    # let test rows influence the preprocessing (data leakage) and make the
    # test metrics optimistic.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize features using statistics learned from the training rows,
    # then apply the same transformation to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw)

    # PCA retaining 95% of the variance (the default of apply_pca), again
    # fitted on the training rows only.
    pca = PCA(n_components=0.95)
    X_train = pca.fit_transform(X_train_scaled)
    X_test = pca.transform(X_test_scaled)
    # NOTE(review): the transformers are still fitted once on the whole
    # training split, so the inner CV folds share preprocessing statistics;
    # wrapping the steps in a Pipeline would isolate those folds as well.

    # Seed the forest so results are reproducible, like the split, the
    # search and the K-fold shuffle (all seeded with 42).
    model = RandomForestRegressor(random_state=42)

    # Perform RandomizedSearchCV for hyperparameter tuning
    param_dist = {
        'n_estimators': [10, 50],  # Reduced number of estimators
        'max_depth': [None, 10],  # Reduced max_depth options
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]  # Keeping only one option for bootstrap
    }
    random_search = perform_random_search(model, X_train, y_train, param_dist)
    best_model = random_search.best_estimator_

    # Perform cross-validation on the training split
    cv_scores = perform_cross_validation(best_model, X_train, y_train)

    # Predictions on the held-out test set
    y_pred = best_model.predict(X_test)

    # Calculate performance metrics
    metrics = calculate_performance_metrics(y_test, y_pred)

    # Print results
    print_results(random_search.best_params_, metrics, cv_scores)

if __name__ == "__main__":
    main()


Best Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': True}
R^2 Score: 0.5627794695810993
RMSE: 0.8184133118873096
MAE: 0.6304905060089562
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.48035684 0.51561573 0.48915119]
Cross-Validation Mean: 0.4950412526770014
Cross-Validation Std Dev: 0.014984814769876632


## Support Vector Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR

# Load and prepare data
def load_and_prepare_data(file_path, target_column='output'):
    """
    Read a CSV file and split it into features and target.

    Parameters:
        file_path (str): Path to the CSV file.
        target_column (str): Name of the target column. Defaults to
            'output', the column this notebook's dataset uses.

    Returns:
        X (DataFrame): Every column except the target.
        y (Series): The target column.
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y

# Standardize features
def standardize_features(X):
    """
    Fit a StandardScaler on X and return the scaled features.

    Parameters:
        X (array-like): Features to be standardized.

    Returns:
        ndarray: Features scaled to zero mean and unit variance.
    """
    return StandardScaler().fit_transform(X)

# Apply PCA for dimensionality reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Fit PCA on the scaled features and return the projected data.

    Parameters:
        X_scaled (ndarray): Scaled features.
        variance_retained (float): Passed to PCA as `n_components`; a value
            between 0 and 1 is the proportion of variance to retain.

    Returns:
        ndarray: Features projected onto the retained components.
    """
    reducer = PCA(n_components=variance_retained)
    X_reduced = reducer.fit_transform(X_scaled)
    return X_reduced

# Split data into train and test sets
def split_data(X_pca, y, test_size=0.2, random_state=42):
    """
    Split features and target into train and test subsets.

    Parameters:
        X_pca (array-like): Features.
        y (array-like): Target.
        test_size (float): Passed to train_test_split as `test_size`.
        random_state (int): Seed that makes the split repeatable.

    Returns:
        The train_test_split result, unpacked by callers as
        X_train, X_test, y_train, y_test.
    """
    parts = train_test_split(
        X_pca,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return parts

# Perform RandomizedSearchCV for parameter tuning
def perform_random_search(model, X_train, y_train, param_dist, n_iter=5, cv_splits=3):
    """
    Tune a model's hyperparameters with RandomizedSearchCV.

    Parameters:
        model: Estimator to be tuned.
        X_train (array-like): Training features.
        y_train (array-like): Training target.
        param_dist (dict): Parameter names mapped to the values or
            distributions to sample from.
        n_iter (int): Number of parameter settings to sample. Defaults to 5,
            the value that was previously hard-coded.
        cv_splits (int): Number of cross-validation folds. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        random_search (RandomizedSearchCV): The fitted search object; the
            caller reads `best_estimator_` and `best_params_` from it.
    """
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv_splits,
        random_state=42,  # fixed seed so the sampled settings are repeatable
        n_jobs=-1,  # use all available CPU cores
    )
    random_search.fit(X_train, y_train)
    return random_search

# Cross-validation
def cross_validate_model(model, X_train, y_train, cv=3):
    """
    Score a model with shuffled K-fold cross-validation.

    No `scoring` argument is passed to cross_val_score, so the estimator's
    default scorer is used.

    Parameters:
        model: Estimator to evaluate.
        X_train (array-like): Training features.
        y_train (array-like): Training target.
        cv (int): Number of folds.

    Returns:
        ndarray: One score per fold.
    """
    # Seeded shuffle: fold membership is randomised but repeatable.
    folds = KFold(n_splits=cv, shuffle=True, random_state=42)
    fold_scores = cross_val_score(model, X_train, y_train, cv=folds)
    return fold_scores

# Calculate performance metrics
def calculate_metrics(y_test, y_pred):
    """
    Compute regression metrics plus summary statistics of the true target.

    Parameters:
        y_test (array-like): True target values.
        y_pred (array-like): Predicted target values.

    Returns:
        tuple: (r2, rmse, mae, mean_value, std_dev), where mean_value and
            std_dev are taken over y_test, not over the predictions.
    """
    return (
        r2_score(y_test, y_pred),
        np.sqrt(mean_squared_error(y_test, y_pred)),  # RMSE
        mean_absolute_error(y_test, y_pred),
        np.mean(y_test),
        np.std(y_test),
    )

# Main function
def main():
    """
    Run the Support Vector Regression workflow end to end.

    Loads 'bert_test_2.csv', holds out a test split, fits standardization
    and PCA on the training rows only, tunes an SVR with a randomized
    search, cross-validates the best estimator on the training split and
    prints the results.
    """
    # Load and prepare data
    X, y = load_and_prepare_data('bert_test_2.csv')

    # Split FIRST: fitting the scaler/PCA on all rows before splitting would
    # let test rows influence the preprocessing (data leakage) and make the
    # test metrics optimistic.
    X_train_raw, X_test_raw, y_train, y_test = split_data(X, y)

    # Standardize using statistics learned from the training rows, then
    # apply the same transformation to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw)

    # PCA retaining 95% of the variance (the default of apply_pca), again
    # fitted on the training rows only.
    pca = PCA(n_components=0.95)
    X_train = pca.fit_transform(X_train_scaled)
    X_test = pca.transform(X_test_scaled)
    # NOTE(review): the transformers are still fitted once on the whole
    # training split, so the inner CV folds share preprocessing statistics;
    # wrapping the steps in a Pipeline would isolate those folds as well.

    # Initialize Support Vector Regression model
    model = SVR()

    # Define the parameter distribution for RandomizedSearchCV
    param_dist = {
        'kernel': ['linear', 'rbf', 'poly'],
        'C': [0.1, 1, 10, 100],
        'epsilon': [0.1, 0.2, 0.5, 0.3],
        'degree': [2, 3],
    }

    # Perform RandomizedSearchCV for hyperparameter tuning
    random_search = perform_random_search(model, X_train, y_train, param_dist)
    best_model = random_search.best_estimator_

    # Cross-validation on the training split
    cv_scores = cross_validate_model(best_model, X_train, y_train)

    # Predictions on the held-out test set
    y_pred = best_model.predict(X_test)

    # Calculate performance metrics
    r2, rmse, mae, mean_value, std_dev = calculate_metrics(y_test, y_pred)

    # Print results
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"R^2 Score: {r2}")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"Mean of Output: {mean_value}")
    print(f"Standard Deviation of Output: {std_dev}")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")

if __name__ == "__main__":
    main()


Best Parameters: {'kernel': 'poly', 'epsilon': 0.5, 'degree': 2, 'C': 100}
R^2 Score: 0.5900439384391127
RMSE: 1.260085560294259
MAE: 1.0188619540208168
Mean of Output: 1.9384615384615385
Standard Deviation of Output: 1.968028479131986
Cross-Validation Scores: [0.30503217 0.30320936 0.32239364]
Cross-Validation Mean: 0.31021172122150475
Cross-Validation Std Dev: 0.00864599882480651


## Gaussian Process Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel

# Load and prepare data
def load_and_prepare_data(file_path, target_column='output'):
    """
    Load a CSV dataset and separate the features from the target.

    Parameters:
        file_path (str): Path to the CSV file.
        target_column (str): Name of the target column. Defaults to
            'output', the column this notebook's dataset uses.

    Returns:
        X (DataFrame): Every column except the target.
        y (Series): The target column.
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y

# Standardize features
def standardize_features(X):
    """
    Standardize features with a freshly fitted StandardScaler.

    Parameters:
        X (array-like): Features to be standardized.

    Returns:
        ndarray: Features scaled to zero mean and unit variance.
    """
    return StandardScaler().fit_transform(X)

# Apply PCA for dimensionality reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Reduce dimensionality with a freshly fitted PCA.

    Parameters:
        X_scaled (ndarray): Scaled features.
        variance_retained (float): Passed to PCA as `n_components`; a value
            between 0 and 1 is the proportion of variance to retain.

    Returns:
        ndarray: Features projected onto the retained components.
    """
    projector = PCA(n_components=variance_retained)
    X_projected = projector.fit_transform(X_scaled)
    return X_projected

# Split data into train and test sets
def split_data(X_pca, y, test_size=0.2, random_state=42):
    """
    Partition features and target into train and test subsets.

    Parameters:
        X_pca (array-like): Features.
        y (array-like): Target.
        test_size (float): Passed to train_test_split as `test_size`.
        random_state (int): Seed that makes the split repeatable.

    Returns:
        The train_test_split result, unpacked by callers as
        X_train, X_test, y_train, y_test.
    """
    subsets = train_test_split(
        X_pca,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return subsets

# Initialize and perform RandomizedSearchCV for hyperparameter tuning
def perform_random_search(model, X_train, y_train, param_dist, n_iter=5, cv_splits=3):
    """
    Run RandomizedSearchCV to tune a model's hyperparameters.

    Parameters:
        model: Estimator to be tuned.
        X_train (array-like): Training features.
        y_train (array-like): Training target.
        param_dist (dict): Parameter names mapped to the values or
            distributions to sample from.
        n_iter (int): Number of parameter settings to sample. Defaults to 5,
            the value that was previously hard-coded.
        cv_splits (int): Number of cross-validation folds. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        random_search (RandomizedSearchCV): The fitted search object; the
            caller reads `best_estimator_` and `best_params_` from it.
    """
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv_splits,
        random_state=42,  # fixed seed so the sampled settings are repeatable
        n_jobs=-1,  # use all available CPU cores
    )
    random_search.fit(X_train, y_train)
    return random_search

# Cross-validation
def cross_validate_model(model, X_train, y_train, cv=3):
    """
    Evaluate a model with shuffled K-fold cross-validation.

    The estimator's default scorer is used because no `scoring` argument is
    passed to cross_val_score.

    Parameters:
        model: Estimator to evaluate.
        X_train (array-like): Training features.
        y_train (array-like): Training target.
        cv (int): Number of folds.

    Returns:
        ndarray: One score per fold.
    """
    # A fixed seed keeps the shuffled fold assignment repeatable between runs.
    splitter = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=splitter)
    return scores

# Calculate performance metrics
def calculate_metrics(y_test, y_pred):
    """
    Compute test-set regression metrics and target summary statistics.

    Parameters:
        y_test (array-like): True target values.
        y_pred (array-like): Predicted target values.

    Returns:
        tuple: (r2, rmse, mae, mean_value, std_dev), where mean_value and
            std_dev describe y_test only.
    """
    return (
        r2_score(y_test, y_pred),
        # Square root of the MSE puts the error on the target's own scale.
        np.sqrt(mean_squared_error(y_test, y_pred)),
        mean_absolute_error(y_test, y_pred),
        np.mean(y_test),
        np.std(y_test),
    )

# Main function
def main():
    """
    Run the Gaussian Process Regression workflow end to end.

    Loads 'bert_test_2.csv', holds out a test split, fits standardization
    and PCA on the training rows only, tunes a GaussianProcessRegressor
    with a randomized search, cross-validates the best estimator on the
    training split and prints the results.
    """
    # Load and prepare data
    X, y = load_and_prepare_data('bert_test_2.csv')

    # Split FIRST: fitting the scaler/PCA on all rows before splitting would
    # let test rows influence the preprocessing (data leakage) and make the
    # test metrics optimistic.
    X_train_raw, X_test_raw, y_train, y_test = split_data(X, y)

    # Standardize using statistics learned from the training rows, then
    # apply the same transformation to the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_raw)
    X_test_scaled = scaler.transform(X_test_raw)

    # PCA retaining 95% of the variance (the default of apply_pca), again
    # fitted on the training rows only.
    pca = PCA(n_components=0.95)
    X_train = pca.fit_transform(X_train_scaled)
    X_test = pca.transform(X_test_scaled)
    # NOTE(review): the transformers are still fitted once on the whole
    # training split, so the inner CV folds share preprocessing statistics;
    # wrapping the steps in a Pipeline would isolate those folds as well.

    # Initialize Gaussian Process Regression model: a scaled RBF kernel plus
    # a white-noise term.
    kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) + WhiteKernel()
    model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)

    # Define the parameter distribution for RandomizedSearchCV
    param_dist = {
        'alpha': [1e-10, 1e-5, 1e-2],
        'n_restarts_optimizer': [5, 10, 15],
    }

    # Perform RandomizedSearchCV for hyperparameter tuning
    random_search = perform_random_search(model, X_train, y_train, param_dist)
    best_model = random_search.best_estimator_

    # Cross-validation on the training split
    cv_scores = cross_validate_model(best_model, X_train, y_train)

    # Predictions on the held-out test set
    y_pred = best_model.predict(X_test)

    # Calculate performance metrics
    r2, rmse, mae, mean_value, std_dev = calculate_metrics(y_test, y_pred)

    # Print results
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"R^2 Score: {r2}")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"Mean of Output: {mean_value}")
    print(f"Standard Deviation of Output: {std_dev}")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")

if __name__ == "__main__":
    main()




Best Parameters: {'n_restarts_optimizer': 5, 'alpha': 1e-10}
R^2 Score: 0.5568212376239063
RMSE: 1.3101494803846034
MAE: 0.9680913423899548
Mean of Output: 1.9384615384615385
Standard Deviation of Output: 1.968028479131986
Cross-Validation Scores: [0.4084499  0.22946832 0.26207531]
Cross-Validation Mean: 0.29999784492345233
Cross-Validation Std Dev: 0.07783397165422182
