# Linear Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error

# Function to load and inspect the dataset
def load_and_inspect_data(file_path, target_column):
    """
    Loads the dataset and displays basic information.

    Parameters:
        file_path (str): Path to the dataset.
        target_column (str): Name of the target column.

    Returns:
        X (numpy.ndarray): Features.
        y (numpy.ndarray): Target.
    """
    df = pd.read_csv(file_path)
    print("Dataset Preview:")
    print(df.head())
    print("\nColumns:", df.columns)
    X = df.drop(columns=[target_column]).values
    y = df[target_column].values
    return X, y

# Function to preprocess the features
def preprocess_features(X):
    """
    Scales the features using StandardScaler.

    Parameters:
        X (numpy.ndarray): Features to be scaled.

    Returns:
        X_scaled (numpy.ndarray): Scaled features.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

# Function to train the model using GridSearchCV
def train_model(X, y, model, param_grid, cv_splits):
    """
    Trains the model using GridSearchCV with cross-validation.

    Parameters:
        X (numpy.ndarray): Features.
        y (numpy.ndarray): Target.
        model: Machine learning model to train.
        param_grid (dict): Hyperparameter grid for GridSearchCV.
        cv_splits (int): Number of cross-validation splits.

    Returns:
        best_model: The best estimator found by GridSearchCV.
        grid_search: The fitted GridSearchCV object.
    """
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    return best_model, grid_search

# Function to evaluate the model
def evaluate_model(model, X, y, cv_splits):
    """
    Evaluates the model using cross-validation.

    Parameters:
        model: Trained machine learning model.
        X (numpy.ndarray): Features.
        y (numpy.ndarray): Target.
        cv_splits (int): Number of cross-validation splits.

    Returns:
        dict: Evaluation metrics (MSE, RMSE, R², Std Dev of MSE).
    """
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=42)
    mse_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    mse = -mse_scores.mean()
    rmse = np.sqrt(mse)
    r2 = r2_scores.mean()
    std_dev = np.std(mse_scores)
    return {"MSE": mse, "RMSE": rmse, "R²": r2, "Std Dev of MSE": std_dev}

# Main script
if __name__ == "__main__":
    # Parameters
    file_path = 'roberta_test_2.csv'
    target_column = 'output'
    param_grid = {}  # No hyperparameters for LinearRegression
    cv_splits = 10

    # Load and preprocess data
    X, y = load_and_inspect_data(file_path, target_column)
    X_scaled = preprocess_features(X)

    # Train the model
    model = LinearRegression()
    best_model, grid_search = train_model(X_scaled, y, model, param_grid, cv_splits)

    # Reduce dataset size for faster evaluation
    df_sample = pd.DataFrame(X_scaled).sample(n=500, random_state=42)
    X_sample = df_sample.values
    y_sample = y[:500]  # Corresponding targets
    X_sample_scaled = preprocess_features(X_sample)

    # Re-train and evaluate on the sample
    best_model_sample, _ = train_model(X_sample_scaled, y_sample, model, param_grid, cv_splits)
    metrics = evaluate_model(best_model_sample, X_sample_scaled, y_sample, cv_splits)

    # Display metrics
    print("\nEvaluation Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0 -0.002268 -0.057392 -0.051938 -0.254064 -0.107042 -0.005746  0.053339   
1 -0.095116  0.002254 -0.080440 -0.032033  0.325927 -0.089124  0.060066   
2  0.171062 -0.006862 -0.092904  0.191909 -0.059166  0.084494  0.057149   
3  0.091722 -0.023084 -0.170847  0.275267 -0.098605 -0.062822  0.026235   
4  0.058596 -0.003751 -0.051466  0.243470  0.019038 -0.036104 -0.019322   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.155956  0.116585 -0.038332  ...   0.072443   0.059975  -0.076614   
1  0.011424  0.037144 -0.126206  ...   0.060893  -0.161714  -0.016327   
2 -0.057770  0.018334  0.042601  ...  -0.011123   0.010387   0.074528   
3 -0.048630 -0.013406 -0.024388  ...  -0.104926   0.042281   0.022375   
4 -0.011959 -0.054584 -0.058981  ...  -0.159260   0.032556  -0.005767   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   0.036943   0.130576   0.4098

# Lasso Regression

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error

# Function to load and inspect the dataset
def load_and_inspect_data(file_path, target_column):
    """
    Loads the dataset and separates features and target.

    Parameters:
        file_path (str): Path to the dataset.
        target_column (str): Name of the target column.

    Returns:
        X (numpy.ndarray): Features.
        y (numpy.ndarray): Target.
    """
    df = pd.read_csv(file_path)
    print("Dataset Preview:")
    print(df.head())
    print("\nColumns:", df.columns)
    X = df.drop(columns=[target_column]).values
    y = df[target_column].values
    return X, y, df

# Function to preprocess the features
def preprocess_features(X):
    """
    Scales the features using StandardScaler.

    Parameters:
        X (numpy.ndarray): Features to be scaled.

    Returns:
        X_scaled (numpy.ndarray): Scaled features.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

# Function to perform Grid Search for hyperparameter tuning
def tune_model(X, y, model, param_grid, cv_splits):
    """
    Tunes the model using GridSearchCV with cross-validation.

    Parameters:
        X (numpy.ndarray): Features.
        y (numpy.ndarray): Target.
        model: Machine learning model to tune.
        param_grid (dict): Hyperparameter grid for GridSearchCV.
        cv_splits (int): Number of cross-validation splits.

    Returns:
        best_model: The best estimator found by GridSearchCV.
        grid_search: The fitted GridSearchCV object.
    """
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)
    best_model = grid_search.best_estimator_
    return best_model, grid_search

# Function to evaluate the model
def evaluate_model(model, X, y, cv_splits):
    """
    Evaluates the model using cross-validation.

    Parameters:
        model: Trained machine learning model.
        X (numpy.ndarray): Features.
        y (numpy.ndarray): Target.
        cv_splits (int): Number of cross-validation splits.

    Returns:
        dict: Evaluation metrics (MSE, RMSE, R², Std Dev of MSE).
    """
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=42)
    mse_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    mse = -mse_scores.mean()
    rmse = np.sqrt(mse)
    r2 = r2_scores.mean()
    std_dev = np.std(mse_scores)
    return {"MSE": mse, "RMSE": rmse, "R²": r2, "Std Dev of MSE": std_dev}

# Main script
if __name__ == "__main__":
    # Parameters
    file_path = 'roberta_test_2.csv'
    target_column = 'output'
    param_grid = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}  # Regularization strength
    cv_splits = 10

    # Load and preprocess data
    X, y, df = load_and_inspect_data(file_path, target_column)
    X_scaled = preprocess_features(X)

    # Train the Lasso model
    model = Lasso()
    best_model, grid_search = tune_model(X_scaled, y, model, param_grid, cv_splits)

    # Sample data for faster computation
    df_sample = df.sample(n=500, random_state=42)
    X_sample = df_sample.drop(columns=[target_column]).values
    y_sample = df_sample[target_column].values
    X_sample_scaled = preprocess_features(X_sample)

    # Re-tune and evaluate on the sample
    best_model_sample, _ = tune_model(X_sample_scaled, y_sample, model, param_grid, cv_splits)
    metrics = evaluate_model(best_model_sample, X_sample_scaled, y_sample, cv_splits)

    # Display the results
    print(f"Best alpha (regularization strength): {grid_search.best_params_['alpha']}")
    print("\nEvaluation Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0 -0.002268 -0.057392 -0.051938 -0.254064 -0.107042 -0.005746  0.053339   
1 -0.095116  0.002254 -0.080440 -0.032033  0.325927 -0.089124  0.060066   
2  0.171062 -0.006862 -0.092904  0.191909 -0.059166  0.084494  0.057149   
3  0.091722 -0.023084 -0.170847  0.275267 -0.098605 -0.062822  0.026235   
4  0.058596 -0.003751 -0.051466  0.243470  0.019038 -0.036104 -0.019322   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.155956  0.116585 -0.038332  ...   0.072443   0.059975  -0.076614   
1  0.011424  0.037144 -0.126206  ...   0.060893  -0.161714  -0.016327   
2 -0.057770  0.018334  0.042601  ...  -0.011123   0.010387   0.074528   
3 -0.048630 -0.013406 -0.024388  ...  -0.104926   0.042281   0.022375   
4 -0.011959 -0.054584 -0.058981  ...  -0.159260   0.032556  -0.005767   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   0.036943   0.130576   0.4098

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best alpha (regularization strength): 0.01
MSE: 0.9328511603607741
RMSE: 0.9658422026194414
R²: 0.33879625535420954
Standard Deviation of MSE: 0.15535334313966961


# KNN

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Loads the dataset and separates features and target.

    Parameters:
        file_path (str): Path to the dataset.
        target_column (str): Name of the target column.

    Returns:
        X (DataFrame): Features.
        y (Series): Target.
    """
    data = pd.read_csv(file_path)
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return X, y

# Function to scale features
def scale_features(X_train, X_test):
    """
    Scales training and testing features using StandardScaler.

    Parameters:
        X_train (DataFrame): Training features.
        X_test (DataFrame): Testing features.

    Returns:
        X_train_scaled (ndarray): Scaled training features.
        X_test_scaled (ndarray): Scaled testing features.
        scaler (StandardScaler): The scaler object for future use.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, scaler

# Function to perform grid search
def perform_grid_search(X_train, y_train, model, param_grid, cv=10):
    """
    Performs grid search with cross-validation to find the best parameters.

    Parameters:
        X_train (ndarray): Training features.
        y_train (Series): Training target.
        model: The machine learning model.
        param_grid (dict): Hyperparameter grid for grid search.
        cv (int): Number of cross-validation folds.

    Returns:
        grid_search: The fitted GridSearchCV object.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=3)
    grid_search.fit(X_train, y_train)
    return grid_search

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """
    Evaluates the model on the test set.

    Parameters:
        model: Trained model.
        X_test (ndarray): Testing features.
        y_test (Series): Testing target.

    Returns:
        dict: A dictionary containing evaluation metrics.
    """
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    std_dev = np.std(y_pred)
    return {"MSE": mse, "RMSE": rmse, "R²": r2, "Std Dev": std_dev}

# Function to perform cross-validation
def cross_validate_model(model, X, y, cv=10):
    """
    Performs cross-validation and calculates MSE and RMSE.

    Parameters:
        model: Trained model.
        X (ndarray): Features.
        y (Series): Target.
        cv (int): Number of cross-validation folds.

    Returns:
        dict: A dictionary containing cross-validation metrics.
    """
    mse_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-mse_scores)
    return {"CV MSE": mse_scores, "CV RMSE": rmse_scores}

# Main script
if __name__ == "__main__":
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load dataset
    X, y = load_dataset(file_path, target_column)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale features
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

    # Define KNN model and parameter grid
    knn = KNeighborsRegressor()
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2]
    }

    # Perform grid search
    grid_search = perform_grid_search(X_train_scaled, y_train, knn, param_grid)

    # Get the best model
    best_knn = grid_search.best_estimator_

    # Evaluate the model on the test set
    test_metrics = evaluate_model(best_knn, X_test_scaled, y_test)
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Test Metrics: {test_metrics}")

    # Perform cross-validation on the entire dataset
    X_scaled = scaler.fit_transform(X)
    cv_metrics = cross_validate_model(best_knn, X_scaled, y)

    print(f"Cross-Validation MSE: {cv_metrics['CV MSE'].mean():.4f} ± {cv_metrics['CV MSE'].std():.4f}")
    print(f"Cross-Validation RMSE: {cv_metrics['CV RMSE'].mean():.4f} ± {cv_metrics['CV RMSE'].std():.4f}")


Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV 1/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.767 total time=   0.4s
[CV 2/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.820 total time=   0.0s
[CV 3/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.942 total time=   0.0s
[CV 4/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-1.047 total time=   0.0s
[CV 5/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-1.235 total time=   0.0s
[CV 6/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.761 total time=   0.1s
[CV 7/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.722 total time=   0.0s
[CV 8/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.711 total time=   0.0s
[CV 9/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.750 total time=   0.0s
[CV 10/10] END algorithm=auto, n_neig

# Polynomial Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Loads the dataset and separates features and target.

    Parameters:
        file_path (str): Path to the dataset.
        target_column (str): Name of the target column.

    Returns:
        X (DataFrame): Features.
        y (Series): Target.
    """
    data = pd.read_csv(file_path)
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return X, y

# Function to set up the polynomial regression pipeline
def create_pipeline():
    """
    Creates a pipeline with scaling, polynomial feature transformation, and linear regression.

    Returns:
        Pipeline: The configured pipeline.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures()),
        ('linear', LinearRegression())
    ])

# Function to perform grid search
def perform_grid_search(pipeline, X_train, y_train, param_grid, cv):
    """
    Performs grid search with cross-validation to find the best parameters.

    Parameters:
        pipeline (Pipeline): The pipeline for polynomial regression.
        X_train (DataFrame): Training features.
        y_train (Series): Training target.
        param_grid (dict): Hyperparameter grid for grid search.
        cv (int or cross-validation generator): Cross-validation strategy.

    Returns:
        grid_search (GridSearchCV): The fitted grid search object.
    """
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=2)
    grid_search.fit(X_train, y_train)
    return grid_search

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """
    Evaluates the model on the test set.

    Parameters:
        model: Trained model.
        X_test (DataFrame): Testing features.
        y_test (Series): Testing target.

    Returns:
        dict: A dictionary containing evaluation metrics.
    """
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    std_dev = np.std(y_pred)
    return {"RMSE": rmse, "R²": r2, "Std Dev": std_dev}

# Main script
if __name__ == "__main__":
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load dataset
    X, y = load_dataset(file_path, target_column)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create pipeline
    pipeline = create_pipeline()

    # Define parameter grid
    param_grid = {
        'poly__degree': [2, 3, 4],
        'linear__fit_intercept': [True, False]
    }

    # Set up cross-validation strategy
    cv = KFold(n_splits=10, shuffle=True, random_state=42)

    # Perform grid search
    grid_search = perform_grid_search(pipeline, X_train, y_train, param_grid, cv)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Evaluate the model
    test_metrics = evaluate_model(best_model, X_test, y_test)

    # Print results
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Test Metrics: {test_metrics}")


Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time= 1.1min
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  58.9s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  55.1s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  55.9s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  55.2s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  47.5s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  48.6s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  47.4s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  52.3s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  51.4s
[CV] END .........linear__fit_intercept=True, poly__degree=3; total time=   0.0s
[CV] END .........linear__fit_intercept=True, po

40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-pa

Best Parameters: {'linear__fit_intercept': True, 'poly__degree': 2}
Root Mean Squared Error (RMSE) on Test Set: 77570434032.98264
R² value on Test Set: -3.927784212579305e+21
Standard Deviation of Predictions: 76796804149.78


# Logistic Regression

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Load dataset and split into features and target.

    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        DataFrame, Series: Features (X) and target (y).
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y

# Function to preprocess the data
def preprocess_data(X, y, test_size=0.2, random_state=42):
    """
    Scale the features and split into training and testing sets.

    Parameters:
        X (DataFrame): Features.
        y (Series): Target variable.
        test_size (float): Test set proportion.
        random_state (int): Random seed.

    Returns:
        Tuple: Scaled and split data (X_train, X_test, y_train, y_test).
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return train_test_split(X_scaled, y, test_size=test_size, random_state=random_state)

# Function to perform grid search
def perform_grid_search(model, param_grid, X_train, y_train, cv=10):
    """
    Perform grid search for hyperparameter tuning.

    Parameters:
        model: The machine learning model to tune.
        param_grid (dict): Hyperparameter grid.
        X_train (array): Training features.
        y_train (array): Training target.
        cv (int): Number of cross-validation folds.

    Returns:
        GridSearchCV: Fitted grid search object.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    return grid_search

# Function to evaluate the model
def evaluate_model(model, X_test, y_test):
    """
    Evaluate the model on the test set.

    Parameters:
        model: Trained model.
        X_test (array): Test features.
        y_test (array): Test target.

    Returns:
        dict: Evaluation metrics (R², RMSE, standard deviation).
    """
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    std_dev = np.std(y_pred)
    return {"R²": r2, "RMSE": rmse, "Std Dev": std_dev}

# Main function to execute the workflow
def main():
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load and preprocess the dataset
    X, y = load_dataset(file_path, target_column)
    X_train, X_test, y_train, y_test = preprocess_data(X, y)

    # Define the Logistic Regression model and parameter grid
    model = LogisticRegression(max_iter=1000)
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'penalty': ['l2'],
    }

    # Perform grid search
    grid_search = perform_grid_search(model, param_grid, X_train, y_train)
    best_model = grid_search.best_estimator_

    # Display best parameters
    print("Best Parameters: ", grid_search.best_params_)

    # Evaluate the model on the test set
    test_metrics = evaluate_model(best_model, X_test, y_test)
    print("Test Metrics: ", test_metrics)

    # Perform cross-validation
    cross_val_scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
    cv_rmse = np.mean(np.sqrt(-cross_val_scores))
    print("Cross-validated RMSE: ", cv_rmse)

if __name__ == "__main__":
    main()


Best parameters found:  {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
R^2 Score:  0.352054964980268
RMSE:  0.9963031296745293
Cross-validated RMSE:  0.9779680670303673
Standard Deviation of Predictions: 1.1961071190749342


# Linear Regression

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Load the dataset and separate features and target.

    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        DataFrame, Series: Features (X) and target (y).
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column]).values
    y = df[target_column].values
    return X, y

# Function to preprocess the data
def preprocess_data(X, y, test_size=0.2, random_state=42):
    """
    Standardize the features.

    Parameters:
        X (array): Features.
        y (array): Target variable.

    Returns:
        array: Scaled features.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

# Function to perform grid search (though no hyperparameters for basic LinearRegression)
def perform_grid_search(model, X_scaled, y, cv=10):
    """
    Perform grid search for model tuning (even though no hyperparameters for basic LinearRegression).

    Parameters:
        model: The machine learning model to tune.
        X_scaled (array): Scaled features.
        y (array): Target variable.
        cv (int): Number of cross-validation folds.

    Returns:
        GridSearchCV: Fitted grid search object.
    """
    grid_params = {}  # No hyperparameters for basic LinearRegression
    grid_search = GridSearchCV(model, grid_params, cv=cv, scoring='neg_mean_squared_error')
    grid_search.fit(X_scaled, y)
    return grid_search

# Function to evaluate the model with cross-validation
def evaluate_model_with_cv(model, X_scaled, y, cv=10):
    """
    Evaluate the model using cross-validation.

    Parameters:
        model: The trained model.
        X_scaled (array): Scaled features.
        y (array): Target variable.
        cv (int): Number of cross-validation folds.

    Returns:
        dict: Evaluation metrics (MSE, RMSE, R², standard deviation of MSE).
    """
    mse_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='r2')
    
    mse = -mse_scores.mean()  # Convert negative MSE to positive
    rmse = np.sqrt(mse)       # RMSE
    r2 = r2_scores.mean()     # R²
    std_dev = np.std(mse_scores)  # Standard deviation of MSE
    
    return {"MSE": mse, "RMSE": rmse, "R²": r2, "Std Dev": std_dev}

# Main function to execute the workflow
def main():
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load and preprocess the dataset
    X, y = load_dataset(file_path, target_column)
    X_scaled = preprocess_data(X, y)

    # Initialize the Linear Regression model
    model = LinearRegression()

    # Perform grid search (though no hyperparameters for LinearRegression)
    grid_search = perform_grid_search(model, X_scaled, y)
    best_model = grid_search.best_estimator_

    # Evaluate the model with cross-validation
    metrics = evaluate_model_with_cv(best_model, X_scaled, y)
    print("Cross-Validation Metrics: ", metrics)

    # For a sample evaluation (optional for reduced dataset size)
    df_sample = pd.read_csv(file_path).sample(n=500, random_state=42)
    X_sample = df_sample.drop(columns=[target_column]).values
    y_sample = df_sample[target_column].values
    X_sample_scaled = StandardScaler().fit_transform(X_sample)

    # Perform evaluation on the sample
    metrics_sample = evaluate_model_with_cv(best_model, X_sample_scaled, y_sample)
    print("Sample Cross-Validation Metrics: ", metrics_sample)

if __name__ == "__main__":
    main()


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0 -0.002268 -0.057392 -0.051938 -0.254064 -0.107042 -0.005746  0.053339   
1 -0.095116  0.002254 -0.080440 -0.032033  0.325927 -0.089124  0.060066   
2  0.171062 -0.006862 -0.092904  0.191909 -0.059166  0.084494  0.057149   
3  0.091722 -0.023084 -0.170847  0.275267 -0.098605 -0.062822  0.026235   
4  0.058596 -0.003751 -0.051466  0.243470  0.019038 -0.036104 -0.019322   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.155956  0.116585 -0.038332  ...   0.072443   0.059975  -0.076614   
1  0.011424  0.037144 -0.126206  ...   0.060893  -0.161714  -0.016327   
2 -0.057770  0.018334  0.042601  ...  -0.011123   0.010387   0.074528   
3 -0.048630 -0.013406 -0.024388  ...  -0.104926   0.042281   0.022375   
4 -0.011959 -0.054584 -0.058981  ...  -0.159260   0.032556  -0.005767   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   0.036943   0.130576   0.4098

# Decision Tree Regression

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold

# Load the dataset
df= pd.read_csv('roberta_test_2.csv')

# Features and target
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimension reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Decision Tree Regression model
model = DecisionTreeRegressor()

# Parameter tuning using RandomizedSearchCV
param_dist = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(1, 5),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(random_search.best_estimator_, X_train, y_train, cv=kf)

# Predictions
y_pred = random_search.best_estimator_.predict(X_test)

# Performance metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Load the dataset and separate features and target.

    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        DataFrame, Series: Features (X) and target (y).
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y

# Function to preprocess the data (standardize the features)
def preprocess_data(X):
    """
    Standardize the features using StandardScaler.

    Parameters:
        X (DataFrame): Features.

    Returns:
        array: Scaled features.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

# Function to apply PCA for dimension reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Apply PCA for dimension reduction, retaining the specified variance.

    Parameters:
        X_scaled (array): Scaled features.
        variance_retained (float): Percentage of variance to retain (default 95%).

    Returns:
        array: Transformed features after PCA.
    """
    pca = PCA(n_components=variance_retained)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca

# Function to perform randomized search for hyperparameter tuning
def tune_model_with_random_search(model, X_train, y_train, param_dist, n_iter=10, cv=5):
    """
    Perform RandomizedSearchCV for model hyperparameter tuning.

    Parameters:
        model: The machine learning model to tune.
        X_train (array): Training features.
        y_train (array): Training target.
        param_dist (dict): Hyperparameter distribution for the model.
        n_iter (int): Number of iterations for RandomizedSearchCV (default 10).
        cv (int): Number of cross-validation folds (default 5).

    Returns:
        RandomizedSearchCV: Fitted RandomizedSearchCV object.
    """
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter, cv=cv, random_state=42)
    random_search.fit(X_train, y_train)
    return random_search

# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    """
    Calculate and return model evaluation metrics.

    Parameters:
        y_test (array): Actual target values.
        y_pred (array): Predicted target values.

    Returns:
        dict: Dictionary containing R², RMSE, MAE, mean value, and standard deviation of the output.
    """
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mean_value = np.mean(y_test)
    std_dev = np.std(y_test)
    return {"R^2": r2, "RMSE": rmse, "MAE": mae, "Mean of Output": mean_value, "Standard Deviation of Output": std_dev}

# Function to perform cross-validation
def cross_validate_model(model, X_train, y_train, cv=5):
    """
    Perform cross-validation and return the scores.

    Parameters:
        model: The trained model.
        X_train (array): Training features.
        y_train (array): Training target.
        cv (int): Number of cross-validation folds (default 5).

    Returns:
        array: Cross-validation scores.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    return cv_scores

# Main function to execute the workflow
def main():
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load the dataset
    X, y = load_dataset(file_path, target_column)

    # Preprocess the data (standardization)
    X_scaled = preprocess_data(X)

    # Apply PCA for dimension reduction
    X_pca = apply_pca(X_scaled)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

    # Initialize the Decision Tree model
    model = DecisionTreeRegressor()

    # Define parameter grid for RandomizedSearchCV
    param_dist = {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': np.arange(2, 10),
        'min_samples_leaf': np.arange(1, 5),
        'max_features': ['auto', 'sqrt', 'log2', None]
    }

    # Perform RandomizedSearchCV for model hyperparameter tuning
    random_search = tune_model_with_random_search(model, X_train, y_train, param_dist)

    # Perform cross-validation
    cv_scores = cross_validate_model(random_search.best_estimator_, X_train, y_train)

    # Make predictions on the test set
    y_pred = random_search.best_estimator_.predict(X_test)

    # Evaluate model performance
    metrics = evaluate_model(y_test, y_pred)

    # Output the results
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"R^2 Score: {metrics['R^2']}")
    print(f"RMSE: {metrics['RMSE']}")
    print(f"MAE: {metrics['MAE']}")
    print(f"Mean of Output: {metrics['Mean of Output']}")
    print(f"Standard Deviation of Output: {metrics['Standard Deviation of Output']}")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")

if __name__ == "__main__":
    main()

mean_value = np.mean(y_test)
std_dev = np.std(y_test)

# Print results
print(f"Best Parameters: {random_search.best_params_}")
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Mean of Output: {mean_value}")
print(f"Standard Deviation of Output: {std_dev}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean: {cv_scores.mean()}")
print(f"Cross-Validation Std Dev: {cv_scores.std()}")

Best Parameters: {'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 20}
R^2 Score: 0.27132947089194637
RMSE: 1.0565450358948874
MAE: 0.682410824108241
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [-0.06341399 -0.05463411  0.03253953 -0.08660526 -0.13790409]
Cross-Validation Mean: -0.062003585403908226
Cross-Validation Std Dev: 0.055427119347098035


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise Inva

# Bayesian Regression

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import BayesianRidge


# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Load the dataset and separate features and target.

    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        DataFrame, Series: Features (X) and target (y).
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y


# Function to preprocess the data (standardize the features)
def preprocess_data(X):
    """
    Standardize the features using StandardScaler.

    Parameters:
        X (DataFrame): Features.

    Returns:
        array: Scaled features.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled


# Function to apply PCA for dimension reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Apply PCA for dimension reduction, retaining the specified variance.

    Parameters:
        X_scaled (array): Scaled features.
        variance_retained (float): Percentage of variance to retain (default 95%).

    Returns:
        array: Transformed features after PCA.
    """
    pca = PCA(n_components=variance_retained)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca


# Function to perform randomized search for hyperparameter tuning
def tune_model_with_random_search(model, X_train, y_train, param_dist, n_iter=10, cv=5):
    """
    Perform RandomizedSearchCV for model hyperparameter tuning.

    Parameters:
        model: The machine learning model to tune.
        X_train (array): Training features.
        y_train (array): Training target.
        param_dist (dict): Hyperparameter distribution for the model.
        n_iter (int): Number of iterations for RandomizedSearchCV (default 10).
        cv (int): Number of cross-validation folds (default 5).

    Returns:
        RandomizedSearchCV: Fitted RandomizedSearchCV object.
    """
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter, cv=cv, random_state=42)
    random_search.fit(X_train, y_train)
    return random_search


# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    """
    Calculate and return model evaluation metrics.

    Parameters:
        y_test (array): Actual target values.
        y_pred (array): Predicted target values.

    Returns:
        dict: Dictionary containing R², RMSE, MAE, mean value, and standard deviation of the output.
    """
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mean_value = np.mean(y_test)
    std_dev = np.std(y_test)
    return {"R^2": r2, "RMSE": rmse, "MAE": mae, "Mean of Output": mean_value, "Standard Deviation of Output": std_dev}


# Function to perform cross-validation
def cross_validate_model(model, X_train, y_train, cv=5):
    """
    Perform cross-validation and return the scores.

    Parameters:
        model: The trained model.
        X_train (array): Training features.
        y_train (array): Training target.
        cv (int): Number of cross-validation folds (default 5).

    Returns:
        array: Cross-validation scores.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    return cv_scores


# Main function to execute the workflow
def main():
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load the dataset
    X, y = load_dataset(file_path, target_column)

    # Preprocess the data (standardization)
    X_scaled = preprocess_data(X)

    # Apply PCA for dimension reduction
    X_pca = apply_pca(X_scaled)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

    # Initialize the Bayesian Ridge model
    model = BayesianRidge()

    # Define parameter grid for RandomizedSearchCV
    param_dist = {
        'alpha_1': np.logspace(-6, -1, 6),
        'alpha_2': np.logspace(-6, -1, 6),
        'lambda_1': np.logspace(-6, -1, 6),
        'lambda_2': np.logspace(-6, -1, 6)
    }

    # Perform RandomizedSearchCV for model hyperparameter tuning
    random_search = tune_model_with_random_search(model, X_train, y_train, param_dist)

    # Perform cross-validation
    cv_scores = cross_validate_model(random_search.best_estimator_, X_train, y_train)

    # Make predictions on the test set
    y_pred = random_search.best_estimator_.predict(X_test)

    # Evaluate model performance
    metrics = evaluate_model(y_test, y_pred)

    # Output the results
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"R^2 Score: {metrics['R^2']}")
    print(f"RMSE: {metrics['RMSE']}")
    print(f"MAE: {metrics['MAE']}")
    print(f"Mean of Output: {metrics['Mean of Output']}")
    print(f"Standard Deviation of Output: {metrics['Standard Deviation of Output']}")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")

if __name__ == "__main__":
    main()


Best Parameters: {'lambda_2': 0.0001, 'lambda_1': 0.1, 'alpha_2': 0.1, 'alpha_1': 0.001}
R^2 Score: 0.43316623718084224
RMSE: 0.931859134434971
MAE: 0.7654710621707257
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.39647098 0.37840157 0.45102912 0.44629659 0.41795508]
Cross-Validation Mean: 0.4180306685016288
Cross-Validation Std Dev: 0.028011251967096144


# Random Forest Regression

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor


# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Load the dataset and separate features and target.

    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        DataFrame, Series: Features (X) and target (y).
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y


# Function to preprocess the data (standardize the features)
def preprocess_data(X):
    """
    Standardize the features using StandardScaler.

    Parameters:
        X (DataFrame): Features.

    Returns:
        array: Scaled features.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled


# Function to apply PCA for dimension reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Apply PCA for dimension reduction, retaining the specified variance.

    Parameters:
        X_scaled (array): Scaled features.
        variance_retained (float): Percentage of variance to retain (default 95%).

    Returns:
        array: Transformed features after PCA.
    """
    pca = PCA(n_components=variance_retained)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca


# Function to perform randomized search for hyperparameter tuning
def tune_model_with_random_search(model, X_train, y_train, param_dist, n_iter=5, cv=3):
    """
    Perform RandomizedSearchCV for model hyperparameter tuning.

    Parameters:
        model: The machine learning model to tune.
        X_train (array): Training features.
        y_train (array): Training target.
        param_dist (dict): Hyperparameter distribution for the model.
        n_iter (int): Number of iterations for RandomizedSearchCV (default 5).
        cv (int): Number of cross-validation folds (default 3).

    Returns:
        RandomizedSearchCV: Fitted RandomizedSearchCV object.
    """
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter, cv=cv, random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    return random_search


# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    """
    Calculate and return model evaluation metrics.

    Parameters:
        y_test (array): Actual target values.
        y_pred (array): Predicted target values.

    Returns:
        dict: Dictionary containing R², RMSE, MAE, mean value, and standard deviation of the output.
    """
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mean_value = np.mean(y_test)
    std_dev = np.std(y_test)
    return {"R^2": r2, "RMSE": rmse, "MAE": mae, "Mean of Output": mean_value, "Standard Deviation of Output": std_dev}


# Function to perform cross-validation
def cross_validate_model(model, X_train, y_train, cv=3):
    """
    Perform cross-validation and return the scores.

    Parameters:
        model: The trained model.
        X_train (array): Training features.
        y_train (array): Training target.
        cv (int): Number of cross-validation folds (default 3).

    Returns:
        array: Cross-validation scores.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    return cv_scores


# Main function to execute the workflow
def main():
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load the dataset
    X, y = load_dataset(file_path, target_column)

    # Preprocess the data (standardization)
    X_scaled = preprocess_data(X)

    # Apply PCA for dimension reduction
    X_pca = apply_pca(X_scaled)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

    # Initialize the Random Forest model
    model = RandomForestRegressor()

    # Define parameter grid for RandomizedSearchCV
    param_dist = {
        'n_estimators': [10, 50],  # Reduced number of estimators
        'max_depth': [None, 10],  # Reduced max_depth options
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]  # Keeping only one option for bootstrap
    }

    # Perform RandomizedSearchCV for model hyperparameter tuning
    random_search = tune_model_with_random_search(model, X_train, y_train, param_dist)

    # Perform cross-validation
    cv_scores = cross_validate_model(random_search.best_estimator_, X_train, y_train)

    # Make predictions on the test set
    y_pred = random_search.best_estimator_.predict(X_test)

    # Evaluate model performance
    metrics = evaluate_model(y_test, y_pred)

    # Output the results
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"R^2 Score: {metrics['R^2']}")
    print(f"RMSE: {metrics['RMSE']}")
    print(f"MAE: {metrics['MAE']}")
    print(f"Mean of Output: {metrics['Mean of Output']}")
    print(f"Standard Deviation of Output: {metrics['Standard Deviation of Output']}")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")


if __name__ == "__main__":
    main()


Best Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}
R^2 Score: 0.5354746999113791
RMSE: 0.8435815984811348
MAE: 0.6583421484214842
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.46013964 0.49983669 0.44386573]
Cross-Validation Mean: 0.46794735299501705
Cross-Validation Std Dev: 0.023507547769574297


## Gaussian Process Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel


# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Load the dataset and separate features and target.

    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        DataFrame, Series: Features (X) and target (y).
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y


# Function to preprocess the data (standardize the features)
def preprocess_data(X):
    """
    Standardize the features using StandardScaler.

    Parameters:
        X (DataFrame): Features.

    Returns:
        array: Scaled features.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled


# Function to apply PCA for dimension reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Apply PCA for dimension reduction, retaining the specified variance.

    Parameters:
        X_scaled (array): Scaled features.
        variance_retained (float): Percentage of variance to retain (default 95%).

    Returns:
        array: Transformed features after PCA.
    """
    pca = PCA(n_components=variance_retained)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca


# Function to perform randomized search for hyperparameter tuning
def tune_model_with_random_search(model, X_train, y_train, param_dist, n_iter=5, cv=3):
    """
    Perform RandomizedSearchCV for model hyperparameter tuning.

    Parameters:
        model: The machine learning model to tune.
        X_train (array): Training features.
        y_train (array): Training target.
        param_dist (dict): Hyperparameter distribution for the model.
        n_iter (int): Number of iterations for RandomizedSearchCV (default 5).
        cv (int): Number of cross-validation folds (default 3).

    Returns:
        RandomizedSearchCV: Fitted RandomizedSearchCV object.
    """
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter, cv=cv, random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    return random_search


# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    """
    Calculate and return model evaluation metrics.

    Parameters:
        y_test (array): Actual target values.
        y_pred (array): Predicted target values.

    Returns:
        dict: Dictionary containing R², RMSE, MAE, mean value, and standard deviation of the output.
    """
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mean_value = np.mean(y_test)
    std_dev = np.std(y_test)
    return {"R^2": r2, "RMSE": rmse, "MAE": mae, "Mean of Output": mean_value, "Standard Deviation of Output": std_dev}


# Function to perform cross-validation
def cross_validate_model(model, X_train, y_train, cv=3):
    """
    Perform cross-validation and return the scores.

    Parameters:
        model: The trained model.
        X_train (array): Training features.
        y_train (array): Training target.
        cv (int): Number of cross-validation folds (default 3).

    Returns:
        array: Cross-validation scores.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    return cv_scores


# Main function to execute the workflow
def main():
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load the dataset
    X, y = load_dataset(file_path, target_column)

    # Preprocess the data (standardization)
    X_scaled = preprocess_data(X)

    # Apply PCA for dimension reduction
    X_pca = apply_pca(X_scaled)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

    # Gaussian Process Regression model with kernel
    kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) + WhiteKernel()
    model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)

    # Define parameter grid for RandomizedSearchCV
    param_dist = {
        'alpha': [1e-10, 1e-5, 1e-2],  # Noise level
        'n_restarts_optimizer': [5, 10, 15],  # Number of restarts for optimizer
    }

    # Perform RandomizedSearchCV for model hyperparameter tuning
    random_search = tune_model_with_random_search(model, X_train, y_train, param_dist)

    # Perform cross-validation
    cv_scores = cross_validate_model(random_search.best_estimator_, X_train, y_train)

    # Make predictions on the test set
    y_pred = random_search.best_estimator_.predict(X_test)

    # Evaluate model performance
    metrics = evaluate_model(y_test, y_pred)

    # Output the results
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"R^2 Score: {metrics['R^2']}")
    print(f"RMSE: {metrics['RMSE']}")
    print(f"MAE: {metrics['MAE']}")
    print(f"Mean of Output: {metrics['Mean of Output']}")
    print(f"Standard Deviation of Output: {metrics['Standard Deviation of Output']}")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")


if __name__ == "__main__":
    main()




Best Parameters: {'n_restarts_optimizer': 15, 'alpha': 0.01}
R^2 Score: 0.5139741128828532
RMSE: 1.3720220137955668
MAE: 1.0169708484273436
Mean of Output: 1.9384615384615385
Standard Deviation of Output: 1.968028479131986
Cross-Validation Scores: [0.26014635 0.30029679 0.15823068]
Cross-Validation Mean: 0.23955793927816293
Cross-Validation Std Dev: 0.05979747276779415




## Support vector regression

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR

# Function to load the dataset
def load_dataset(file_path, target_column):
    """
    Load the dataset and separate features and target.

    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        DataFrame, Series: Features (X) and target (y).
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y


# Function to preprocess the data (standardize the features)
def preprocess_data(X):
    """
    Standardize the features using StandardScaler.

    Parameters:
        X (DataFrame): Features.

    Returns:
        array: Scaled features.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled


# Function to apply PCA for dimension reduction
def apply_pca(X_scaled, variance_retained=0.95):
    """
    Apply PCA for dimension reduction, retaining the specified variance.

    Parameters:
        X_scaled (array): Scaled features.
        variance_retained (float): Percentage of variance to retain (default 95%).

    Returns:
        array: Transformed features after PCA.
    """
    pca = PCA(n_components=variance_retained)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca


# Function to perform randomized search for hyperparameter tuning
def tune_model_with_random_search(model, X_train, y_train, param_dist, n_iter=5, cv=3):
    """
    Perform RandomizedSearchCV for model hyperparameter tuning.

    Parameters:
        model: The machine learning model to tune.
        X_train (array): Training features.
        y_train (array): Training target.
        param_dist (dict): Hyperparameter distribution for the model.
        n_iter (int): Number of iterations for RandomizedSearchCV (default 5).
        cv (int): Number of cross-validation folds (default 3).

    Returns:
        RandomizedSearchCV: Fitted RandomizedSearchCV object.
    """
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter, cv=cv, random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    return random_search


# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    """
    Calculate and return model evaluation metrics.

    Parameters:
        y_test (array): Actual target values.
        y_pred (array): Predicted target values.

    Returns:
        dict: Dictionary containing R², RMSE, MAE, mean value, and standard deviation of the output.
    """
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mean_value = np.mean(y_test)
    std_dev = np.std(y_test)
    return {"R^2": r2, "RMSE": rmse, "MAE": mae, "Mean of Output": mean_value, "Standard Deviation of Output": std_dev}


# Function to perform cross-validation
def cross_validate_model(model, X_train, y_train, cv=3):
    """
    Perform cross-validation and return the scores.

    Parameters:
        model: The trained model.
        X_train (array): Training features.
        y_train (array): Training target.
        cv (int): Number of cross-validation folds (default 3).

    Returns:
        array: Cross-validation scores.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    return cv_scores


# Main function to execute the workflow
def main():
    # File path and target column
    file_path = 'roberta_test_2.csv'
    target_column = 'output'

    # Load the dataset
    X, y = load_dataset(file_path, target_column)

    # Preprocess the data (standardization)
    X_scaled = preprocess_data(X)

    # Apply PCA for dimension reduction
    X_pca = apply_pca(X_scaled)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

    # Support Vector Regression model
    model = SVR()

    # Define parameter grid for RandomizedSearchCV
    param_dist = {
        'kernel': ['linear', 'rbf', 'poly'],  # Different kernel functions
        'C': [0.1, 1, 10, 100],  # Regularization parameter
        'epsilon': [0.1, 0.2, 0.5, 0.3],  # Epsilon for the epsilon-tube
        'degree': [2, 3],  # Degree of the polynomial kernel (if applicable)
    }

    # Perform RandomizedSearchCV for model hyperparameter tuning
    random_search = tune_model_with_random_search(model, X_train, y_train, param_dist)

    # Perform cross-validation
    cv_scores = cross_validate_model(random_search.best_estimator_, X_train, y_train)

    # Make predictions on the test set
    y_pred = random_search.best_estimator_.predict(X_test)

    # Evaluate model performance
    metrics = evaluate_model(y_test, y_pred)

    # Output the results
    print(f"Best Parameters: {random_search.best_params_}")
    print(f"R^2 Score: {metrics['R^2']}")
    print(f"RMSE: {metrics['RMSE']}")
    print(f"MAE: {metrics['MAE']}")
    print(f"Mean of Output: {metrics['Mean of Output']}")
    print(f"Standard Deviation of Output: {metrics['Standard Deviation of Output']}")
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Cross-Validation Mean: {cv_scores.mean()}")
    print(f"Cross-Validation Std Dev: {cv_scores.std()}")


if __name__ == "__main__":
    main()


Best Parameters: {'kernel': 'rbf', 'epsilon': 0.3, 'degree': 3, 'C': 100}
R^2 Score: 0.4892619949836724
RMSE: 1.4064699791146675
MAE: 1.1220054585976629
Mean of Output: 1.9384615384615385
Standard Deviation of Output: 1.968028479131986
Cross-Validation Scores: [0.30643979 0.32267058 0.2641492 ]
Cross-Validation Mean: 0.29775318831807834
Cross-Validation Std Dev: 0.024668205101982726
