# K-Nearest Neighbors (KNN) Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor

# Function to load dataset
def load_data(file_path, target_column):
    """
    Load dataset and separate features and target column.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        X, y (DataFrame, Series): Features and target data.
    """
    data = pd.read_csv(file_path)
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return X, y

# Function to perform grid search for best model
def grid_search_knn(X_train, y_train, param_grid):
    """
    Perform GridSearchCV for KNN model hyperparameter tuning.

    Parameters:
        X_train (DataFrame): Training features.
        y_train (Series): Training target.
        param_grid (dict): Hyperparameter grid for tuning.

    Returns:
        GridSearchCV: Best KNN model found from grid search.
    """
    knn = KNeighborsRegressor()
    grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error', verbose=2)
    grid_search.fit(X_train, y_train)
    return grid_search

# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    """
    Calculate performance metrics for the model.

    Parameters:
        y_test (array): True target values.
        y_pred (array): Predicted target values.

    Returns:
        dict: Dictionary containing R², RMSE, MAE, mean, and std dev.
    """
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    output_mean = np.mean(y_test)
    output_std_dev = np.std(y_test)
    return {
        "R² Score": r2, 
        "RMSE": rmse,


Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.4s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=distance; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, 

# Polynomial Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Function to load and prepare the dataset
def load_and_prepare_data(file_path, target_column):
    """
    Load dataset and separate features and target column.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): Name of the target column.

    Returns:
        X (DataFrame): Feature data.
        y (Series): Target data.
    """
    data = pd.read_csv(file_path)
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return X, y

# Function to create a pipeline with PolynomialFeatures and LinearRegression
def create_pipeline():
    """
    Create a pipeline for scaling, polynomial feature transformation, and linear regression.
    
    Returns:
        Pipeline: A scikit-learn pipeline.
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize features
        ('poly', PolynomialFeatures()),  # Polynomial features transformation
        ('linear', LinearRegression())  # Linear regression model
    ])
    return pipeline

# Function for hyperparameter tuning using GridSearchCV
def perform_grid_search(pipeline, X_train, y_train, param_grid, cv):
    """
    Perform GridSearchCV to find the best model and parameters.
    
    Parameters:
        pipeline (Pipeline): The pipeline to be used for grid search.
        X_train (DataFrame): Training features.
        y_train (Series): Training target.
        param_grid (dict): Hyperparameter grid for tuning.
        cv (KFold): Cross-validation strategy.
        
    Returns:
        GridSearchCV: The best model from grid search.
    """
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=2)
    grid_search.fit(X_train, y_train)
    return grid_search

# Function to evaluate the model on the test set
def evaluate_model(y_test, y_pred):
    """
    Evaluate model performance using R², RMSE, MAE, and output statistics.
    
    Parameters:
        y_test (array): True target values.
        y_pred (array): Predicted target values.
        
    Returns:
        dict: Dictionary containing evaluation metrics.
    """
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    output_mean = np.mean(y_test)
    output_std_dev = np.std(y_test)
    
    return {
        "R² Score": r2, 
        "RMSE": rmse, 
        "MAE": mae, 
        "Mean of Output": output_mean, 
        "Standard Deviation of Output": output_std_dev
    }

# Function for cross-validation
def cross_validate_model(best_model, X, y, cv):
    """
    Perform cross-validation on the best model and return cross-validation scores.
    
    Parameters:
        best_model (estimator): The best model from grid search.
        X (DataFrame): Feature data.
        y (Series): Target data.
        cv (KFold): Cross-validation strategy.
        
    Returns:
        dict: Dictionary containing cross-validation scores and statistics.
    """
    cv_scores = cross_val_score(best_model, X, y, cv=cv, scoring='neg_mean_squared_error')
    cv_scores = np.sqrt(-cv_scores)  # Convert to positive RMSE values
    cv_mean = np.mean(cv_scores)
    cv_std_dev = np.std(cv_scores)
    
    return {
        "Cross-Validation Scores": cv_scores,
        "Cross-Validation Mean": cv_mean,
        "Cross-Validation Std Dev": cv_std_dev
    }

# Function to print results
def print_results(grid_search, evaluation_metrics, cross_validation_results):
    """
    Print the results of grid search, evaluation metrics, and cross-validation.
    
    Parameters:
        grid_search (GridSearchCV): The grid search results.
        evaluation_metrics (dict): Dictionary containing evaluation metrics.
        cross_validation_results (dict): Dictionary containing cross-validation results.
    """
    print(f"Best Parameters: {grid_search.best_params_}")
    for metric, value in evaluation_metrics.items():
        print(f"{metric}: {value}")
    for metric, value in cross_validation_results.items():
        print(f"{metric}: {value}")

# Main function to run the pipeline
def main():
    # Load the dataset
    file_path = 'gpt2_test_2.csv'
    target_column = 'output'
    X, y = load_and_prepare_data(file_path, target_column)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create pipeline
    pipeline = create_pipeline()

    # Define parameter grid for grid search
    param_grid = {
        'poly__degree': [2, 3, 4],  # Testing various degrees of polynomial features
        'linear__fit_intercept': [True, False]  # Fit intercept or not
    }

    # Set up cross-validation
    cv = KFold(n_splits=10, shuffle=True, random_state=42)

    # Perform grid search
    grid_search = perform_grid_search(pipeline, X_train, y_train, param_grid, cv)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predict on the test set
    y_pred = best_model.predict(X_test)

    # Evaluate the model
    evaluation_metrics = evaluate_model(y_test, y_pred)

    # Perform cross-validation
    cross_validation_results = cross_validate_model(best_model, X, y, cv)

    # Print all results
    print_results(grid_search, evaluation_metrics, cross_validation_results)

# Run the main function
if __name__ == "__main__":
    main()


Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time= 1.1min
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time= 1.0min
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time= 1.1min
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time= 1.1min
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  52.0s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  52.3s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  51.2s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  46.6s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  56.4s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  46.6s
[CV] END .........linear__fit_intercept=True, poly__degree=3; total time=   0.0s
[CV] END .........linear__fit_intercept=True, po

40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-pa

Best Parameters: {'linear__fit_intercept': True, 'poly__degree': 2}
R² Score: -1.379757815705473e+22
RMSE: 145386420065.53198
MAE: 79075578033.52428
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [1.97831501e+12 3.23862638e+11 3.59077655e+11 3.68109949e+11
 1.93279233e+11 3.07736375e+11 1.24655787e+11 4.48964590e+11
 3.08282727e+11 3.00107986e+11]
Cross-Validation Mean: 471239195383.9152
Cross-Validation Std Dev: 509574827739.6164


# Logistic Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Function to load and prepare data
def load_and_prepare_data(file_path, target_column):
    """
    Load dataset and separate features and target column.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): The target column name in the dataset.

    Returns:
        X (DataFrame): Feature data.
        y (Series): Target data.
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y

# Function to standardize features
def standardize_features(X):
    """
    Standardize features by applying StandardScaler.
    
    Parameters:
        X (DataFrame): Feature data.

    Returns:
        X_scaled (array): Scaled feature data.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

# Function to perform grid search for hyperparameter tuning
def perform_grid_search(model, param_grid, X_train, y_train):
    """
    Perform GridSearchCV to find the best model and hyperparameters.
    
    Parameters:
        model (estimator): Model to tune.
        param_grid (dict): Hyperparameter grid for tuning.
        X_train (DataFrame): Training feature data.
        y_train (Series): Training target data.
        
    Returns:
        grid_search (GridSearchCV): The grid search object with the best model.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    return grid_search

# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    """
    Evaluate model performance using R², RMSE, MAE, and output statistics.
    
    Parameters:
        y_test (array): True target values.
        y_pred (array): Predicted target values.
        
    Returns:
        dict: Evaluation metrics including R², RMSE, MAE, and output statistics.
    """
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    output_mean = np.mean(y_test)
    output_std_dev = np.std(y_test)
    
    return {
        "R² Score": r2, 
        "RMSE": rmse, 
        "MAE": mae, 
        "Mean of Output": output_mean, 
        "Standard Deviation of Output": output_std_dev
    }

# Function to perform cross-validation and calculate RMSE
def cross_validate_model(model, X_train, y_train):
    """
    Perform cross-validation on the model and return cross-validation scores.
    
    Parameters:
        model (estimator): The trained model.
        X_train (DataFrame): Feature data for cross-validation.
        y_train (Series): Target data for cross-validation.
        
    Returns:
        dict: Cross-validation scores and statistics (mean, std deviation).
    """
    cross_val_scores = cross_val_score(model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
    cv_scores = np.sqrt(-cross_val_scores)  # Convert to positive RMSE values
    cv_mean = np.mean(cv_scores)
    cv_std_dev = np.std(cv_scores)
    
    return {
        "Cross-Validation Scores": cv_scores,
        "Cross-Validation Mean": cv_mean,
        "Cross-Validation Std Dev": cv_std_dev
    }

# Function to print results
def print_results(grid_search, evaluation_metrics, cross_validation_results, std_dev):
    """
    Print the results from grid search, model evaluation, and cross-validation.
    
    Parameters:
        grid_search (GridSearchCV): The grid search object with best parameters.
        evaluation_metrics (dict): Dictionary with evaluation metrics.
        cross_validation_results (dict): Dictionary with cross-validation results.
        std_dev (float): Standard deviation of model predictions.
    """
    print(f"Best Parameters: {grid_search.best_params_}")
    for metric, value in evaluation_metrics.items():
        print(f"{metric}: {value}")
    for metric, value in cross_validation_results.items():
        print(f"{metric}: {value}")
    print(f"Standard Deviation of Predictions: {std_dev}")

# Main function to run the complete process
def main():
    # Load and prepare data
    file_path = 'gpt2_test_2.csv'
    target_column = 'output'
    X, y = load_and_prepare_data(file_path, target_column)
    
    # Standardize features
    X_scaled = standardize_features(X)
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    # Define the Logistic Regression model
    model = LogisticRegression(max_iter=1000)
    
    # Define parameter grid for grid search
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'penalty': ['l2'],  # l2 regularization for Logistic Regression
    }
    
    # Perform grid search
    grid_search = perform_grid_search(model, param_grid, X_train, y_train)
    
    # Get the best model from grid search
    best_model = grid_search.best_estimator_
    
    # Predict on the test set
    y_pred = best_model.predict(X_test)
    
    # Evaluate the model
    evaluation_metrics = evaluate_model(y_test, y_pred)
    
    # Perform cross-validation
    cross_validation_results = cross_validate_model(best_model, X_train, y_train)
    
    # Calculate Standard Deviation of Predictions
    std_dev = np.std(y_pred)
    
    # Print all results
    print_results(grid_search, evaluation_metrics, cross_validation_results, std_dev)

# Run the main function
if __name__ == "__main__":
    main()


Best parameters found:  {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
R² Score: 0.28220215451345676
RMSE: 1.0486329178077347
MAE: 0.5977859778597786
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [1.03161048 0.98614252 0.98614252 1.0794154  1.04527154 0.89235436
 0.9953596  0.81649658 1.01835015 0.94770678]
Cross-Validation Mean: 0.9798849937880721
Cross-Validation Std Dev: 0.07351693517584089
Standard Deviation of Predictions: 1.214498723175727


# Linear Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Function to load and prepare the data
def load_and_prepare_data(file_path, target_column):
    """
    Load and prepare the dataset by separating features and target.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): The target column name in the dataset.
        
    Returns:
        X (ndarray): Features data.
        y (ndarray): Target data.
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column]).values
    y = df[target_column].values
    return X, y

# Function to normalize the data
def normalize_data(X):
    """
    Normalize features using StandardScaler.
    
    Parameters:
        X (ndarray): Feature data.
        
    Returns:
        X_scaled (ndarray): Scaled feature data.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, scaler

# Function to perform grid search with cross-validation
def perform_grid_search(model, X, y, param_grid=None, cv_folds=10):
    """
    Perform GridSearchCV to find the best model and hyperparameters.
    
    Parameters:
        model (estimator): The model to tune.
        X (ndarray): Feature data.
        y (ndarray): Target data.
        param_grid (dict): Hyperparameter grid for tuning (default is empty for LinearRegression).
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        grid_search (GridSearchCV): The grid search object with the best model.
    """
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)
    return grid_search

# Function to evaluate the model
def evaluate_model(model, X, y, cv_folds=10):
    """
    Evaluate the model using cross-validation and calculate metrics like MSE, RMSE, R², etc.
    
    Parameters:
        model (estimator): The trained model.
        X (ndarray): Feature data.
        y (ndarray): Target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        metrics (dict): Calculated metrics including RMSE, MAE, R², and cross-validation results.
    """
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    # Cross-validation scores for MSE and R²
    mse_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    
    # Calculate metrics
    mse_mean = -mse_scores.mean()  # Convert negative MSE to positive
    rmse = np.sqrt(mse_mean)       # RMSE
    mae = mean_absolute_error(y, model.predict(X))  # MAE
    r2_mean = r2_scores.mean()     # R²
    std_dev_mse = np.std(mse_scores)  # Standard deviation of MSE
    
    # Cross-validation RMSE
    cv_scores = np.sqrt(-mse_scores)  # Convert negative MSE to positive RMSE
    cv_mean = cv_scores.mean()        # Cross-validation mean RMSE
    cv_std_dev = cv_scores.std()      # Cross-validation std dev RMSE
    
    # Additional statistics
    output_mean = np.mean(y)         # Mean of Output
    output_std_dev = np.std(y)       # Standard deviation of Output
    
    # Store metrics in a dictionary
    metrics = {
        "RMSE": rmse,
        "MAE": mae,
        "R²": r2_mean,
        "Mean of Output": output_mean,
        "Standard Deviation of Output": output_std_dev,
        "Cross-Validation Scores": cv_scores,
        "Cross-Validation Mean": cv_mean,
        "Cross-Validation Std Dev": cv_std_dev,
        "Standard Deviation of MSE": std_dev_mse
    }
    
    return metrics

# Function to display results
def print_results(metrics):
    """
    Print the evaluation metrics.
    
    Parameters:
        metrics (dict): Dictionary of calculated metrics.
    """
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

# Main function to run the process
def main():
    # Load and prepare data
    file_path = 'gpt2_test_2.csv'
    target_column = 'output'
    X, y = load_and_prepare_data(file_path, target_column)
    
    # Normalize the data
    X_scaled, scaler = normalize_data(X)
    
    # Initialize the Linear Regression model
    model = LinearRegression()
    
    # Perform grid search (no hyperparameters for LinearRegression)
    grid_search = perform_grid_search(model, X_scaled, y)
    
    # Get the best model from grid search
    best_model = grid_search.best_estimator_
    
    # Evaluate the model using cross-validation and calculate metrics
    metrics = evaluate_model(best_model, X_scaled, y)
    
    # Print the results
    print_results(metrics)

# Run the main function
if __name__ == "__main__":
    main()


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5    embed_6  \
0  0.088828 -0.161809 -0.061903 -0.041983 -0.216466 -0.119152  11.951169   
1  0.146692 -0.119815 -0.979210  0.085807  0.008980 -0.136545   1.863782   
2  0.018409 -0.323487 -0.255188 -0.175120  0.453114  0.234411   7.320464   
3 -0.189907 -0.307564 -0.385835 -0.038812  0.373633  0.196549   9.743107   
4 -0.007651 -0.369977 -0.555113 -0.006253  0.088651  0.129726   7.823362   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.126399  0.356953  0.219979  ...   1.018518  -0.063625  -0.211650   
1  0.073784  0.191401  0.213136  ...   0.062188   0.018683  -0.260519   
2 -0.036020 -0.241820 -0.175831  ...  -0.228277  -0.186306   0.217426   
3 -0.085457 -0.206629 -0.151518  ...   0.292949  -0.014901   0.475006   
4  0.179337 -0.134409  0.088882  ...  -0.025282   0.461727   0.503787   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   4.267710   0.141019   

# Lasso Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Function to load and prepare the data
def load_and_prepare_data(file_path, target_column):
    """
    Load and prepare the dataset by separating features and target.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): The target column name in the dataset.
        
    Returns:
        X (ndarray): Features data.
        y (ndarray): Target data.
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column]).values
    y = df[target_column].values
    return X, y

# Function to normalize the data
def normalize_data(X):
    """
    Normalize features using StandardScaler.
    
    Parameters:
        X (ndarray): Feature data.
        
    Returns:
        X_scaled (ndarray): Scaled feature data.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, scaler

# Function to perform grid search with cross-validation
def perform_grid_search(model, X, y, param_grid, cv_folds=10):
    """
    Perform GridSearchCV to find the best model and hyperparameters.
    
    Parameters:
        model (estimator): The model to tune.
        X (ndarray): Feature data.
        y (ndarray): Target data.
        param_grid (dict): Hyperparameter grid for tuning.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        grid_search (GridSearchCV): The grid search object with the best model.
    """
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)
    return grid_search

# Function to evaluate the model
def evaluate_model(model, X, y, cv_folds=10):
    """
    Evaluate the model using cross-validation and calculate metrics like MSE, RMSE, R², etc.
    
    Parameters:
        model (estimator): The trained model.
        X (ndarray): Feature data.
        y (ndarray): Target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        metrics (dict): Calculated metrics including RMSE, MAE, R², and cross-validation results.
    """
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    # Cross-validation scores for MSE and R²
    mse_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    r2_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    
    # Calculate metrics
    mse_mean = -mse_scores.mean()  # Convert negative MSE to positive
    rmse = np.sqrt(mse_mean)       # RMSE
    mae = mean_absolute_error(y, model.predict(X))  # MAE
    r2_mean = r2_scores.mean()     # R²
    std_dev_mse = np.std(mse_scores)  # Standard deviation of MSE
    
    # Cross-validation RMSE
    cv_scores = np.sqrt(-mse_scores)  # Convert negative MSE to positive RMSE
    cv_mean = cv_scores.mean()        # Cross-validation mean RMSE
    cv_std_dev = cv_scores.std()      # Cross-validation std dev RMSE
    
    # Additional statistics
    output_mean = np.mean(y)         # Mean of Output
    output_std_dev = np.std(y)       # Standard deviation of Output
    
    # Store metrics in a dictionary
    metrics = {
        "RMSE": rmse,
        "MAE": mae,
        "R²": r2_mean,
        "Mean of Output": output_mean,
        "Standard Deviation of Output": output_std_dev,
        "Cross-Validation Scores": cv_scores,
        "Cross-Validation Mean": cv_mean,
        "Cross-Validation Std Dev": cv_std_dev,
        "Standard Deviation of MSE": std_dev_mse
    }
    
    return metrics

# Function to display results
def print_results(metrics):
    """
    Print the evaluation metrics.
    
    Parameters:
        metrics (dict): Dictionary of calculated metrics.
    """
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

# Main function to run the process
def main():
    # Load and prepare data
    file_path = 'gpt2_test_2.csv'
    target_column = 'output'
    X, y = load_and_prepare_data(file_path, target_column)
    
    # Normalize the data
    X_scaled, scaler = normalize_data(X)
    
    # Initialize the Lasso Regression model
    model = Lasso()
    
    # Perform grid search with hyperparameter tuning
    grid_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}  # Regularization strength for Lasso
    grid_search = perform_grid_search(model, X_scaled, y, grid_params)
    
    # Get the best model from grid search
    best_model = grid_search.best_estimator_
    
    # Evaluate the model using cross-validation and calculate metrics
    metrics = evaluate_model(best_model, X_scaled, y)
    
    # Print the results
    print_results(metrics)

# Run the main function
if __name__ == "__main__":
    main()


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5    embed_6  \
0  0.088828 -0.161809 -0.061903 -0.041983 -0.216466 -0.119152  11.951169   
1  0.146692 -0.119815 -0.979210  0.085807  0.008980 -0.136545   1.863782   
2  0.018409 -0.323487 -0.255188 -0.175120  0.453114  0.234411   7.320464   
3 -0.189907 -0.307564 -0.385835 -0.038812  0.373633  0.196549   9.743107   
4 -0.007651 -0.369977 -0.555113 -0.006253  0.088651  0.129726   7.823362   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.126399  0.356953  0.219979  ...   1.018518  -0.063625  -0.211650   
1  0.073784  0.191401  0.213136  ...   0.062188   0.018683  -0.260519   
2 -0.036020 -0.241820 -0.175831  ...  -0.228277  -0.186306   0.217426   
3 -0.085457 -0.206629 -0.151518  ...   0.292949  -0.014901   0.475006   
4  0.179337 -0.134409  0.088882  ...  -0.025282   0.461727   0.503787   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   4.267710   0.141019   

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best alpha (regularization strength): 0.1
R² Score: 0.2175104477608861
RMSE: 1.055485375336476
MAE: 0.8255355608454793
Mean of Output: 3.37
Standard Deviation of Output: 1.2120643547270913
Cross-Validation Scores: [0.97409814 1.07351147 1.1636563  1.1322387  1.00647335 1.18236412
 1.026405   0.99413963 0.96097173 1.01334999]
Cross-Validation Mean: 1.0527208450637222
Cross-Validation Std Dev: 0.07634264809072241
Standard Deviation of MSE: 0.16395240909413894


# Decision Tree Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Function to load and prepare data
def load_and_prepare_data(file_path, target_column):
    """
    Load and prepare the dataset by separating features and target.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): The target column name in the dataset.
        
    Returns:
        X (ndarray): Features data.
        y (ndarray): Target data.
    """
    df = pd.read_csv(file_path)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y

# Function to normalize and apply PCA
def normalize_and_reduce_dimensionality(X, variance_threshold=0.95):
    """
    Normalize the features and apply PCA for dimensionality reduction.
    
    Parameters:
        X (ndarray): Feature data.
        variance_threshold (float): The proportion of variance to retain with PCA.
        
    Returns:
        X_pca (ndarray): PCA-transformed feature data.
    """
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Apply PCA to retain 'variance_threshold' proportion of variance
    pca = PCA(n_components=variance_threshold)
    X_pca = pca.fit_transform(X_scaled)
    
    return X_pca

# Function to perform RandomizedSearchCV and return best model
def tune_model(model, param_dist, X_train, y_train, cv_folds=5):
    """
    Tune the model using RandomizedSearchCV.
    
    Parameters:
        model (estimator): The model to tune.
        param_dist (dict): Hyperparameter distribution for RandomizedSearchCV.
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        best_model (estimator): The best model found by RandomizedSearchCV.
    """
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=cv_folds, random_state=42)
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_, random_search.best_params_

# Function to evaluate the model and calculate metrics
def evaluate_model(model, X_test, y_test, X_train, y_train, cv_folds=5):
    """
    Evaluate the model using various metrics: R², RMSE, MAE, etc.
    
    Parameters:
        model (estimator): The trained model.
        X_test (ndarray): Test feature data.
        y_test (ndarray): Test target data.
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        metrics (dict): Calculated metrics.
    """
    # Cross-validation scores
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    
    # Predictions on test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mean_value = np.mean(y_test)
    std_dev = np.std(y_test)
    cv_mean = cv_scores.mean()
    cv_std_dev = cv_scores.std()

    # Store the metrics in a dictionary
    metrics = {
        "R^2 Score": r2,
        "RMSE": rmse,
        "MAE": mae,
        "Mean of Output": mean_value,
        "Standard Deviation of Output": std_dev,
        "Cross-Validation Scores": cv_scores,
        "Cross-Validation Mean": cv_mean,
        "Cross-Validation Std Dev": cv_std_dev
    }
    
    return metrics

# Function to display results
def print_results(metrics):
    """
    Print the evaluation metrics in a readable format.
    
    Parameters:
        metrics (dict): Dictionary of calculated metrics.
    """
    for metric, value in metrics.items():
        if isinstance(value, np.ndarray):
            print(f"{metric}: {value[:5]}... (first 5 values)")
        else:
            print(f"{metric}: {value}")

# Main function to run the process
def main():
    # Load and prepare data
    file_path = 'gpt2_test_2.csv'
    target_column = 'output'
    X, y = load_and_prepare_data(file_path, target_column)
    
    # Normalize the data and apply PCA
    X_pca = normalize_and_reduce_dimensionality(X)
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
    
    # Initialize the Decision Tree Regressor
    model = DecisionTreeRegressor()
    
    # Parameter tuning using RandomizedSearchCV
    param_dist = {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': np.arange(2, 10),
        'min_samples_leaf': np.arange(1, 5),
        'max_features': ['auto', 'sqrt', 'log2', None]
    }
    
    best_model, best_params = tune_model(model, param_dist, X_train, y_train)
    
    # Evaluate the model
    metrics = evaluate_model(best_model, X_test, y_test, X_train, y_train)
    
    # Print results
    print(f"Best Parameters: {best_params}")
    print_results(metrics)

# Run the main function
if __name__ == "__main__":
    main()


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise Inva

Best Parameters: {'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': None, 'max_depth': 40}
R^2 Score: 0.24997756848855368
RMSE: 1.0719129989041416
MAE: 0.7497012827271129
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [ 0.18342688 -0.04131913  0.2562512   0.18414116 -0.01076239]
Cross-Validation Mean: 0.11434754471104336
Cross-Validation Std Dev: 0.11803758848966257


# Bayesian Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import BayesianRidge

# Function to load and prepare data
def load_and_prepare_data(file_path, target_column):
    """
    Load and prepare the dataset by separating features and target.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): The target column name in the dataset.
        
    Returns:
        X (ndarray): Features data.
        y (ndarray): Target data.
    """
    try:
        df = pd.read_csv(file_path)
        X = df.drop(columns=[target_column])
        y = df[target_column]
        return X, y
    except Exception as e:
        print(f"Error loading data: {e}")
        return None, None

# Function to normalize and apply PCA
def normalize_and_reduce_dimensionality(X, variance_threshold=0.95):
    """
    Normalize the features and apply PCA for dimensionality reduction.
    
    Parameters:
        X (ndarray): Feature data.
        variance_threshold (float): The proportion of variance to retain with PCA.
        
    Returns:
        X_pca (ndarray): PCA-transformed feature data.
        pca (PCA object): Fitted PCA model to access explained variance.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    pca = PCA(n_components=variance_threshold)
    X_pca = pca.fit_transform(X_scaled)
    
    # Print the explained variance ratio of the components
    print(f"Explained variance ratio by components: {pca.explained_variance_ratio_}")
    print(f"Total variance retained: {np.sum(pca.explained_variance_ratio_)}")
    
    return X_pca, pca

# Function to perform RandomizedSearchCV and return best model
def tune_model(model, param_dist, X_train, y_train, cv_folds=5):
    """
    Tune the model using RandomizedSearchCV.
    
    Parameters:
        model (estimator): The model to tune.
        param_dist (dict): Hyperparameter distribution for RandomizedSearchCV.
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        best_model (estimator): The best model found by RandomizedSearchCV.
    """
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=20, cv=cv_folds, random_state=42)  # Increased n_iter for better search
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_, random_search.best_params_

# Function to evaluate the model and calculate metrics
def evaluate_model(model, X_test, y_test, X_train, y_train, cv_folds=5):
    """
    Evaluate the model using various metrics: R², RMSE, MAE, etc.
    
    Parameters:
        model (estimator): The trained model.
        X_test (ndarray): Test feature data.
        y_test (ndarray): Test target data.
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        metrics (dict): Calculated metrics.
    """
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    
    y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    mean_value = np.mean(y_test)
    std_dev = np.std(y_test)
    cv_mean = cv_scores.mean()
    cv_std_dev = cv_scores.std()

    metrics = {
        "R^2 Score": r2,
        "RMSE": rmse,
        "MAE": mae,
        "Mean of Output": mean_value,
        "Standard Deviation of Output": std_dev,
        "Cross-Validation Scores": cv_scores,
        "Cross-Validation Mean": cv_mean,
        "Cross-Validation Std Dev": cv_std_dev
    }
    
    return metrics

# Function to print results in a readable format
def print_results(metrics):
    for metric, value in metrics.items():
        if isinstance(value, np.ndarray):
            print(f"{metric}: {value[:5]}... (first 5 values)")
        else:
            print(f"{metric}: {value}")

# Main function to run the process
def main():
    file_path = 'gpt2_test_2.csv'
    target_column = 'output'
    
    X, y = load_and_prepare_data(file_path, target_column)
    if X is None or y is None:
        return  # Exit if there was an error loading the data

    X_pca, pca = normalize_and_reduce_dimensionality(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
    
    model = BayesianRidge()
    
    param_dist = {
        'alpha_1': np.logspace(-6, -1, 6),
        'alpha_2': np.logspace(-6, -1, 6),
        'lambda_1': np.logspace(-6, -1, 6),
        'lambda_2': np.logspace(-6, -1, 6)
    }
    
    best_model, best_params = tune_model(model, param_dist, X_train, y_train)
    
    metrics = evaluate_model(best_model, X_test, y_test, X_train, y_train)
    
    print(f"Best Parameters: {best_params}")
    print_results(metrics)

if __name__ == "__main__":
    main()


Best Parameters: {'lambda_2': 0.0001, 'lambda_1': 0.1, 'alpha_2': 0.1, 'alpha_1': 0.001}
R^2 Score: 0.38821758790698147
RMSE: 0.968101526926716
MAE: 0.7995184038566036
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.33302722 0.29569942 0.44906524 0.38284709 0.35967839]
Cross-Validation Mean: 0.36406347110785237
Cross-Validation Std Dev: 0.051447538182813425


# Random Forest Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Function to load and prepare data
def load_and_prepare_data(file_path, target_column):
    """
    Load and prepare the dataset by separating features and target.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): The target column name in the dataset.
        
    Returns:
        X (ndarray): Features data.
        y (ndarray): Target data.
    """
    try:
        df = pd.read_csv(file_path)
        X = df.drop(columns=[target_column])
        y = df[target_column]
        return X, y
    except Exception as e:
        print(f"Error loading data: {e}")
        return None, None

# Function to standardize the data and apply PCA
def standardize_and_reduce_dimensionality(X, variance_threshold=0.95):
    """
    Standardize the features and apply PCA for dimensionality reduction.
    
    Parameters:
        X (ndarray): Feature data.
        variance_threshold (float): The proportion of variance to retain with PCA.
        
    Returns:
        X_pca (ndarray): PCA-transformed feature data.
        pca (PCA object): Fitted PCA model to access explained variance.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    pca = PCA(n_components=variance_threshold)
    X_pca = pca.fit_transform(X_scaled)
    
    print(f"Explained variance ratio by components: {pca.explained_variance_ratio_}")
    print(f"Total variance retained: {np.sum(pca.explained_variance_ratio_)}")
    
    return X_pca, pca

# Function to perform RandomizedSearchCV and return the best model
def tune_model(model, param_dist, X_train, y_train, cv_folds=3):
    """
    Tune the model using RandomizedSearchCV.
    
    Parameters:
        model (estimator): The model to tune.
        param_dist (dict): Hyperparameter distribution for RandomizedSearchCV.
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        best_model (estimator): The best model found by RandomizedSearchCV.
    """
    random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, cv=cv_folds, random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    return random_search.best_estimator_, random_search.best_params_

# Function to evaluate the model and calculate performance metrics
def evaluate_model(model, X_test, y_test, X_train, y_train, cv_folds=3):
    """
    Evaluate the model using various metrics: R², RMSE, MAE, etc.
    
    Parameters:
        model (estimator): The trained model.
        X_test (ndarray): Test feature data.
        y_test (ndarray): Test target data.
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        metrics (dict): Calculated metrics.
    """
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf)
    
    y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    output_mean = np.mean(y_test)
    output_std_dev = np.std(y_test)
    cv_mean = cv_scores.mean()
    cv_std_dev = cv_scores.std()

    metrics = {
        "R^2 Score": r2,
        "RMSE": rmse,
        "MAE": mae,
        "Mean of Output": output_mean,
        "Standard Deviation of Output": output_std_dev,
        "Cross-Validation Scores": cv_scores,
        "Cross-Validation Mean": cv_mean,
        "Cross-Validation Std Dev": cv_std_dev
    }
    
    return metrics

# Function to print results in a readable format
def print_results(metrics):
    for metric, value in metrics.items():
        if isinstance(value, np.ndarray):
            print(f"{metric}: {value[:5]}... (first 5 values)")
        else:
            print(f"{metric}: {value}")

# Main function to run the entire process
def main():
    file_path = 'gpt2_test_2.csv'
    target_column = 'output'
    
    X, y = load_and_prepare_data(file_path, target_column)
    if X is None or y is None:
        return  # Exit if there was an error loading the data

    X_pca, pca = standardize_and_reduce_dimensionality(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
    
    model = RandomForestRegressor()
    
    param_dist = {
        'n_estimators': [10, 50],  # Reduced number of estimators
        'max_depth': [None, 10],  # Reduced max_depth options
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True]  # Keeping only one option for bootstrap
    }
    
    best_model, best_params = tune_model(model, param_dist, X_train, y_train)
    
    metrics = evaluate_model(best_model, X_test, y_test, X_train, y_train)
    
    print(f"Best Parameters: {best_params}")
    print_results(metrics)

if __name__ == "__main__":
    main()


Best Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': True}
R^2 Score: 0.5138861494944564
RMSE: 0.8629614769845486
MAE: 0.6788114273999883
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.41714592 0.44639004 0.49721016]
Cross-Validation Mean: 0.45358203813384307
Cross-Validation Std Dev: 0.03307934109968348


## Gausian Process regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

# Function to load and prepare data
def load_and_prepare_data(file_path, target_column):
    """
    Load and prepare the dataset by separating features and target.
    
    Parameters:
        file_path (str): Path to the dataset file.
        target_column (str): The target column name in the dataset.
        
    Returns:
        X (ndarray): Features data.
        y (ndarray): Target data.
    """
    try:
        df = pd.read_csv(file_path)
        X = df.drop(columns=[target_column])
        y = df[target_column]
        return X, y
    except Exception as e:
        print(f"Error loading data: {e}")
        return None, None

# Function to standardize the data and apply PCA
def standardize_and_reduce_dimensionality(X, variance_threshold=0.95):
    """
    Standardize the features and apply PCA for dimensionality reduction.
    
    Parameters:
        X (ndarray): Feature data.
        variance_threshold (float): The proportion of variance to retain with PCA.
        
    Returns:
        X_pca (ndarray): PCA-transformed feature data.
        pca (PCA object): Fitted PCA model to access explained variance.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    pca = PCA(n_components=variance_threshold)
    X_pca = pca.fit_transform(X_scaled)
    
    print(f"Explained variance ratio by components: {pca.explained_variance_ratio_}")
    print(f"Total variance retained: {np.sum(pca.explained_variance_ratio_)}")
    
    return X_pca, pca

# Function to define and fit the Gaussian Process Regression model
def fit_gaussian_process(X_train, y_train):
    """
    Define and fit the Gaussian Process Regression model.
    
    Parameters:
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        
    Returns:
        model (GaussianProcessRegressor): Trained Gaussian Process model.
    """
    kernel = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))  # Constant * Radial Basis Function
    model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)
    model.fit(X_train, y_train)
    return model

# Function to perform cross-validation
def cross_validate_model(model, X_train, y_train, cv_folds=3):
    """
    Perform cross-validation on the trained model.
    
    Parameters:
        model (estimator): Trained model.
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        cv_scores (ndarray): Cross-validation scores.
    """
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2')
    return cv_scores

# Function to evaluate the model and calculate performance metrics
def evaluate_model(model, X_test, y_test, X_train, y_train, cv_folds=3):
    """
    Evaluate the model using various metrics: R², RMSE, MAE, etc.
    
    Parameters:
        model (estimator): The trained model.
        X_test (ndarray): Test feature data.
        y_test (ndarray): Test target data.
        X_train (ndarray): Training feature data.
        y_train (ndarray): Training target data.
        cv_folds (int): Number of cross-validation folds.
        
    Returns:
        metrics (dict): Calculated metrics.
    """
    cv_scores = cross_validate_model(model, X_train, y_train, cv_folds)
    y_pred = model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    output_mean = np.mean(y_test)
    output_std_dev = np.std(y_test)
    cv_mean = cv_scores.mean()
    cv_std_dev = cv_scores.std()

    metrics = {
        "R^2 Score": r2,
        "RMSE": rmse,
        "MAE": mae,
        "Mean of Output": output_mean,
        "Standard Deviation of Output": output_std_dev,
        "Cross-Validation Scores": cv_scores,
        "Cross-Validation Mean": cv_mean,
        "Cross-Validation Std Dev": cv_std_dev
    }
    
    return metrics

# Function to print results in a readable format
def print_results(metrics):
    for metric, value in metrics.items():
        if isinstance(value, np.ndarray):
            print(f"{metric}: {value[:5]}... (first 5 values)")
        else:
            print(f"{metric}: {value}")

# Main function to run the entire process
def main():
    file_path = 'gpt2_test_2.csv'
    target_column = 'output'
    
    X, y = load_and_prepare_data(file_path, target_column)
    if X is None or y is None:
        return  # Exit if there was an error loading the data

    X_pca, pca = standardize_and_reduce_dimensionality(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)
    
    model = fit_gaussian_process(X_train, y_train)
    
    metrics = evaluate_model(model, X_test, y_test, X_train, y_train)
    
    print(f"Kernel Used: {model.kernel_}")
    print_results(metrics)

if __name__ == "__main__":
    main()


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


Kernel Used: 3.06**2 * RBF(length_scale=8.07)
R^2 Score: 0.20502505929057968
RMSE: 1.7547210995549751
MAE: 1.0603733358612224
Mean of Output: 1.9384615384615385
Standard Deviation of Output: 1.968028479131986
Cross-Validation Scores: [-0.03958691  0.33522902 -0.52688427]
Cross-Validation Mean: -0.07708072118671372
Cross-Validation Std Dev: 0.3529534185213218


## Support vector regressor

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR

# Load the dataset
df = pd.read_csv('gpt2_test_2.csv')

# Features and target
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Define SVR model
svr = SVR()

# Parameter tuning using GridSearchCV
param_grid = {
    'kernel': ['linear', 'rbf'],  # Linear and RBF kernels
    'C': [0.1, 1, 10],  # Regularization parameter
    'epsilon': [0.1, 0.2, 0.5]  # Epsilon in the epsilon-tube
}

grid_search = GridSearchCV(svr, param_grid, cv=3, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)
cv_scores = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=kf, scoring='r2')

# Predictions
y_pred = grid_search.best_estimator_.predict(X_test)

# Performance metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
output_mean = np.mean(y_test)
output_std_dev = np.std(y_test)
cv_mean = cv_scores.mean()
cv_std_dev = cv_scores.std()

# Print results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Mean of Output: {output_mean}")
print(f"Standard Deviation of Output: {output_std_dev}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean: {cv_mean}")
print(f"Cross-Validation Std Dev: {cv_std_dev}")


Best Parameters: {'C': 10, 'epsilon': 0.5, 'kernel': 'rbf'}
R^2 Score: 0.4499167606357296
RMSE: 1.4596394244666246
MAE: 1.1750491433636607
Mean of Output: 1.9384615384615385
Standard Deviation of Output: 1.968028479131986
Cross-Validation Scores: [0.34468544 0.28100373 0.26212836]
Cross-Validation Mean: 0.2959391768193105
Cross-Validation Std Dev: 0.03531966894150644
