# K-Nearest Neighbors (KNN) Regression

In [12]:
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Load the dataset (replace 'dataset.xlsx' with the actual file)
data = pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')

# Separate features (embeddings) and output
X = data.drop(columns=['output'])
y = data['output']

# Train-test split (optional, useful for additional validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set up the K-Nearest Neighbors Regressor
knn = KNeighborsRegressor()

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Testing various numbers of neighbors
    'weights': ['uniform', 'distance'],  # Weight by distance or uniform
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Different algorithms to compute nearest neighbors
    'p': [1, 2]  # Distance metrics (1: Manhattan, 2: Euclidean)
}

# Set up the grid search with 10-fold cross-validation
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best estimator from the grid search
best_knn = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_knn.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Fit the KNN model
knn.fit(X_train, y_train)

# Predict the test set results
y_pred = knn.predict(X_test)

# Calculate R² value
r2 = r2_score(y_test, y_pred)
# Calculate the standard deviation of the predictions
std_dev = np.std(y_pred)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Root Mean Squared Error (RMSE) on Test Set: {rmse}')
print(f"R² value: {r2}")
print(f'Standard Deviation of Predictions: {std_dev}')


Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.5s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=auto, n_neighbors=3, p=1, weights=distance; total time=   0.1s
[CV] END algorithm=auto, n_neighbors=3, p=1, 

# Polynomial Regression

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset (replace 'bert_embeddings_input_output_custom_columns.xlsx' with the actual file)
data = pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')

# Separate features (embeddings) and output (marks)
X = data.drop(columns=['output'])
y = data['output']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with PolynomialFeatures and LinearRegression
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    ('poly', PolynomialFeatures()),  # Polynomial features transformation
    ('linear', LinearRegression())  # Linear regression model
])

# Define the parameter grid for GridSearchCV (removed 'normalize' parameter)
param_grid = {
    'poly__degree': [2, 3, 4],  # Different degrees of polynomial features
    'linear__fit_intercept': [True, False]  # Fit intercept or not
}

# Set up 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Calculate R²
r2 = r2_score(y_test, y_pred)
# Calculate the standard deviation of the predictions
std_dev = np.std(y_pred)

# Print the results
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Root Mean Squared Error (RMSE) on Test Set: {rmse}')
print(f'R² value on Test Set: {r2}')
print(f'Standard Deviation of Predictions: {std_dev}')


Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  47.0s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  51.9s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  49.3s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  51.8s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  42.2s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  41.1s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  41.7s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  46.5s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  43.7s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  45.3s
[CV] END .........linear__fit_intercept=True, poly__degree=3; total time=   0.0s
[CV] END .........linear__fit_intercept=True, po

40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklea

Best Parameters: {'linear__fit_intercept': False, 'poly__degree': 2}
Root Mean Squared Error (RMSE) on Test Set: 2600922748.5203533
R² value on Test Set: -4.415806999659239e+18
Standard Deviation of Predictions: 2599536849.8859825


# Logistic Regression

In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load dataset (replace with the actual file or DataFrame)
# Assuming the dataset is loaded as a pandas DataFrame 'df'
df = pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')

# Drop 'output' column for X, and separate it as y
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets (optional for final evaluation)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Grid Search for hyperparameter tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l2'],  # l2 regularization for Logistic Regression
}

# Perform 10-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best parameters found by Grid Search
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)
print("R^2 Score: ", r2)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)

# Calculate the standard deviation of the predictions
std_dev = np.std(y_pred)

# Cross-validation RMSE
cross_val_scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
cv_rmse = np.mean(np.sqrt(-cross_val_scores))
print("Cross-validated RMSE: ", cv_rmse)
print(f'Standard Deviation of Predictions: {std_dev}')


Best parameters found:  {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
R^2 Score:  0.4363600810609023
RMSE:  0.929230130100368
Cross-validated RMSE:  0.9688596222791311
Standard Deviation of Predictions: 1.1057434316551316


# Linear Regression

In [31]:
'''Linear Regression'''
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset (assuming it's an Excel file as provided)
df= pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')


# Display basic info to ensure the data is loaded correctly
print(df.head())
print(df.columns)

# Separate the features (embeddings) and target (output)
X = df.drop(columns=['output']).values
y = df['output'].values

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Linear Regression model
model = LinearRegression()

# Define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform Grid Search (though linear regression doesn't have hyperparameters, this is for extensibility)
grid_params = {}  # No hyperparameters for basic LinearRegression
grid_search = GridSearchCV(model, grid_params, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)

# Get the best estimator
best_model = grid_search.best_estimator_

# To speed up computation, let's reduce the dataset size by sampling 500 rows
df_sample = df.sample(n=500, random_state=42)
X_sample = df_sample.drop(columns=['output']).values
y_sample = df_sample['output'].values

# Normalize the sample data
X_sample_scaled = scaler.fit_transform(X_sample)

# Perform Grid Search and Cross Validation on the sample
grid_search.fit(X_sample_scaled, y_sample)

# Get the best estimator for the sample
best_model_sample = grid_search.best_estimator_

# Perform cross-validation
mse_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='neg_mean_squared_error')
r2_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='r2')

# Calculate metrics for the sample
mse_sample = -mse_scores_sample.mean()  # MSE (convert from negative MSE)
rmse_sample = np.sqrt(mse_sample)       # RMSE
r2_sample = r2_scores_sample.mean()     # R²
std_dev_sample = np.std(mse_scores_sample)  # Standard deviation of MSE

# Output the metrics
print(f"MSE: {mse_sample}")
print(f"RMSE: {rmse_sample}")
print(f"R²: {r2_sample}")
print(f"Standard Deviation of MSE: {std_dev_sample}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0  0.445589  0.415333  0.118074  0.202017  0.293542 -0.090634 -0.329375   
1  0.076474 -0.200395  0.185220  0.198175 -0.013897 -0.115137 -0.008887   
2  0.018093 -0.260901  0.498325  0.019231 -0.152991 -0.289522 -0.052139   
3 -0.036732 -0.256312  0.839563  0.082782 -0.107343 -0.260510  0.153343   
4  0.301357  0.222360  0.552620 -0.363340 -0.269042 -0.081171  0.439332   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.540401  0.128381 -0.090013  ...  -0.294877   0.174255  -0.349183   
1  0.263286 -0.105154 -0.296139  ...  -0.068586  -0.040534  -0.186894   
2 -0.038137 -0.218562  0.039249  ...   0.276947  -0.006225  -0.283879   
3 -0.442882 -0.200667  0.136391  ...   0.111054  -0.022571  -0.307118   
4 -0.398349  0.077302  0.175215  ...   0.302719  -0.347563  -0.256054   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0  -0.066742   0.027607  -0.0626

# Ridge Regression

In [32]:
'''Ridge Regression'''
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset (assuming it's an Excel file as provided)
df= pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')

# Display basic info to ensure the data is loaded correctly
print(df.head())
print(df.columns)

# Separate the features (embeddings) and target (output)
X = df.drop(columns=['output']).values
y = df['output'].values

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Ridge Regression model
model = Ridge()

# Define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform Grid Search for hyperparameter tuning (regularization strength 'alpha')
grid_params = {'alpha': [0.01, 0.1, 1, 10, 100]}  # You can add more values if needed
grid_search = GridSearchCV(model, grid_params, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)

# Get the best estimator
best_model = grid_search.best_estimator_

# To speed up computation, let's reduce the dataset size by sampling 500 rows
df_sample = df.sample(n=500, random_state=42)
X_sample = df_sample.drop(columns=['output']).values
y_sample = df_sample['output'].values

# Normalize the sample data
X_sample_scaled = scaler.fit_transform(X_sample)

# Perform Grid Search and Cross Validation on the sample
grid_search.fit(X_sample_scaled, y_sample)

# Get the best estimator for the sample
best_model_sample = grid_search.best_estimator_

# Perform cross-validation
mse_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='neg_mean_squared_error')
r2_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='r2')

# Calculate metrics for the sample
mse_sample = -mse_scores_sample.mean()  # MSE (convert from negative MSE)
rmse_sample = np.sqrt(mse_sample)       # RMSE
r2_sample = r2_scores_sample.mean()     # R²
std_dev_sample = np.std(mse_scores_sample)  # Standard deviation of MSE

# Output the metrics
print(f"Best alpha (regularization strength): {grid_search.best_params_['alpha']}")
print(f"MSE: {mse_sample}")
print(f"RMSE: {rmse_sample}")
print(f"R²: {r2_sample}")
print(f"Standard Deviation of MSE: {std_dev_sample}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0  0.445589  0.415333  0.118074  0.202017  0.293542 -0.090634 -0.329375   
1  0.076474 -0.200395  0.185220  0.198175 -0.013897 -0.115137 -0.008887   
2  0.018093 -0.260901  0.498325  0.019231 -0.152991 -0.289522 -0.052139   
3 -0.036732 -0.256312  0.839563  0.082782 -0.107343 -0.260510  0.153343   
4  0.301357  0.222360  0.552620 -0.363340 -0.269042 -0.081171  0.439332   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.540401  0.128381 -0.090013  ...  -0.294877   0.174255  -0.349183   
1  0.263286 -0.105154 -0.296139  ...  -0.068586  -0.040534  -0.186894   
2 -0.038137 -0.218562  0.039249  ...   0.276947  -0.006225  -0.283879   
3 -0.442882 -0.200667  0.136391  ...   0.111054  -0.022571  -0.307118   
4 -0.398349  0.077302  0.175215  ...   0.302719  -0.347563  -0.256054   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0  -0.066742   0.027607  -0.0626

# Lasso Regression

In [34]:
'''Lasso Regression'''



import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset (assuming it's an Excel file as provided)
df= pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')

# Display basic info to ensure the data is loaded correctly
print(df.head())
print(df.columns)

# Separate the features (embeddings) and target (output)
X = df.drop(columns=['output']).values
y = df['output'].values

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Lasso Regression model
model = Lasso()

# Define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform Grid Search for hyperparameter tuning (regularization strength 'alpha')
grid_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}  # You can add more values if needed
grid_search = GridSearchCV(model, grid_params, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)

# Get the best estimator
best_model = grid_search.best_estimator_

# To speed up computation, let's reduce the dataset size by sampling 500 rows
df_sample = df.sample(n=500, random_state=42)
X_sample = df_sample.drop(columns=['output']).values
y_sample = df_sample['output'].values

# Normalize the sample data
X_sample_scaled = scaler.fit_transform(X_sample)

# Perform Grid Search and Cross Validation on the sample
grid_search.fit(X_sample_scaled, y_sample)

# Get the best estimator for the sample
best_model_sample = grid_search.best_estimator_

# Perform cross-validation
mse_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='neg_mean_squared_error')
r2_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='r2')

# Calculate metrics for the sample
mse_sample = -mse_scores_sample.mean()  # MSE (convert from negative MSE)
rmse_sample = np.sqrt(mse_sample)       # RMSE
r2_sample = r2_scores_sample.mean()     # R²
std_dev_sample = np.std(mse_scores_sample)  # Standard deviation of MSE

# Output the metrics
print(f"Best alpha (regularization strength): {grid_search.best_params_['alpha']}")
print(f"MSE: {mse_sample}")
print(f"RMSE: {rmse_sample}")
print(f"R²: {r2_sample}")
print(f"Standard Deviation of MSE: {std_dev_sample}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0  0.445589  0.415333  0.118074  0.202017  0.293542 -0.090634 -0.329375   
1  0.076474 -0.200395  0.185220  0.198175 -0.013897 -0.115137 -0.008887   
2  0.018093 -0.260901  0.498325  0.019231 -0.152991 -0.289522 -0.052139   
3 -0.036732 -0.256312  0.839563  0.082782 -0.107343 -0.260510  0.153343   
4  0.301357  0.222360  0.552620 -0.363340 -0.269042 -0.081171  0.439332   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.540401  0.128381 -0.090013  ...  -0.294877   0.174255  -0.349183   
1  0.263286 -0.105154 -0.296139  ...  -0.068586  -0.040534  -0.186894   
2 -0.038137 -0.218562  0.039249  ...   0.276947  -0.006225  -0.283879   
3 -0.442882 -0.200667  0.136391  ...   0.111054  -0.022571  -0.307118   
4 -0.398349  0.077302  0.175215  ...   0.302719  -0.347563  -0.256054   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0  -0.066742   0.027607  -0.0626

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best alpha (regularization strength): 0.01
MSE: 0.973829476591962
RMSE: 0.9868279873371864
R²: 0.3103738152553472
Standard Deviation of MSE: 0.22928273282252234


  model = cd_fast.enet_coordinate_descent(


# Decision Tree Regression

In [36]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold

# Load the dataset
df= pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')

# Features and target
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimension reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Decision Tree Regression model
model = DecisionTreeRegressor()

# Parameter tuning using RandomizedSearchCV
param_dist = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(1, 5),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(random_search.best_estimator_, X_train, y_train, cv=kf)

# Predictions
y_pred = random_search.best_estimator_.predict(X_test)

# Performance metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mean_value = np.mean(y_test)
std_dev = np.std(y_test)

# Print results
print(f"Best Parameters: {random_search.best_params_}")
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Mean of Output: {mean_value}")
print(f"Standard Deviation of Output: {std_dev}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean: {cv_scores.mean()}")
print(f"Cross-Validation Std Dev: {cv_scores.std()}")

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Best Parameters: {'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': None, 'max_depth': 40}
R^2 Score: 0.11112566316271821
RMSE: 1.16692406227555
MAE: 0.7923739237392373
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.14842526 0.17621982 0.0073     0.06424243 0.06396931]
Cross-Validation Mean: 0.09203136297474104
Cross-Validation Std Dev: 0.061655614534996156


# Bayesian Regression

In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold

# Load the dataset
df= pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')

# Features and target
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimension reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Bayesian Ridge Regression model
model = BayesianRidge()

# Parameter tuning using RandomizedSearchCV
param_dist = {
    'alpha_1': np.logspace(-6, -1, 6),
    'alpha_2': np.logspace(-6, -1, 6),
    'lambda_1': np.logspace(-6, -1, 6),
    'lambda_2': np.logspace(-6, -1, 6)
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(random_search.best_estimator_, X_train, y_train, cv=kf)

# Predictions
y_pred = random_search.best_estimator_.predict(X_test)


# Performance metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mean_value = np.mean(y_test)
std_dev = np.std(y_test)

# Print results
print(f"Best Parameters: {random_search.best_params_}")
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Mean of Output: {mean_value}")
print(f"Standard Deviation of Output: {std_dev}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean: {cv_scores.mean()}")
print(f"Cross-Validation Std Dev: {cv_scores.std()}")

Best Parameters: {'lambda_2': 0.0001, 'lambda_1': 0.1, 'alpha_2': 0.1, 'alpha_1': 0.001}
R^2 Score: 0.46877192408957424
RMSE: 0.9021171128277098
MAE: 0.7487594495353221
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.4049201  0.38162817 0.39868496 0.45921714 0.41678268]
Cross-Validation Mean: 0.4122466075674997
Cross-Validation Std Dev: 0.026084062382817932


# Random Forest Regression

In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

# Load the dataset
df= pd.read_excel('bert_embeddings_input_output_custom_columns.xlsx')

# Features and target
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimension reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Random Forest Regression model
model = RandomForestRegressor()

# Parameter tuning using RandomizedSearchCV with fewer options
param_dist = {
    'n_estimators': [10, 50],  # Reduced number of estimators
    'max_depth': [None, 10],  # Reduced max_depth options
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True]  # Keeping only one option for bootstrap
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, cv=3, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)  # Reduced folds
cv_scores = cross_val_score(random_search.best_estimator_, X_train, y_train, cv=kf)

# Predictions
y_pred = random_search.best_estimator_.predict(X_test)

# Performance metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mean_value = np.mean(y_test)
std_dev = np.std(y_test)

# Print results
print(f"Best Parameters: {random_search.best_params_}")
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Mean of Output: {mean_value}")
print(f"Standard Deviation of Output: {std_dev}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean: {cv_scores.mean()}")
print(f"Cross-Validation Std Dev: {cv_scores.std()}")


Best Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': True}
R^2 Score: 0.546014009902149
RMSE: 0.8339569585644221
MAE: 0.6424828383998126
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.48054989 0.50423682 0.47427179]
Cross-Validation Mean: 0.48635283077225305
Cross-Validation Std Dev: 0.012903006309018746
