# Linear Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset (assuming it's an Excel file as provided)
df = pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Display basic info to ensure the data is loaded correctly
print(df.head())
print(df.columns)

# Separate the features (embeddings) and target (output)
X = df.drop(columns=['output']).values
y = df['output'].values

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Linear Regression model
model = LinearRegression()

# Define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform Grid Search (though linear regression doesn't have hyperparameters, this is for extensibility)
grid_params = {}  # No hyperparameters for basic LinearRegression
grid_search = GridSearchCV(model, grid_params, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)

# Get the best estimator
best_model = grid_search.best_estimator_

# To speed up computation, let's reduce the dataset size by sampling 500 rows
df_sample = df.sample(n=500, random_state=42)
X_sample = df_sample.drop(columns=['output']).values
y_sample = df_sample['output'].values

# Normalize the sample data
X_sample_scaled = scaler.fit_transform(X_sample)

# Perform Grid Search and Cross Validation on the sample
grid_search.fit(X_sample_scaled, y_sample)

# Get the best estimator for the sample
best_model_sample = grid_search.best_estimator_

# Perform cross-validation
mse_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='neg_mean_squared_error')
r2_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='r2')

# Calculate metrics for the sample
mse_sample = -mse_scores_sample.mean()  # MSE (convert from negative MSE)
rmse_sample = np.sqrt(mse_sample)       # RMSE
r2_sample = r2_scores_sample.mean()     # R²
std_dev_sample = np.std(mse_scores_sample)  # Standard deviation of MSE

# Output the metrics
print(f"MSE: {mse_sample}")
print(f"RMSE: {rmse_sample}")
print(f"R²: {r2_sample}")
print(f"Standard Deviation of MSE: {std_dev_sample}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0 -0.002268 -0.057392 -0.051938 -0.254064 -0.107042 -0.005746  0.053339   
1 -0.095116  0.002254 -0.080440 -0.032033  0.325927 -0.089124  0.060066   
2  0.171062 -0.006862 -0.092904  0.191909 -0.059166  0.084494  0.057149   
3  0.091722 -0.023084 -0.170847  0.275267 -0.098605 -0.062822  0.026235   
4  0.058596 -0.003751 -0.051466  0.243470  0.019038 -0.036104 -0.019322   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.155956  0.116585 -0.038332  ...   0.072443   0.059975  -0.076614   
1  0.011424  0.037144 -0.126206  ...   0.060893  -0.161714  -0.016327   
2 -0.057770  0.018334  0.042601  ...  -0.011123   0.010387   0.074528   
3 -0.048630 -0.013406 -0.024388  ...  -0.104926   0.042281   0.022375   
4 -0.011959 -0.054584 -0.058981  ...  -0.159260   0.032556  -0.005767   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   0.036943   0.130576   0.4098

# Ridge Regression

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset (assuming it's an Excel file as provided)
df = pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Display basic info to ensure the data is loaded correctly
print(df.head())
print(df.columns)

# Separate the features (embeddings) and target (output)
X = df.drop(columns=['output']).values
y = df['output'].values

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Ridge Regression model
model = Ridge()

# Define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform Grid Search for hyperparameter tuning (regularization strength 'alpha')
grid_params = {'alpha': [0.01, 0.1, 1, 10, 100]}  # You can add more values if needed
grid_search = GridSearchCV(model, grid_params, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)

# Get the best estimator
best_model = grid_search.best_estimator_

# To speed up computation, let's reduce the dataset size by sampling 500 rows
df_sample = df.sample(n=500, random_state=42)
X_sample = df_sample.drop(columns=['output']).values
y_sample = df_sample['output'].values

# Normalize the sample data
X_sample_scaled = scaler.fit_transform(X_sample)

# Perform Grid Search and Cross Validation on the sample
grid_search.fit(X_sample_scaled, y_sample)

# Get the best estimator for the sample
best_model_sample = grid_search.best_estimator_

# Perform cross-validation
mse_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='neg_mean_squared_error')
r2_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='r2')

# Calculate metrics for the sample
mse_sample = -mse_scores_sample.mean()  # MSE (convert from negative MSE)
rmse_sample = np.sqrt(mse_sample)       # RMSE
r2_sample = r2_scores_sample.mean()     # R²
std_dev_sample = np.std(mse_scores_sample)  # Standard deviation of MSE

# Output the metrics
print(f"Best alpha (regularization strength): {grid_search.best_params_['alpha']}")
print(f"MSE: {mse_sample}")
print(f"RMSE: {rmse_sample}")
print(f"R²: {r2_sample}")
print(f"Standard Deviation of MSE: {std_dev_sample}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0 -0.002268 -0.057392 -0.051938 -0.254064 -0.107042 -0.005746  0.053339   
1 -0.095116  0.002254 -0.080440 -0.032033  0.325927 -0.089124  0.060066   
2  0.171062 -0.006862 -0.092904  0.191909 -0.059166  0.084494  0.057149   
3  0.091722 -0.023084 -0.170847  0.275267 -0.098605 -0.062822  0.026235   
4  0.058596 -0.003751 -0.051466  0.243470  0.019038 -0.036104 -0.019322   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.155956  0.116585 -0.038332  ...   0.072443   0.059975  -0.076614   
1  0.011424  0.037144 -0.126206  ...   0.060893  -0.161714  -0.016327   
2 -0.057770  0.018334  0.042601  ...  -0.011123   0.010387   0.074528   
3 -0.048630 -0.013406 -0.024388  ...  -0.104926   0.042281   0.022375   
4 -0.011959 -0.054584 -0.058981  ...  -0.159260   0.032556  -0.005767   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   0.036943   0.130576   0.4098

# Lasso Regression

In [8]:
'''Lasso Regression'''
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset (assuming it's an Excel file as provided)
df = pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Display basic info to ensure the data is loaded correctly
print(df.head())
print(df.columns)

# Separate the features (embeddings) and target (output)
X = df.drop(columns=['output']).values
y = df['output'].values

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Lasso Regression model
model = Lasso()

# Define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform Grid Search for hyperparameter tuning (regularization strength 'alpha')
grid_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}  # You can add more values if needed
grid_search = GridSearchCV(model, grid_params, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)

# Get the best estimator
best_model = grid_search.best_estimator_

# To speed up computation, let's reduce the dataset size by sampling 500 rows
df_sample = df.sample(n=500, random_state=42)
X_sample = df_sample.drop(columns=['output']).values
y_sample = df_sample['output'].values

# Normalize the sample data
X_sample_scaled = scaler.fit_transform(X_sample)

# Perform Grid Search and Cross Validation on the sample
grid_search.fit(X_sample_scaled, y_sample)

# Get the best estimator for the sample
best_model_sample = grid_search.best_estimator_

# Perform cross-validation
mse_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='neg_mean_squared_error')
r2_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='r2')

# Calculate metrics for the sample
mse_sample = -mse_scores_sample.mean()  # MSE (convert from negative MSE)
rmse_sample = np.sqrt(mse_sample)       # RMSE
r2_sample = r2_scores_sample.mean()     # R²
std_dev_sample = np.std(mse_scores_sample)  # Standard deviation of MSE

# Output the metrics
print(f"Best alpha (regularization strength): {grid_search.best_params_['alpha']}")
print(f"MSE: {mse_sample}")
print(f"RMSE: {rmse_sample}")
print(f"R²: {r2_sample}")
print(f"Standard Deviation of MSE: {std_dev_sample}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0 -0.002268 -0.057392 -0.051938 -0.254064 -0.107042 -0.005746  0.053339   
1 -0.095116  0.002254 -0.080440 -0.032033  0.325927 -0.089124  0.060066   
2  0.171062 -0.006862 -0.092904  0.191909 -0.059166  0.084494  0.057149   
3  0.091722 -0.023084 -0.170847  0.275267 -0.098605 -0.062822  0.026235   
4  0.058596 -0.003751 -0.051466  0.243470  0.019038 -0.036104 -0.019322   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.155956  0.116585 -0.038332  ...   0.072443   0.059975  -0.076614   
1  0.011424  0.037144 -0.126206  ...   0.060893  -0.161714  -0.016327   
2 -0.057770  0.018334  0.042601  ...  -0.011123   0.010387   0.074528   
3 -0.048630 -0.013406 -0.024388  ...  -0.104926   0.042281   0.022375   
4 -0.011959 -0.054584 -0.058981  ...  -0.159260   0.032556  -0.005767   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   0.036943   0.130576   0.4098

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best alpha (regularization strength): 0.01
MSE: 0.9328511603607741
RMSE: 0.9658422026194414
R²: 0.33879625535420954
Standard Deviation of MSE: 0.15535334313966961


# KNN

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset from Excel file
data= pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Separate features (embeddings) and output
X = data.drop(columns=['output'])
y = data['output']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Set up the K-Nearest Neighbors Regressor
knn = KNeighborsRegressor()

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

# Set up the grid search with 10-fold cross-validation
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error', verbose=3)

# Perform grid search using scaled data
grid_search.fit(X_train_scaled, y_train)

# Get the best estimator from the grid search
best_knn = grid_search.best_estimator_

# Evaluate on the test set
y_pred_test = best_knn.predict(X_test_scaled)

# Calculate MSE and RMSE on the test set
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

# Calculate standard deviation of predictions on the test set
std_dev_test = np.std(y_pred_test)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'Mean Squared Error (MSE) on Test Set: {mse_test:.4f}')
print(f'Root Mean Squared Error (RMSE) on Test Set: {rmse_test:.4f}')
print(f'Standard Deviation of Predictions on Test Set: {std_dev_test:.4f}')

# Calculate R² value on the test set
r2_test = r2_score(y_test, y_pred_test)
print(f"R² value on Test Set: {r2_test:.4f}")

# 10-Fold Cross-Validation
cv_scores = cross_val_score(best_knn, scaler.fit_transform(X), y, cv=10, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
cv_mse = -cv_scores  # This gives you the MSE scores

print(f'Cross-Validation MSE: {cv_mse.mean():.4f} ± {cv_mse.std():.4f}')
print(f'Cross-Validation RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}')


Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV 1/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.767 total time=   0.4s
[CV 2/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.820 total time=   0.0s
[CV 3/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.942 total time=   0.0s
[CV 4/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-1.047 total time=   0.0s
[CV 5/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-1.235 total time=   0.0s
[CV 6/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.761 total time=   0.1s
[CV 7/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.722 total time=   0.0s
[CV 8/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.711 total time=   0.0s
[CV 9/10] END algorithm=auto, n_neighbors=3, p=1, weights=uniform;, score=-0.750 total time=   0.0s
[CV 10/10] END algorithm=auto, n_neig

# Polynomial Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset (replace 'gpt2_embeddings_input_output_custom_columns.csv' with the actual file)
data= pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Separate features (embeddings) and output (marks)
X = data.drop(columns=['output'])
y = data['output']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with PolynomialFeatures and LinearRegression
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features
    ('poly', PolynomialFeatures()),  # Polynomial features transformation
    ('linear', LinearRegression())  # Linear regression model
])

# Define the parameter grid for GridSearchCV (removed 'normalize' parameter)
param_grid = {
    'poly__degree': [2, 3, 4],  # Different degrees of polynomial features
    'linear__fit_intercept': [True, False]  # Fit intercept or not
}

# Set up 10-fold cross-validation
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=2)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Calculate R²
r2 = r2_score(y_test, y_pred)
# Calculate the standard deviation of the predictions
std_dev = np.std(y_pred)

# Print the results
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Root Mean Squared Error (RMSE) on Test Set: {rmse}')
print(f'R² value on Test Set: {r2}')
print(f'Standard Deviation of Predictions: {std_dev}')


Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time= 1.1min
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  58.9s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  55.1s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  55.9s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  55.2s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  47.5s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  48.6s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  47.4s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  52.3s
[CV] END .........linear__fit_intercept=True, poly__degree=2; total time=  51.4s
[CV] END .........linear__fit_intercept=True, poly__degree=3; total time=   0.0s
[CV] END .........linear__fit_intercept=True, po

40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-pa

Best Parameters: {'linear__fit_intercept': True, 'poly__degree': 2}
Root Mean Squared Error (RMSE) on Test Set: 77570434032.98264
R² value on Test Set: -3.927784212579305e+21
Standard Deviation of Predictions: 76796804149.78


# Logistic Regression

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load dataset (replace with the actual file or DataFrame)
# Assuming the dataset is loaded as a pandas DataFrame 'df'
df= pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Drop 'output' column for X, and separate it as y
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets (optional for final evaluation)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Grid Search for hyperparameter tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l2'],  # l2 regularization for Logistic Regression
}

# Perform 10-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best parameters found by Grid Search
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)
print("R^2 Score: ", r2)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)

# Calculate the standard deviation of the predictions
std_dev = np.std(y_pred)

# Cross-validation RMSE
cross_val_scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
cv_rmse = np.mean(np.sqrt(-cross_val_scores))
print("Cross-validated RMSE: ", cv_rmse)
print(f'Standard Deviation of Predictions: {std_dev}')


Best parameters found:  {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
R^2 Score:  0.352054964980268
RMSE:  0.9963031296745293
Cross-validated RMSE:  0.9779680670303673
Standard Deviation of Predictions: 1.1961071190749342


# Linear Regression

In [5]:
'''Linear Regression'''
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Load your dataset (assuming it's an Excel file as provided)
df= pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')


# Display basic info to ensure the data is loaded correctly
print(df.head())
print(df.columns)

# Separate the features (embeddings) and target (output)
X = df.drop(columns=['output']).values
y = df['output'].values

# Normalize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the Linear Regression model
model = LinearRegression()

# Define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform Grid Search (though linear regression doesn't have hyperparameters, this is for extensibility)
grid_params = {}  # No hyperparameters for basic LinearRegression
grid_search = GridSearchCV(model, grid_params, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)

# Get the best estimator
best_model = grid_search.best_estimator_

# To speed up computation, let's reduce the dataset size by sampling 500 rows
df_sample = df.sample(n=500, random_state=42)
X_sample = df_sample.drop(columns=['output']).values
y_sample = df_sample['output'].values

# Normalize the sample data
X_sample_scaled = scaler.fit_transform(X_sample)

# Perform Grid Search and Cross Validation on the sample
grid_search.fit(X_sample_scaled, y_sample)

# Get the best estimator for the sample
best_model_sample = grid_search.best_estimator_

# Perform cross-validation
mse_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='neg_mean_squared_error')
r2_scores_sample = cross_val_score(best_model_sample, X_sample_scaled, y_sample, cv=kf, scoring='r2')

# Calculate metrics for the sample
mse_sample = -mse_scores_sample.mean()  # MSE (convert from negative MSE)
rmse_sample = np.sqrt(mse_sample)       # RMSE
r2_sample = r2_scores_sample.mean()     # R²
std_dev_sample = np.std(mse_scores_sample)  # Standard deviation of MSE

# Output the metrics
print(f"MSE: {mse_sample}")
print(f"RMSE: {rmse_sample}")
print(f"R²: {r2_sample}")
print(f"Standard Deviation of MSE: {std_dev_sample}")


    embed_0   embed_1   embed_2   embed_3   embed_4   embed_5   embed_6  \
0 -0.002268 -0.057392 -0.051938 -0.254064 -0.107042 -0.005746  0.053339   
1 -0.095116  0.002254 -0.080440 -0.032033  0.325927 -0.089124  0.060066   
2  0.171062 -0.006862 -0.092904  0.191909 -0.059166  0.084494  0.057149   
3  0.091722 -0.023084 -0.170847  0.275267 -0.098605 -0.062822  0.026235   
4  0.058596 -0.003751 -0.051466  0.243470  0.019038 -0.036104 -0.019322   

    embed_7   embed_8   embed_9  ...  embed_759  embed_760  embed_761  \
0  0.155956  0.116585 -0.038332  ...   0.072443   0.059975  -0.076614   
1  0.011424  0.037144 -0.126206  ...   0.060893  -0.161714  -0.016327   
2 -0.057770  0.018334  0.042601  ...  -0.011123   0.010387   0.074528   
3 -0.048630 -0.013406 -0.024388  ...  -0.104926   0.042281   0.022375   
4 -0.011959 -0.054584 -0.058981  ...  -0.159260   0.032556  -0.005767   

   embed_762  embed_763  embed_764  embed_765  embed_766  embed_767  output  
0   0.036943   0.130576   0.4098

# Decision Tree Regression

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold

# Load the dataset
df= pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Features and target
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimension reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Decision Tree Regression model
model = DecisionTreeRegressor()

# Parameter tuning using RandomizedSearchCV
param_dist = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(1, 5),
    'max_features': ['auto', 'sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(random_search.best_estimator_, X_train, y_train, cv=kf)

# Predictions
y_pred = random_search.best_estimator_.predict(X_test)

# Performance metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mean_value = np.mean(y_test)
std_dev = np.std(y_test)

# Print results
print(f"Best Parameters: {random_search.best_params_}")
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Mean of Output: {mean_value}")
print(f"Standard Deviation of Output: {std_dev}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean: {cv_scores.mean()}")
print(f"Cross-Validation Std Dev: {cv_scores.std()}")

Best Parameters: {'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 20}
R^2 Score: 0.27132947089194637
RMSE: 1.0565450358948874
MAE: 0.682410824108241
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [-0.06341399 -0.05463411  0.03253953 -0.08660526 -0.13790409]
Cross-Validation Mean: -0.062003585403908226
Cross-Validation Std Dev: 0.055427119347098035


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise Inva

# Bayesian Regression

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold

# Load the dataset
df= pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Features and target
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimension reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Bayesian Ridge Regression model
model = BayesianRidge()

# Parameter tuning using RandomizedSearchCV
param_dist = {
    'alpha_1': np.logspace(-6, -1, 6),
    'alpha_2': np.logspace(-6, -1, 6),
    'lambda_1': np.logspace(-6, -1, 6),
    'lambda_2': np.logspace(-6, -1, 6)
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(random_search.best_estimator_, X_train, y_train, cv=kf)

# Predictions
y_pred = random_search.best_estimator_.predict(X_test)


# Performance metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mean_value = np.mean(y_test)
std_dev = np.std(y_test)

# Print results
print(f"Best Parameters: {random_search.best_params_}")
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Mean of Output: {mean_value}")
print(f"Standard Deviation of Output: {std_dev}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean: {cv_scores.mean()}")
print(f"Cross-Validation Std Dev: {cv_scores.std()}")

Best Parameters: {'lambda_2': 0.0001, 'lambda_1': 0.1, 'alpha_2': 0.1, 'alpha_1': 0.001}
R^2 Score: 0.43316623718084224
RMSE: 0.931859134434971
MAE: 0.7654710621707257
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.39647098 0.37840157 0.45102912 0.44629659 0.41795508]
Cross-Validation Mean: 0.4180306685016288
Cross-Validation Std Dev: 0.028011251967096144


# Random Forest Regression

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

# Load the dataset
df= pd.read_csv('roberta_embeddings_input_output_custom_columns.csv')

# Features and target
X = df.drop(columns=['output'])
y = df['output']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimension reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Random Forest Regression model
model = RandomForestRegressor()

# Parameter tuning using RandomizedSearchCV with fewer options
param_dist = {
    'n_estimators': [10, 50],  # Reduced number of estimators
    'max_depth': [None, 10],  # Reduced max_depth options
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True]  # Keeping only one option for bootstrap
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5, cv=3, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)  # Reduced folds
cv_scores = cross_val_score(random_search.best_estimator_, X_train, y_train, cv=kf)

# Predictions
y_pred = random_search.best_estimator_.predict(X_test)

# Performance metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mean_value = np.mean(y_test)
std_dev = np.std(y_test)

# Print results
print(f"Best Parameters: {random_search.best_params_}")
print(f"R^2 Score: {r2}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Mean of Output: {mean_value}")
print(f"Standard Deviation of Output: {std_dev}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean: {cv_scores.mean()}")
print(f"Cross-Validation Std Dev: {cv_scores.std()}")


Best Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}
R^2 Score: 0.5354746999113791
RMSE: 0.8435815984811348
MAE: 0.6583421484214842
Mean of Output: 3.4022140221402215
Standard Deviation of Output: 1.2377200077817099
Cross-Validation Scores: [0.46013964 0.49983669 0.44386573]
Cross-Validation Mean: 0.46794735299501705
Cross-Validation Std Dev: 0.023507547769574297
