In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# List of regression models up to Decision Tree
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('KNN', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor())
]

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with 10 folds
    cv_scores_rmse = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=10, scoring='r2')

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Hyperparameter tuning using GridSearchCV for the selected models
param_grids = {
    'Linear Regression': {},  # No hyperparameters for linear regression
    'Ridge Regression': {'alpha': [0.1, 1, 10, 100]},
    'Lasso Regression': {'alpha': [0.1, 1, 10]},
    'KNN': {'n_neighbors': [3, 5, 10, 15], 'weights': ['uniform', 'distance']},
    'Decision Tree': {'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10]}
}

# Perform hyperparameter tuning and evaluation for each model
for name, model in models:
    print(f"Hyperparameter tuning for {name}...")
    param_grid = param_grids.get(name, {})

    # Skip models with no parameters to tune
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X, y)
        best_model = grid_search.best_estimator_
        print(f"Best {name} model: {grid_search.best_params_}")
    else:
        best_model = model

    # Evaluate the model
    rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X, y)

    print(f"CV Mean RMSE: {rmse_mean}, CV RMSE Std: {rmse_std}")
    print(f"CV Mean R2: {r2_mean}, CV R2 Std: {r2_std}\n")


Hyperparameter tuning for Linear Regression...
CV Mean RMSE: 0.8653085542331802, CV RMSE Std: 0.46879283271209293
CV Mean R2: 0.5901627594638144, CV R2 Std: 0.3367900909286893

Hyperparameter tuning for Ridge Regression...
Best Ridge Regression model: {'alpha': 100}
CV Mean RMSE: 0.26286051543572464, CV RMSE Std: 0.051213021815300214
CV Mean R2: 0.886753553352975, CV R2 Std: 0.01932515888041925

Hyperparameter tuning for Lasso Regression...
Best Lasso Regression model: {'alpha': 0.1}
CV Mean RMSE: 0.5124258169772276, CV RMSE Std: 0.10390199816102748
CV Mean R2: 0.7808041454578976, CV R2 Std: 0.03322863547599528

Hyperparameter tuning for KNN...
Best KNN model: {'n_neighbors': 10, 'weights': 'distance'}
CV Mean RMSE: 0.3605547823079171, CV RMSE Std: 0.23021319749447408
CV Mean R2: 0.8262177045452177, CV R2 Std: 0.16251222374733726

Hyperparameter tuning for Decision Tree...
Best Decision Tree model: {'max_depth': 20, 'min_samples_split': 2}
CV Mean RMSE: 0.5057582384424929, CV RMSE Std:

In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# XGBoost model
model = XGBRegressor(random_state=42, n_jobs=-1)

# Reduced hyperparameter tuning grid (fewer combinations)
param_grid = {
    'n_estimators': [100, 150],               # Fewer estimators
    'learning_rate': [0.01, 0.1],             # Limited learning rates
    'max_depth': [3, 5],                       # Shallower trees
    'subsample': [0.7, 0.8],                   # Slightly lower subsample values
    'colsample_bytree': [0.8, 1.0]             # Adjusted column sampling
}

# Hyperparameter tuning and evaluation with fewer cross-validation folds
print(f"Hyperparameter tuning for XGBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced CV folds
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(f"Best XGBoost model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with fewer folds
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Reduced folds
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')  # Reduced folds

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Evaluate the best model
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X, y)

print(f"CV Mean RMSE: {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R2: {r2_mean}, CV R2 Std: {r2_std}\n")


Hyperparameter tuning for XGBoost...
Best XGBoost model: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.7}
CV Mean RMSE: 0.20369034762719052, CV RMSE Std: 0.045111077488425834
CV Mean R2: 0.9128446817398072, CV R2 Std: 0.025940598537425915



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# AdaBoost model
model = AdaBoostRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100],  # Reduced options for faster execution
    'learning_rate': [0.01, 0.1],  # Reduced learning rate options
    'loss': ['linear', 'square']  # Reduced loss functions for faster testing
}

# Hyperparameter tuning and evaluation
print(f"Hyperparameter tuning for AdaBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced cv folds to speed up
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(f"Best AdaBoost model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with 5 folds to speed up (instead of 10)
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Evaluate the best model
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X, y)

print(f"CV Mean RMSE: {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R2: {r2_mean}, CV R2 Std: {r2_std}\n")


Hyperparameter tuning for AdaBoost...
Best AdaBoost model: {'learning_rate': 0.1, 'loss': 'square', 'n_estimators': 100}
CV Mean RMSE: 0.4930006583902896, CV RMSE Std: 0.03842519655156016
CV Mean R2: 0.7909816830160323, CV R2 Std: 0.029356583648235506



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Gradient Boosting model
model = GradientBoostingRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100],  # Reduced options for faster execution
    'learning_rate': [0.01, 0.1],  # Reduced learning rate options
    'max_depth': [3, 5],  # Reduced depth for faster training
    'subsample': [0.7, 0.8]  # Reduced subsample options
}

# Hyperparameter tuning and evaluation
print(f"Hyperparameter tuning for Gradient Boosting...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced cv folds to speed up
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(f"Best Gradient Boosting model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with 5 folds to speed up (instead of 10)
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Evaluate the best model
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X, y)

print(f"CV Mean RMSE: {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R2: {r2_mean}, CV R2 Std: {r2_std}\n")


Hyperparameter tuning for Gradient Boosting...
Best Gradient Boosting model: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}
CV Mean RMSE: 0.2152094033469004, CV RMSE Std: 0.04459764028153108
CV Mean R2: 0.907941296125659, CV R2 Std: 0.026326853456473676



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Random Forest model
model = RandomForestRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],  # Reduced estimators for faster execution
    'max_depth': [None, 10, 20],  # Reduced depth options
    'min_samples_split': [2, 5],  # Limited to lower values for faster computation
    'min_samples_leaf': [1, 2],   # Limited leaf size options
    'bootstrap': [True]           # No need to test False for bootstrap
}

# Hyperparameter tuning and evaluation
print(f"Hyperparameter tuning for Random Forest...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced cv folds to speed up
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(f"Best Random Forest model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with 5 folds to speed up (instead of 10)
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Evaluate the best model
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X, y)

print(f"CV Mean RMSE: {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R2: {r2_mean}, CV R2 Std: {r2_std}\n")


Hyperparameter tuning for Random Forest...
Best Random Forest model: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
CV Mean RMSE: 0.2507288136334071, CV RMSE Std: 0.060153943144596204
CV Mean R2: 0.8924450121396921, CV R2 Std: 0.03486003338295441



In [6]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# SVR model
model = SVR()

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],  # Reduced values for faster training
    'kernel': ['linear', 'rbf'],  # Focus on common kernels
    'gamma': ['scale'],  # Reduced gamma values (common choice)
}

# Hyperparameter tuning and evaluation
print(f"Hyperparameter tuning for SVR...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced cv folds to speed up
grid_search.fit(X, y)
best_model = grid_search.best_estimator_
print(f"Best SVR model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with 5 folds to speed up (instead of 10)
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Evaluate the best model
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X, y)

print(f"CV Mean RMSE: {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R2: {r2_mean}, CV R2 Std: {r2_std}\n")


Hyperparameter tuning for SVR...
Best SVR model: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
CV Mean RMSE: 0.16983354017960037, CV RMSE Std: 0.04492317753261408
CV Mean R2: 0.9291665811010639, CV R2 Std: 0.015427305885617137

