In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='bio.csv', output_file='bio_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to bio_roberta.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='chemistry.csv', output_file='chemistry_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to chemistry_roberta.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='civics.csv', output_file='civics_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to civics_roberta.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='comp.csv', output_file='comp_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to comp_roberta.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='english.csv', output_file='english_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to english_roberta.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='geo.csv', output_file='geo_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to geo_roberta.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='history.csv', output_file='history_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to history_roberta.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='network.csv', output_file='network_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to network_roberta.csv


In [None]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel

def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def load_roberta_model():
    """Load the pre-trained ``roberta-base`` tokenizer and encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, tokenizer first.
    """
    checkpoint = 'roberta-base'
    return (
        RobertaTokenizer.from_pretrained(checkpoint),
        RobertaModel.from_pretrained(checkpoint),
    )

def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Args:
        text: Input passed to ``tokenizer`` (truncated by the tokenizer if needed).
        tokenizer: Callable returning a mapping of PyTorch tensors when called
            with ``return_tensors='pt'``.
        model: Callable accepting those tensors as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.

    Returns:
        A flattened 1-D NumPy array holding the token-averaged embedding.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: without no_grad() every call builds an autograd graph
    # that is immediately thrown away, wasting memory and time on each row.
    with torch.no_grad():
        outputs = model(**inputs)
        # Average token embeddings for simplicity
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.detach().numpy().flatten()

def extract_embeddings(data, column_name, tokenizer, model):
    """Embed every value of ``data[column_name]`` with ``get_roberta_embedding``.

    Each cell is passed through ``str`` first, so non-string values are
    stringified rather than skipped (a NaN cell is embedded as the text 'nan').

    Returns:
        The result of ``Series.apply``: one embedding per row, same index.
    """
    def embed_cell(value):
        return get_roberta_embedding(str(value), tokenizer, model)

    text_column = data[column_name]
    return text_column.apply(embed_cell)

def save_embeddings(embeddings, output_file):
    """Write ``embeddings`` to ``output_file`` as CSV (no index column) and report it."""
    embeddings.to_csv(path_or_buf=output_file, index=False)
    confirmation = f'Embeddings saved to {output_file}'
    print(confirmation)

def main(input_file='algebra.csv', output_file='algebra_roberta.csv',
         text_column='Input', target_column='Output'):
    """Embed one CSV's text column with RoBERTa and write features plus target.

    All arguments default to the values that used to be hard-coded, so
    ``main()`` behaves as before while other datasets can reuse the pipeline.

    Args:
        input_file: CSV to read; must contain ``text_column`` and ``target_column``.
        output_file: Destination CSV for the ``embed_<i>`` columns and ``output``.
        text_column: Column whose values are embedded.
        target_column: Column copied into the result as ``output``.
    """
    # Load data and model
    data = load_data(input_file)
    tokenizer, model = load_roberta_model()

    # One embedding vector per row of the text column
    roberta_input_embeddings = extract_embeddings(data, text_column, tokenizer, model)

    # Let pandas infer the vector width rather than inspecting row 0, which
    # raised IndexError when the input CSV had no rows.
    embeddings_df = pd.DataFrame(roberta_input_embeddings.tolist())
    embeddings_df.columns = [f'embed_{i}' for i in range(embeddings_df.shape[1])]

    # Add the target column under the name 'output'
    embeddings_df['output'] = data[target_column]

    # Save the embeddings to CSV
    save_embeddings(embeddings_df, output_file)

# Run the main function
if __name__ == '__main__':
    main()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa embeddings saved to algebra_roberta.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline


def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame and return it."""
    dataset = pd.read_csv(file_path)
    return dataset


def preprocess_data(data):
    """Split ``data`` into train/test features and targets.

    The ``output`` column is the target and every other column is a feature.
    20% of the rows are held out, with a fixed ``random_state`` of 42.

    Returns:
        ``X_train, X_test, y_train, y_test`` as produced by ``train_test_split``.
    """
    features, target = data.drop(columns=['output']), data['output']
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    return split


def define_models():
    """Build the candidate estimators, keyed by display name.

    The names are also the lookup keys used for ``define_param_grids()``.
    Tree-based models get a fixed ``random_state`` so that repeated runs of
    the notebook report the same metrics.

    Returns:
        dict mapping model name to an unfitted scikit-learn estimator.
    """
    return {
        'Linear Regression': LinearRegression(),
        'Polynomial Regression': Pipeline([
            ('poly', PolynomialFeatures(degree=2)),
            ('linear', LinearRegression())
        ]),
        'Lasso Regression': Lasso(),
        # NOTE(review): LogisticRegression is a classifier; it only fits when
        # `output` holds discrete labels and predicts class labels, so its
        # RMSE/R^2 are not comparable with the regressors. Confirm this is intended.
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Bayesian Linear Regression': BayesianRidge(),
        'Support Vector Regression': SVR(),
        # Seeded: feature/sample selection is otherwise random between runs
        'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
        'Gaussian Process Regression': GaussianProcessRegressor(),
        'Random Forest Regression': RandomForestRegressor(random_state=42),
        'KNN Regression': KNeighborsRegressor()
    }


def define_param_grids():
    """Return the grid-search candidates for the tunable models.

    Keys are model display names; models without an entry are not tuned.
    """
    grids = {}
    grids['Lasso Regression'] = {'alpha': [0.01, 0.1, 1, 10]}
    grids['Support Vector Regression'] = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    grids['Decision Tree Regression'] = {'max_depth': [3, 5, 10, None]}
    grids['Random Forest Regression'] = {
        'n_estimators': [10, 50, 100],
        'max_depth': [3, 5, 10, None],
    }
    grids['KNN Regression'] = {'n_neighbors': [3, 5, 10]}
    grids['Gaussian Process Regression'] = {'alpha': [1e-10, 1e-5, 1e-2]}
    return grids


def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None,
                   cv=10, scoring='neg_mean_squared_error'):
    """Fit ``model`` (optionally grid-searched) and score it on the test split.

    Args:
        model: Unfitted scikit-learn style estimator.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for the metrics.
        param_grid: Optional parameter grid for ``GridSearchCV``. When falsy,
            ``model`` itself is fitted with its current settings.
        cv: Cross-validation setting passed to ``GridSearchCV``. Defaults to
            10, the value that was previously hard-coded; lower it for
            datasets too small for ten folds.
        scoring: Metric ``GridSearchCV`` uses to pick the best candidate.

    Returns:
        Tuple ``(best_model, best_params, rmse, r2, mae)``; ``best_params`` is
        ``None`` when no grid search was run.
    """
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=cv, n_jobs=-1, scoring=scoring)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # No tuning requested: the passed-in estimator is fitted in place
        best_model = model
        best_model.fit(X_train, y_train)
        best_params = None

    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    return best_model, best_params, rmse, r2, mae


def evaluate_all_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run ``evaluate_model`` for every entry of ``models`` and tabulate the scores.

    Progress is printed per model. A model without an entry in ``param_grids``
    is evaluated without a parameter grid.

    Returns:
        DataFrame with the columns 'Model', 'Best Params', 'RMSE', 'R^2' and
        'MAE', one row per model in iteration order.
    """
    score_rows = []
    for label, estimator in models.items():
        print(f"Evaluating {label}...")

        grid = param_grids.get(label, None)
        outcome = evaluate_model(estimator, X_train, y_train, X_test, y_test, grid)
        # The fitted estimator (first element) is not part of the report
        _, tuned_params, rmse, r2, mae = outcome

        score_rows.append(dict([
            ('Model', label),
            ('Best Params', tuned_params),
            ('RMSE', rmse),
            ('R^2', r2),
            ('MAE', mae),
        ]))

    return pd.DataFrame(score_rows)


def print_model_results(results_df):
    """Print one delimited report per row of ``results_df``.

    Reads the columns 'Model', 'Best Params', 'RMSE', 'R^2' and 'MAE'; the
    three metrics are shown with four decimal places.
    """
    rule = "----------------------------------------"
    for _, record in results_df.iterrows():
        print("\n" + rule)
        print(f"Model: {record['Model']}")
        print(f"Best Params: {record['Best Params']}")
        for metric in ('RMSE', 'R^2', 'MAE'):
            print(f"{metric}: {record[metric]:.4f}")
        print(rule)


def print_summary(results_df):
    """Print the mean and standard deviation of RMSE, R^2 and MAE across models."""
    print("\nSummary of Metrics:")
    for metric in ('RMSE', 'R^2', 'MAE'):
        scores = results_df[metric]
        print(f"Mean {metric}: {scores.mean():.4f}")
        print(f"Std Dev {metric}: {scores.std():.4f}")


def main(data_path='algebra_roberta.csv'):
    """Benchmark every model from ``define_models`` on one embeddings CSV.

    Args:
        data_path: CSV holding the feature columns plus an ``output`` target.
            Defaults to the file that was previously hard-coded, so ``main()``
            behaves as before while other subjects can reuse the pipeline.
    """
    # Load and preprocess data
    data = load_data(data_path)
    X_train, X_test, y_train, y_test = preprocess_data(data)

    # Define models and parameters
    models = define_models()
    param_grids = define_param_grids()

    # Evaluate all models
    results_df = evaluate_all_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Print detailed results and summary
    print_model_results(results_df)
    print_summary(results_df)


if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 1.7013
R^2: 0.2282
MAE: 1.3251
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 1.6733
R^2: 0.2534
MAE: 1.2928
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.01}
RMSE: 1.7452
R^2: 0.1878
MAE: 1.4298
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 2.0917
R^2: -0.1667
MAE: 1.3750
----------------------------------------

----------------------------------------
Model: Bayesian Linear Regression
Best Params: None
RMSE: 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline


def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame and return it."""
    dataset = pd.read_csv(file_path)
    return dataset


def preprocess_data(data):
    """Split ``data`` into train/test features and targets.

    The ``output`` column is the target and every other column is a feature.
    20% of the rows are held out, with a fixed ``random_state`` of 42.

    Returns:
        ``X_train, X_test, y_train, y_test`` as produced by ``train_test_split``.
    """
    features, target = data.drop(columns=['output']), data['output']
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    return split


def define_models():
    """Build the candidate estimators, keyed by display name.

    The names are also the lookup keys used for ``define_param_grids()``.
    Tree-based models get a fixed ``random_state`` so that repeated runs of
    the notebook report the same metrics.

    Returns:
        dict mapping model name to an unfitted scikit-learn estimator.
    """
    return {
        'Linear Regression': LinearRegression(),
        'Polynomial Regression': Pipeline([
            ('poly', PolynomialFeatures(degree=2)),
            ('linear', LinearRegression())
        ]),
        'Lasso Regression': Lasso(),
        # NOTE(review): LogisticRegression is a classifier; it only fits when
        # `output` holds discrete labels and predicts class labels, so its
        # RMSE/R^2 are not comparable with the regressors. Confirm this is intended.
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Bayesian Linear Regression': BayesianRidge(),
        'Support Vector Regression': SVR(),
        # Seeded: feature/sample selection is otherwise random between runs
        'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
        'Gaussian Process Regression': GaussianProcessRegressor(),
        'Random Forest Regression': RandomForestRegressor(random_state=42),
        'KNN Regression': KNeighborsRegressor()
    }


def define_param_grids():
    """Return the grid-search candidates for the tunable models.

    Keys are model display names; models without an entry are not tuned.
    """
    grids = {}
    grids['Lasso Regression'] = {'alpha': [0.01, 0.1, 1, 10]}
    grids['Support Vector Regression'] = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    grids['Decision Tree Regression'] = {'max_depth': [3, 5, 10, None]}
    grids['Random Forest Regression'] = {
        'n_estimators': [10, 50, 100],
        'max_depth': [3, 5, 10, None],
    }
    grids['KNN Regression'] = {'n_neighbors': [3, 5, 10]}
    grids['Gaussian Process Regression'] = {'alpha': [1e-10, 1e-5, 1e-2]}
    return grids


def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None,
                   cv=10, scoring='neg_mean_squared_error'):
    """Fit ``model`` (optionally grid-searched) and score it on the test split.

    Args:
        model: Unfitted scikit-learn style estimator.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for the metrics.
        param_grid: Optional parameter grid for ``GridSearchCV``. When falsy,
            ``model`` itself is fitted with its current settings.
        cv: Cross-validation setting passed to ``GridSearchCV``. Defaults to
            10, the value that was previously hard-coded; lower it for
            datasets too small for ten folds.
        scoring: Metric ``GridSearchCV`` uses to pick the best candidate.

    Returns:
        Tuple ``(best_model, best_params, rmse, r2, mae)``; ``best_params`` is
        ``None`` when no grid search was run.
    """
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=cv, n_jobs=-1, scoring=scoring)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # No tuning requested: the passed-in estimator is fitted in place
        best_model = model
        best_model.fit(X_train, y_train)
        best_params = None

    y_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    return best_model, best_params, rmse, r2, mae


def evaluate_all_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run ``evaluate_model`` for every entry of ``models`` and tabulate the scores.

    Progress is printed per model. A model without an entry in ``param_grids``
    is evaluated without a parameter grid.

    Returns:
        DataFrame with the columns 'Model', 'Best Params', 'RMSE', 'R^2' and
        'MAE', one row per model in iteration order.
    """
    score_rows = []
    for label, estimator in models.items():
        print(f"Evaluating {label}...")

        grid = param_grids.get(label, None)
        outcome = evaluate_model(estimator, X_train, y_train, X_test, y_test, grid)
        # The fitted estimator (first element) is not part of the report
        _, tuned_params, rmse, r2, mae = outcome

        score_rows.append(dict([
            ('Model', label),
            ('Best Params', tuned_params),
            ('RMSE', rmse),
            ('R^2', r2),
            ('MAE', mae),
        ]))

    return pd.DataFrame(score_rows)


def print_model_results(results_df):
    """Print one delimited report per row of ``results_df``.

    Reads the columns 'Model', 'Best Params', 'RMSE', 'R^2' and 'MAE'; the
    three metrics are shown with four decimal places.
    """
    rule = "----------------------------------------"
    for _, record in results_df.iterrows():
        print("\n" + rule)
        print(f"Model: {record['Model']}")
        print(f"Best Params: {record['Best Params']}")
        for metric in ('RMSE', 'R^2', 'MAE'):
            print(f"{metric}: {record[metric]:.4f}")
        print(rule)


def print_summary(results_df):
    """Print the mean and standard deviation of RMSE, R^2 and MAE across models."""
    print("\nSummary of Metrics:")
    for metric in ('RMSE', 'R^2', 'MAE'):
        scores = results_df[metric]
        print(f"Mean {metric}: {scores.mean():.4f}")
        print(f"Std Dev {metric}: {scores.std():.4f}")


def main(file_path='bio_roberta.csv'):
    """Run the whole benchmark for one embeddings CSV and print the results.

    Args:
        file_path: CSV passed to ``load_data``. Defaults to the path that used
            to be hard-coded here, so a bare ``main()`` call is unchanged.
    """
    # Load the table and split it into train/test features and targets.
    data = load_data(file_path)
    X_train, X_test, y_train, y_test = preprocess_data(data)

    # Candidate estimators and the grids used to tune some of them.
    models = define_models()
    param_grids = define_param_grids()

    # Fit and score every model; one result row per model.
    results_df = evaluate_all_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Per-model metrics first, then the aggregate statistics.
    print_model_results(results_df)
    print_summary(results_df)


if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...
Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 2.3197
R^2: 0.1031
MAE: 1.7811
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 2.4954
R^2: -0.0379
MAE: 1.8049
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.1}
RMSE: 2.7140
R^2: -0.2276
MAE: 2.5198
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 3.1623
R^2: -0.6667
MAE: 2.0000
--------

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline


def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame


def preprocess_data(data):
    """Separate the ``output`` target from the other columns and split both.

    Returns:
        The result of ``train_test_split`` called with ``test_size=0.2`` and
        ``random_state=42``, i.e. ``X_train, X_test, y_train, y_test``.
    """
    features = data.drop(columns=['output'])
    target = data['output']
    split_options = {'test_size': 0.2, 'random_state': 42}
    return train_test_split(features, target, **split_options)


def define_models():
    """Build the mapping of display name -> unfitted estimator to benchmark.

    The dict keeps insertion order, so the models come back in the order they
    are listed below.
    """
    # Degree-2 feature expansion feeding an ordinary least-squares fit.
    quadratic_pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression()),
    ])

    # NOTE(review): LogisticRegression is a classifier, so every distinct target
    # value is treated as a class label -- confirm that is intended here.
    logistic = LogisticRegression(max_iter=1000)

    catalogue = {}
    catalogue['Linear Regression'] = LinearRegression()
    catalogue['Polynomial Regression'] = quadratic_pipeline
    catalogue['Lasso Regression'] = Lasso()
    catalogue['Logistic Regression'] = logistic
    catalogue['Bayesian Linear Regression'] = BayesianRidge()
    catalogue['Support Vector Regression'] = SVR()
    catalogue['Decision Tree Regression'] = DecisionTreeRegressor()
    catalogue['Gaussian Process Regression'] = GaussianProcessRegressor()
    catalogue['Random Forest Regression'] = RandomForestRegressor()
    catalogue['KNN Regression'] = KNeighborsRegressor()
    return catalogue


def define_param_grids():
    """Return the hyperparameter grid for each model that is tuned.

    Keys are model display names; each value maps a parameter name to the list
    of candidate values.
    """
    depth_choices = [3, 5, 10, None]

    grids = {}
    grids['Lasso Regression'] = {'alpha': [0.01, 0.1, 1, 10]}
    grids['Support Vector Regression'] = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    grids['Decision Tree Regression'] = {'max_depth': list(depth_choices)}
    grids['Random Forest Regression'] = {
        'n_estimators': [10, 50, 100],
        'max_depth': list(depth_choices),
    }
    grids['KNN Regression'] = {'n_neighbors': [3, 5, 10]}
    grids['Gaussian Process Regression'] = {'alpha': [1e-10, 1e-5, 1e-2]}
    return grids


def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None):
    """Fit ``model`` (tuning it first when a grid is given) and score it on the test split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data used for fitting and for the grid search.
        X_test, y_test: Data used only to compute the returned metrics.
        param_grid: Optional grid handed to ``GridSearchCV``; a falsy value
            (``None`` or an empty dict) skips the search.

    Returns:
        Tuple ``(estimator, best_params, rmse, r2, mae)``; ``best_params`` is
        ``None`` when no search was run.
    """
    if not param_grid:
        # No grid: the estimator passed in is fitted in place and returned.
        model.fit(X_train, y_train)
        fitted, chosen_params = model, None
    else:
        search = GridSearchCV(
            model,
            param_grid,
            cv=10,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
        )
        search.fit(X_train, y_train)
        fitted, chosen_params = search.best_estimator_, search.best_params_

    predictions = fitted.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    return (
        fitted,
        chosen_params,
        np.sqrt(squared_error),
        r2_score(y_test, predictions),
        mean_absolute_error(y_test, predictions),
    )


def evaluate_all_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run ``evaluate_model`` on every entry of ``models`` and collect the metrics.

    Args:
        models: Mapping of display name -> estimator.
        param_grids: Mapping of display name -> hyperparameter grid; a name
            without an entry is evaluated with ``None`` as its grid.
        X_train, y_train, X_test, y_test: Splits forwarded to ``evaluate_model``.

    Returns:
        DataFrame with one row per model and the columns ``Model``,
        ``Best Params``, ``RMSE``, ``R^2`` and ``MAE``.
    """
    columns = ('Model', 'Best Params', 'RMSE', 'R^2', 'MAE')
    rows = []
    for name in models:
        print(f"Evaluating {name}...")

        # The first returned item (the fitted estimator) is not tabulated.
        _, params, rmse, r2, mae = evaluate_model(
            models[name], X_train, y_train, X_test, y_test, param_grids.get(name)
        )
        rows.append(dict(zip(columns, (name, params, rmse, r2, mae))))

    return pd.DataFrame(rows)


def print_model_results(results_df):
    """Write one framed block of metrics to stdout for each row of ``results_df``.

    Each row must provide ``Model``, ``Best Params``, ``RMSE``, ``R^2`` and
    ``MAE``; the three metrics are shown with four decimal places.
    """
    for _, entry in results_df.iterrows():
        print("\n----------------------------------------")
        print(f"Model: {entry['Model']}")
        print(f"Best Params: {entry['Best Params']}")
        for metric in ('RMSE', 'R^2', 'MAE'):
            print(f"{metric}: {entry[metric]:.4f}")
        print("----------------------------------------")


def print_summary(results_df):
    """Print the mean and standard deviation of each metric column.

    Covers the ``RMSE``, ``R^2`` and ``MAE`` columns of ``results_df``, in that
    order, each value formatted with four decimal places.
    """
    print("\nSummary of Metrics:")
    for metric in ('RMSE', 'R^2', 'MAE'):
        print(f"Mean {metric}: {results_df[metric].mean():.4f}")
        print(f"Std Dev {metric}: {results_df[metric].std():.4f}")


def main(file_path='chemistry_roberta.csv'):
    """Run the whole benchmark for one embeddings CSV and print the results.

    Args:
        file_path: CSV passed to ``load_data``. Defaults to the path that used
            to be hard-coded here, so a bare ``main()`` call is unchanged.
    """
    # Load the table and split it into train/test features and targets.
    data = load_data(file_path)
    X_train, X_test, y_train, y_test = preprocess_data(data)

    # Candidate estimators and the grids used to tune some of them.
    models = define_models()
    param_grids = define_param_grids()

    # Fit and score every model; one result row per model.
    results_df = evaluate_all_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Per-model metrics first, then the aggregate statistics.
    print_model_results(results_df)
    print_summary(results_df)


if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...
Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 2.7192
R^2: -0.4253
MAE: 2.3546
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 2.6875
R^2: -0.3923
MAE: 2.3802
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.01}
RMSE: 2.1127
R^2: 0.1396
MAE: 2.0061
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 3.2404
R^2: -1.0241
MAE: 2.5000
-------

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

def load_data(file_path):
    """Parse the CSV file at ``file_path`` into a pandas DataFrame and return it."""
    table = pd.read_csv(file_path)
    return table

def preprocess_data(data):
    """Separate the ``output`` target from the other columns and split both.

    Returns:
        The result of ``train_test_split`` called with ``test_size=0.2`` and
        ``random_state=42``, i.e. ``X_train, X_test, y_train, y_test``.
    """
    features = data.drop(columns=['output'])
    target = data['output']
    split_options = {'test_size': 0.2, 'random_state': 42}
    return train_test_split(features, target, **split_options)

def define_models():
    """Build the mapping of display name -> unfitted estimator to benchmark.

    The dict keeps insertion order, so the models come back in the order they
    are listed below.
    """
    # Degree-2 feature expansion feeding an ordinary least-squares fit.
    quadratic_pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression()),
    ])

    # NOTE(review): LogisticRegression is a classifier, so every distinct target
    # value is treated as a class label -- confirm that is intended here.
    logistic = LogisticRegression(max_iter=1000)

    catalogue = {}
    catalogue['Linear Regression'] = LinearRegression()
    catalogue['Polynomial Regression'] = quadratic_pipeline
    catalogue['Lasso Regression'] = Lasso()
    catalogue['Logistic Regression'] = logistic
    catalogue['Bayesian Linear Regression'] = BayesianRidge()
    catalogue['Support Vector Regression'] = SVR()
    catalogue['Decision Tree Regression'] = DecisionTreeRegressor()
    catalogue['Gaussian Process Regression'] = GaussianProcessRegressor()
    catalogue['Random Forest Regression'] = RandomForestRegressor()
    catalogue['KNN Regression'] = KNeighborsRegressor()
    return catalogue

def define_param_grids():
    """Return the hyperparameter grid for each model that is tuned.

    Keys are model display names; each value maps a parameter name to the list
    of candidate values.
    """
    depth_choices = [3, 5, 10, None]

    grids = {}
    grids['Lasso Regression'] = {'alpha': [0.01, 0.1, 1, 10]}
    grids['Support Vector Regression'] = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    grids['Decision Tree Regression'] = {'max_depth': list(depth_choices)}
    grids['Random Forest Regression'] = {
        'n_estimators': [10, 50, 100],
        'max_depth': list(depth_choices),
    }
    grids['KNN Regression'] = {'n_neighbors': [3, 5, 10]}
    grids['Gaussian Process Regression'] = {'alpha': [1e-10, 1e-5, 1e-2]}
    return grids

def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None):
    """Train one estimator, optionally via grid search, and measure test error.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data used for fitting and for the grid search.
        X_test, y_test: Data used only to compute the returned metrics.
        param_grid: Grid for ``GridSearchCV``; when falsy the model is fitted
            directly with its current parameters.

    Returns:
        Tuple ``(estimator, best_params, rmse, r2, mae)``; ``best_params`` is
        ``None`` when no search was run.
    """
    estimator, tuned_params = model, None
    if param_grid:
        tuner = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='neg_mean_squared_error',
            cv=10,
            n_jobs=-1,
        )
        tuner.fit(X_train, y_train)
        estimator = tuner.best_estimator_
        tuned_params = tuner.best_params_
    else:
        # Fits the caller's estimator object in place.
        estimator.fit(X_train, y_train)

    test_predictions = estimator.predict(X_test)
    metrics = (
        np.sqrt(mean_squared_error(y_test, test_predictions)),
        r2_score(y_test, test_predictions),
        mean_absolute_error(y_test, test_predictions),
    )
    return (estimator, tuned_params, *metrics)

def evaluate_all_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run ``evaluate_model`` on every entry of ``models`` and collect the metrics.

    Args:
        models: Mapping of display name -> estimator.
        param_grids: Mapping of display name -> hyperparameter grid; a name
            without an entry is evaluated with ``None`` as its grid.
        X_train, y_train, X_test, y_test: Splits forwarded to ``evaluate_model``.

    Returns:
        DataFrame with one row per model and the columns ``Model``,
        ``Best Params``, ``RMSE``, ``R^2`` and ``MAE``.
    """
    columns = ('Model', 'Best Params', 'RMSE', 'R^2', 'MAE')
    rows = []
    for name in models:
        print(f"Evaluating {name}...")

        # The first returned item (the fitted estimator) is not tabulated.
        _, params, rmse, r2, mae = evaluate_model(
            models[name], X_train, y_train, X_test, y_test, param_grids.get(name)
        )
        rows.append(dict(zip(columns, (name, params, rmse, r2, mae))))

    return pd.DataFrame(rows)

def print_model_results(results_df):
    """Write one framed block of metrics to stdout for each row of ``results_df``.

    Each row must provide ``Model``, ``Best Params``, ``RMSE``, ``R^2`` and
    ``MAE``; the three metrics are shown with four decimal places.
    """
    for _, entry in results_df.iterrows():
        print("\n----------------------------------------")
        print(f"Model: {entry['Model']}")
        print(f"Best Params: {entry['Best Params']}")
        for metric in ('RMSE', 'R^2', 'MAE'):
            print(f"{metric}: {entry[metric]:.4f}")
        print("----------------------------------------")

def print_summary(results_df):
    """Print the mean and standard deviation of each metric column.

    Covers the ``RMSE``, ``R^2`` and ``MAE`` columns of ``results_df``, in that
    order, each value formatted with four decimal places.
    """
    print("\nSummary of Metrics:")
    for metric in ('RMSE', 'R^2', 'MAE'):
        print(f"Mean {metric}: {results_df[metric].mean():.4f}")
        print(f"Std Dev {metric}: {results_df[metric].std():.4f}")

def main(file_path='civics_roberta.csv'):
    """Run the whole benchmark for one embeddings CSV and print the results.

    Args:
        file_path: CSV passed to ``load_data``. Defaults to the path that used
            to be hard-coded here, so a bare ``main()`` call is unchanged.
    """
    # Load the table and split it into train/test features and targets.
    data = load_data(file_path)
    X_train, X_test, y_train, y_test = preprocess_data(data)

    # Candidate estimators and the grids used to tune some of them.
    models = define_models()
    param_grids = define_param_grids()

    # Fit and score every model; one result row per model.
    results_df = evaluate_all_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Per-model metrics first, then the aggregate statistics.
    print_model_results(results_df)
    print_summary(results_df)

if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...
Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 1.3554
R^2: 0.7060
MAE: 1.0210
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 1.2640
R^2: 0.7444
MAE: 1.0696
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.1}
RMSE: 2.1307
R^2: 0.2736
MAE: 1.8064
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 2.5000
R^2: 0.0000
MAE: 1.2500
-----------

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# Function to load dataset
def load_data(filepath):
    """Read the CSV at ``filepath`` and separate features from the target.

    Returns:
        ``(X, y)`` where ``y`` is the ``output`` column and ``X`` is the frame
        with that column dropped.
    """
    frame = pd.read_csv(filepath)
    features = frame.drop(columns=['output'])
    target = frame['output']
    return features, target

# Function to split data into training and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test parts.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` built from the result of
        ``train_test_split`` with the given ``test_size`` and ``random_state``.
    """
    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    train_features, test_features, train_target, test_target = parts
    return train_features, test_features, train_target, test_target

# Function to define models and hyperparameters
def define_models():
    """Build the estimators to benchmark and the grids used to tune them.

    Returns:
        ``(models, param_grids)``: ``models`` maps a display name to an unfitted
        estimator; ``param_grids`` maps some of those names to a dict of
        parameter name -> candidate values.
    """
    # Degree-2 feature expansion feeding an ordinary least-squares fit.
    quadratic_pipeline = Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression()),
    ])

    models = {}
    models['Linear Regression'] = LinearRegression()
    models['Polynomial Regression'] = quadratic_pipeline
    models['Lasso Regression'] = Lasso()
    models['Bayesian Linear Regression'] = BayesianRidge()
    models['Support Vector Regression'] = SVR()
    models['Decision Tree Regression'] = DecisionTreeRegressor()
    models['Gaussian Process Regression'] = GaussianProcessRegressor()
    models['Random Forest Regression'] = RandomForestRegressor()
    models['KNN Regression'] = KNeighborsRegressor()

    param_grids = {}
    param_grids['Lasso Regression'] = {'alpha': [0.01, 0.1, 1, 10]}
    param_grids['Support Vector Regression'] = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    param_grids['Decision Tree Regression'] = {'max_depth': [3, 5, 10, None]}
    param_grids['Random Forest Regression'] = {
        'n_estimators': [10, 50, 100],
        'max_depth': [3, 5, 10, None],
    }
    param_grids['KNN Regression'] = {'n_neighbors': [3, 5, 10]}
    param_grids['Gaussian Process Regression'] = {'alpha': [1e-10, 1e-5, 1e-2]}

    return models, param_grids

# Function to perform grid search and model fitting
def evaluate_model(model, model_name, X_train, y_train, X_test, y_test, param_grid=None):
    """Fit ``model`` (tuning it first when a grid is given) and score it on the test split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        model_name: Accepted for the caller's convenience; not used in the body.
        X_train, y_train: Data used for fitting and for the grid search.
        X_test, y_test: Data used only to compute the returned metrics.
        param_grid: Optional grid handed to ``GridSearchCV``; a falsy value
            (``None`` or an empty dict) skips the search.

    Returns:
        Tuple ``(estimator, best_params, rmse, r2, mae)``; ``best_params`` is
        ``None`` when no search was run.
    """
    if not param_grid:
        # No grid: the estimator passed in is fitted in place and returned.
        model.fit(X_train, y_train)
        fitted, chosen_params = model, None
    else:
        search = GridSearchCV(
            model,
            param_grid,
            cv=10,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
        )
        search.fit(X_train, y_train)
        fitted, chosen_params = search.best_estimator_, search.best_params_

    # Predict once, then derive all three metrics from the same predictions.
    predictions = fitted.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    return (
        fitted,
        chosen_params,
        np.sqrt(squared_error),
        r2_score(y_test, predictions),
        mean_absolute_error(y_test, predictions),
    )

# Function to evaluate and store results for all models
def evaluate_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run ``evaluate_model`` on every entry of ``models`` and collect the metrics.

    Args:
        models: Mapping of display name -> estimator.
        param_grids: Mapping of display name -> hyperparameter grid; a name
            without an entry is evaluated with ``None`` as its grid.
        X_train, y_train, X_test, y_test: Splits forwarded to ``evaluate_model``.

    Returns:
        List with one dict per model, keyed by ``Model``, ``Best Params``,
        ``RMSE``, ``R^2`` and ``MAE``.
    """
    fields = ('Model', 'Best Params', 'RMSE', 'R^2', 'MAE')
    records = []
    for name in models:
        print(f"Evaluating {name}...")

        # The first returned item (the fitted estimator) is not recorded.
        _, params, rmse, r2, mae = evaluate_model(
            models[name], name, X_train, y_train, X_test, y_test, param_grids.get(name)
        )
        records.append(dict(zip(fields, (name, params, rmse, r2, mae))))

    return records

# Function to summarize results
def summarize_results(results):
    """Print per-model metrics followed by the mean/std of each metric.

    Args:
        results: Records (one per model) with the keys ``Model``,
            ``Best Params``, ``RMSE``, ``R^2`` and ``MAE``.
    """
    metric_names = ('RMSE', 'R^2', 'MAE')
    results_df = pd.DataFrame(results)

    # One framed block per model.
    for _, entry in results_df.iterrows():
        print("\n----------------------------------------")
        print(f"Model: {entry['Model']}")
        print(f"Best Params: {entry['Best Params']}")
        for metric in metric_names:
            print(f"{metric}: {entry[metric]:.4f}")
        print("----------------------------------------")

    # Aggregate statistics across all models.
    print("\nSummary of Metrics:")
    for metric in metric_names:
        print(f"Mean {metric}: {results_df[metric].mean():.4f}")
        print(f"Std Dev {metric}: {results_df[metric].std():.4f}")

# Main function to execute the workflow
def main(filepath='comp_roberta.csv'):
    """Run the whole benchmark for one embeddings CSV and print the results.

    Args:
        filepath: CSV passed to ``load_data``. Defaults to the path that used
            to be hard-coded here, so a bare ``main()`` call is unchanged.
    """
    # Features and target from the CSV.
    X, y = load_data(filepath)

    # Train/test partition with split_data's defaults.
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Candidate estimators and the grids used to tune some of them.
    models, param_grids = define_models()

    # Fit and score every model; one result record per model.
    results = evaluate_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Per-model metrics followed by aggregate statistics.
    summarize_results(results)

if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...
Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 0.2820
R^2: 0.9810
MAE: 0.2122
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 0.2887
R^2: 0.9801
MAE: 0.2078
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.01}
RMSE: 0.5023
R^2: 0.9397
MAE: 0.4373
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 1.0000
R^2: 0.7612
MAE: 0.5000
----------

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# Function to load the dataset
def load_data(filepath):
    """Read the CSV at ``filepath`` and separate features from the target.

    Returns:
        ``(X, y)`` where ``y`` is the ``output`` column and ``X`` is the frame
        with that column dropped.
    """
    frame = pd.read_csv(filepath)
    features = frame.drop(columns=['output'])
    target = frame['output']
    return features, target

# Function to split data into training and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test parts.

    Returns whatever ``train_test_split`` returns for the given ``test_size``
    and ``random_state``, i.e. ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

# Function to define models and their respective hyperparameters
def define_models():
    """Build the estimators to benchmark and the grids used to tune them.

    Returns:
        ``(models, param_grids)``: ``models`` maps a display name to an unfitted
        estimator; ``param_grids`` maps some of those names to a dict of
        parameter name -> candidate values.
    """
    models = {
        'Linear Regression': LinearRegression(),
        # Degree-2 feature expansion feeding an ordinary least-squares fit.
        'Polynomial Regression': Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression())]),
        'Lasso Regression': Lasso(),
        # max_iter is raised above the library default because the default
        # iteration budget ran out before the solver converged on this data
        # (see the "ITERATIONS REACHED LIMIT" warning in this cell's output);
        # 1000 is the value the earlier cells of this notebook already use.
        # NOTE(review): LogisticRegression is a classifier, so every distinct
        # target value is treated as a class label -- confirm that is intended.
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Bayesian Linear Regression': BayesianRidge(),
        'Support Vector Regression': SVR(),
        'Decision Tree Regression': DecisionTreeRegressor(),
        'Gaussian Process Regression': GaussianProcessRegressor(),
        'Random Forest Regression': RandomForestRegressor(),
        'KNN Regression': KNeighborsRegressor()
    }
    
    param_grids = {
        'Lasso Regression': {'alpha': [0.01, 0.1, 1, 10]},
        'Support Vector Regression': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'Decision Tree Regression': {'max_depth': [3, 5, 10, None]},
        'Random Forest Regression': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10, None]},
        'KNN Regression': {'n_neighbors': [3, 5, 10]},
        'Gaussian Process Regression': {'alpha': [1e-10, 1e-5, 1e-2]},
    }
    
    return models, param_grids

# Function to evaluate a model and return its best estimator and metrics
def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None):
    """Train one estimator, optionally via grid search, and measure test error.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data used for fitting and for the grid search.
        X_test, y_test: Data used only to compute the returned metrics.
        param_grid: Grid for ``GridSearchCV``; when falsy the model is fitted
            directly with its current parameters.

    Returns:
        Tuple ``(estimator, best_params, rmse, r2, mae)``; ``best_params`` is
        ``None`` when no search was run.
    """
    estimator, tuned_params = model, None
    if param_grid:
        tuner = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='neg_mean_squared_error',
            cv=10,
            n_jobs=-1,
        )
        tuner.fit(X_train, y_train)
        estimator = tuner.best_estimator_
        tuned_params = tuner.best_params_
    else:
        # Fits the caller's estimator object in place.
        estimator.fit(X_train, y_train)

    # Predict once, then derive all three metrics from the same predictions.
    test_predictions = estimator.predict(X_test)
    metrics = (
        np.sqrt(mean_squared_error(y_test, test_predictions)),
        r2_score(y_test, test_predictions),
        mean_absolute_error(y_test, test_predictions),
    )
    return (estimator, tuned_params, *metrics)

# Function to evaluate all models and collect results
def evaluate_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run ``evaluate_model`` on every entry of ``models`` and collect the metrics.

    Args:
        models: Mapping of display name -> estimator.
        param_grids: Mapping of display name -> hyperparameter grid; a name
            without an entry is evaluated with ``None`` as its grid.
        X_train, y_train, X_test, y_test: Splits forwarded to ``evaluate_model``.

    Returns:
        List with one dict per model, keyed by ``Model``, ``Best Params``,
        ``RMSE``, ``R^2`` and ``MAE``.
    """
    fields = ('Model', 'Best Params', 'RMSE', 'R^2', 'MAE')
    records = []
    for name in models:
        print(f"Evaluating {name}...")

        # The first returned item (the fitted estimator) is not recorded.
        _, params, rmse, r2, mae = evaluate_model(
            models[name], X_train, y_train, X_test, y_test, param_grids.get(name)
        )
        records.append(dict(zip(fields, (name, params, rmse, r2, mae))))

    return records

# Function to summarize and print the results
def summarize_results(results):
    """Print per-model metrics followed by the mean/std of each metric.

    Args:
        results: Records (one per model) with the keys ``Model``,
            ``Best Params``, ``RMSE``, ``R^2`` and ``MAE``.
    """
    metric_names = ('RMSE', 'R^2', 'MAE')
    results_df = pd.DataFrame(results)

    # One framed block per model.
    for _, entry in results_df.iterrows():
        print("\n----------------------------------------")
        print(f"Model: {entry['Model']}")
        print(f"Best Params: {entry['Best Params']}")
        for metric in metric_names:
            print(f"{metric}: {entry[metric]:.4f}")
        print("----------------------------------------")

    # Aggregate statistics across all models.
    print("\nSummary of Metrics:")
    for metric in metric_names:
        print(f"Mean {metric}: {results_df[metric].mean():.4f}")
        print(f"Std Dev {metric}: {results_df[metric].std():.4f}")

# Main function to execute the workflow
def main(filepath='english_roberta.csv'):
    """Run the whole benchmark for one embeddings CSV and print the results.

    Args:
        filepath: CSV passed to ``load_data``. Defaults to the path that used
            to be hard-coded here, so a bare ``main()`` call is unchanged.
    """
    # Features and target from the CSV.
    X, y = load_data(filepath)

    # Train/test partition with split_data's defaults.
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Candidate estimators and the grids used to tune some of them.
    models, param_grids = define_models()

    # Fit and score every model; one result record per model.
    results = evaluate_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Per-model metrics followed by aggregate statistics.
    summarize_results(results)

if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 14695204319.3341
R^2: -62193320636248539136.0000
MAE: 10383136626.0581
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 4133128005916.6470
R^2: -4919831168628244390871040.0000
MAE: 3286727818770.4727
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.1}
RMSE: 1.1726
R^2: 0.6040
MAE: 1.1345
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 1.7795
R^2: 0.0880
MAE: 1.1667
----------------------------------------

-------------

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# Function to load the dataset
def load_data(filepath):
    """Load a CSV and hand back its feature columns and ``output`` column.

    Returns:
        ``(X, y)``: ``X`` is the loaded frame without the ``output`` column and
        ``y`` is that column on its own.
    """
    table = pd.read_csv(filepath)
    inputs = table.drop(columns=['output'])
    labels = table['output']
    return inputs, labels

# Function to split data into training and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test parts.

    Returns whatever ``train_test_split`` returns for the given ``test_size``
    and ``random_state``, i.e. ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

# Function to define models and their respective hyperparameters
def define_models():
    """Build the estimators to benchmark and the grids used to tune them.

    Returns:
        ``(models, param_grids)``: ``models`` maps a display name to an unfitted
        estimator; ``param_grids`` maps some of those names to a dict of
        parameter name -> candidate values.
    """
    models = {
        'Linear Regression': LinearRegression(),
        # Degree-2 feature expansion feeding an ordinary least-squares fit.
        'Polynomial Regression': Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression())]),
        'Lasso Regression': Lasso(),
        # max_iter is set explicitly to the value the earlier cells of this
        # notebook use, so the solver is not cut off by the smaller library
        # default and all datasets are benchmarked with the same setting.
        # NOTE(review): LogisticRegression is a classifier, so every distinct
        # target value is treated as a class label -- confirm that is intended.
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Bayesian Linear Regression': BayesianRidge(),
        'Support Vector Regression': SVR(),
        'Decision Tree Regression': DecisionTreeRegressor(),
        'Gaussian Process Regression': GaussianProcessRegressor(),
        'Random Forest Regression': RandomForestRegressor(),
        'KNN Regression': KNeighborsRegressor()
    }

    param_grids = {
        'Lasso Regression': {'alpha': [0.01, 0.1, 1, 10]},
        'Support Vector Regression': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'Decision Tree Regression': {'max_depth': [3, 5, 10, None]},
        'Random Forest Regression': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10, None]},
        'KNN Regression': {'n_neighbors': [3, 5, 10]},
        'Gaussian Process Regression': {'alpha': [1e-10, 1e-5, 1e-2]},
    }

    return models, param_grids

# Function to evaluate a model and return its best estimator and metrics
def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None):
    """Fit ``model`` (tuning it first when a grid is given) and score it on the test split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data used for fitting and for the grid search.
        X_test, y_test: Data used only to compute the returned metrics.
        param_grid: Optional grid handed to ``GridSearchCV``; a falsy value
            (``None`` or an empty dict) skips the search.

    Returns:
        Tuple ``(estimator, best_params, rmse, r2, mae)``; ``best_params`` is
        ``None`` when no search was run.
    """
    if not param_grid:
        # No grid: the estimator passed in is fitted in place and returned.
        model.fit(X_train, y_train)
        fitted, chosen_params = model, None
    else:
        search = GridSearchCV(
            model,
            param_grid,
            cv=10,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
        )
        search.fit(X_train, y_train)
        fitted, chosen_params = search.best_estimator_, search.best_params_

    # Predict once, then derive all three metrics from the same predictions.
    predictions = fitted.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    return (
        fitted,
        chosen_params,
        np.sqrt(squared_error),
        r2_score(y_test, predictions),
        mean_absolute_error(y_test, predictions),
    )

# Function to evaluate all models and collect results
def evaluate_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run ``evaluate_model`` on every entry of ``models`` and collect the metrics.

    Args:
        models: Mapping of display name -> estimator.
        param_grids: Mapping of display name -> hyperparameter grid; a name
            without an entry is evaluated with ``None`` as its grid.
        X_train, y_train, X_test, y_test: Splits forwarded to ``evaluate_model``.

    Returns:
        List with one dict per model, keyed by ``Model``, ``Best Params``,
        ``RMSE``, ``R^2`` and ``MAE``.
    """
    fields = ('Model', 'Best Params', 'RMSE', 'R^2', 'MAE')
    records = []
    for name in models:
        print(f"Evaluating {name}...")

        # The first returned item (the fitted estimator) is not recorded.
        _, params, rmse, r2, mae = evaluate_model(
            models[name], X_train, y_train, X_test, y_test, param_grids.get(name)
        )
        records.append(dict(zip(fields, (name, params, rmse, r2, mae))))

    return records

# Function to summarize and print the results
def summarize_results(results):
    """Print per-model metrics followed by the mean/std of each metric.

    Args:
        results: Records (one per model) with the keys ``Model``,
            ``Best Params``, ``RMSE``, ``R^2`` and ``MAE``.
    """
    metric_names = ('RMSE', 'R^2', 'MAE')
    results_df = pd.DataFrame(results)

    # One framed block per model.
    for _, entry in results_df.iterrows():
        print("\n----------------------------------------")
        print(f"Model: {entry['Model']}")
        print(f"Best Params: {entry['Best Params']}")
        for metric in metric_names:
            print(f"{metric}: {entry[metric]:.4f}")
        print("----------------------------------------")

    # Aggregate statistics across all models.
    print("\nSummary of Metrics:")
    for metric in metric_names:
        print(f"Mean {metric}: {results_df[metric].mean():.4f}")
        print(f"Std Dev {metric}: {results_df[metric].std():.4f}")

# Main function to execute the workflow
def main(filepath='geo_roberta.csv'):
    """Run the whole benchmark for one embeddings CSV and print the results.

    Args:
        filepath: CSV passed to ``load_data``. Defaults to the path that used
            to be hard-coded here, so a bare ``main()`` call is unchanged.
    """
    # Features and target from the CSV.
    X, y = load_data(filepath)

    # Train/test partition with split_data's defaults.
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Candidate estimators and the grids used to tune some of them.
    models, param_grids = define_models()

    # Fit and score every model; one result record per model.
    results = evaluate_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Per-model metrics followed by aggregate statistics.
    summarize_results(results)

if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...
Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 451119938494.8196
R^2: -46252090660811328978944.0000
MAE: 336609131456.5133
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 216014295918.0266
R^2: -10605040009309270835200.0000
MAE: 188686538045.5258
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.01}
RMSE: 2.1603
R^2: -0.0607
MAE: 1.7963
----------------------------------------

----------------------------------------
M

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# Function to load the dataset
def load_data(filepath):
    """Read a CSV and separate the 'output' column from everything else.

    Args:
        filepath: Location of the CSV file, passed straight to pd.read_csv.

    Returns:
        A ``(X, y)`` pair: ``X`` is the frame without the 'output' column and
        ``y`` is the 'output' column itself.
    """
    frame = pd.read_csv(filepath)
    features = frame.drop(labels=['output'], axis=1)
    target = frame['output']
    return features, target

# Function to split data into training and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and target with scikit-learn's train_test_split.

    Args:
        X: Feature data.
        y: Target data.
        test_size: Forwarded to train_test_split (default 0.2).
        random_state: Seed forwarded to train_test_split (default 42).

    Returns:
        The train_test_split result for ``(X, y)``, which callers unpack as
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

# Function to define models and hyperparameters
def define_models(random_state=42):
    """Build the estimators to compare and the grids to search for them.

    Args:
        random_state: Seed given to the decision-tree and random-forest
            regressors so repeated runs produce the same fits. Defaults to
            42, the same seed split_data uses.

    Returns:
        A ``(models, param_grids)`` pair of dicts keyed by display name.
        Models that have no entry in ``param_grids`` are meant to be fitted
        with their constructor settings only.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Polynomial Regression': Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression())]),
        'Lasso Regression': Lasso(),
        # Raised iteration cap: with scikit-learn's default the solver stopped
        # early in this notebook ("TOTAL NO. of ITERATIONS REACHED LIMIT").
        # NOTE(review): LogisticRegression is a classifier, so it only fits when
        # 'output' holds discrete labels - confirm it belongs in this comparison.
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Bayesian Linear Regression': BayesianRidge(),
        'Support Vector Regression': SVR(),
        # Seeded so tree construction / bootstrapping is repeatable.
        'Decision Tree Regression': DecisionTreeRegressor(random_state=random_state),
        'Gaussian Process Regression': GaussianProcessRegressor(),
        'Random Forest Regression': RandomForestRegressor(random_state=random_state),
        'KNN Regression': KNeighborsRegressor()
    }

    param_grids = {
        'Lasso Regression': {'alpha': [0.01, 0.1, 1, 10]},
        'Support Vector Regression': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'Decision Tree Regression': {'max_depth': [3, 5, 10, None]},
        'Random Forest Regression': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10, None]},
        'KNN Regression': {'n_neighbors': [3, 5, 10]},
        'Gaussian Process Regression': {'alpha': [1e-10, 1e-5, 1e-2]},
    }

    return models, param_grids

# Function to evaluate a model and return its best estimator and metrics
def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None):
    """Fit one estimator (optionally via grid search) and score it on the test set.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data used for fitting / cross-validated search.
        X_test, y_test: Held-out data used for the reported metrics.
        param_grid: Optional grid for GridSearchCV. When falsy, ``model`` is
            fitted directly (in place) and no search is run.

    Returns:
        ``(fitted_model, best_params, rmse, r2, mae)``; ``best_params`` is
        None when no grid search was performed.
    """
    if not param_grid:
        # No grid supplied: fit the estimator exactly as it was constructed.
        model.fit(X_train, y_train)
        fitted, chosen_params = model, None
    else:
        # 10-fold search, selecting by negated mean squared error.
        search = GridSearchCV(model, param_grid, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
        search.fit(X_train, y_train)
        fitted, chosen_params = search.best_estimator_, search.best_params_

    predictions = fitted.predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)

    return fitted, chosen_params, rmse, r2, mae

# Function to evaluate all models and collect results
def evaluate_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run evaluate_model on every entry of ``models`` and gather the metrics.

    Args:
        models: Mapping of display name -> estimator.
        param_grids: Mapping of display name -> hyperparameter grid; names
            missing from it are evaluated without a grid.
        X_train, y_train, X_test, y_test: Forwarded unchanged to evaluate_model.

    Returns:
        A list with one dict per model, keyed 'Model', 'Best Params', 'RMSE',
        'R^2' and 'MAE'. The fitted estimator is not kept.
    """
    collected = []
    for label, estimator in models.items():
        print(f"Evaluating {label}...")

        grid = param_grids.get(label)
        _, chosen_params, rmse, r2, mae = evaluate_model(
            estimator, X_train, y_train, X_test, y_test, grid
        )

        record = {'Model': label, 'Best Params': chosen_params}
        record.update({'RMSE': rmse, 'R^2': r2, 'MAE': mae})
        collected.append(record)

    return collected

# Function to summarize and print the results
def summarize_results(results):
    """Print each model's metrics, then the mean and std dev of every metric.

    Args:
        results: Records with the keys 'Model', 'Best Params', 'RMSE', 'R^2'
            and 'MAE' (the dicts evaluate_models builds).

    An empty ``results`` prints a short notice and returns; previously the
    metric-column lookups raised KeyError on the resulting empty frame.
    """
    results_df = pd.DataFrame(results)
    if results_df.empty:
        print("No results to summarize.")
        return

    # Print metrics for each model
    for _, row in results_df.iterrows():
        print("\n----------------------------------------")
        print(f"Model: {row['Model']}")
        print(f"Best Params: {row['Best Params']}")
        print(f"RMSE: {row['RMSE']:.4f}")
        print(f"R^2: {row['R^2']:.4f}")
        print(f"MAE: {row['MAE']:.4f}")
        print("----------------------------------------")

    # Print summary of metrics (same order as before: RMSE, R^2, MAE)
    print("\nSummary of Metrics:")
    for metric in ('RMSE', 'R^2', 'MAE'):
        print(f"Mean {metric}: {results_df[metric].mean():.4f}")
        print(f"Std Dev {metric}: {results_df[metric].std():.4f}")

# Main function to execute the workflow
def main(filepath='geo_roberta.csv'):
    """Run the load -> split -> evaluate -> summarize workflow on one dataset.

    Args:
        filepath: CSV handed to load_data, which expects an 'output' column.
            Defaults to 'geo_roberta.csv', the file this cell previously
            hard-coded, so ``main()`` behaves as before while other datasets
            can be run with ``main(path)`` instead of duplicating the cell.
    """
    # Load data
    X, y = load_data(filepath)

    # Split data
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Define models and parameters
    models, param_grids = define_models()

    # Evaluate models
    results = evaluate_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Summarize results
    summarize_results(results)

if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 15278220200.7725
R^2: -86855446512853368832.0000
MAE: 10158646861.3898
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 2368598497769.4995
R^2: -2087538174376159653920768.0000
MAE: 1688828727306.1777
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.01}
RMSE: 2.1645
R^2: -0.7433
MAE: 1.2459
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 2.2638
R^2: -0.9070
MAE: 1.1250
----------------------------------------

----------

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# Function to load the dataset
def load_data(filepath):
    """Read a CSV and separate the 'output' column from everything else.

    Args:
        filepath: Location of the CSV file, passed straight to pd.read_csv.

    Returns:
        A ``(X, y)`` pair: ``X`` is the frame without the 'output' column and
        ``y`` is the 'output' column itself.
    """
    frame = pd.read_csv(filepath)
    features = frame.drop(labels=['output'], axis=1)
    target = frame['output']
    return features, target

# Function to split data into training and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and target with scikit-learn's train_test_split.

    Args:
        X: Feature data.
        y: Target data.
        test_size: Forwarded to train_test_split (default 0.2).
        random_state: Seed forwarded to train_test_split (default 42).

    Returns:
        The train_test_split result for ``(X, y)``, which callers unpack as
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

# Function to define models and hyperparameters
def define_models(random_state=42):
    """Build the estimators to compare and the grids to search for them.

    Args:
        random_state: Seed given to the decision-tree and random-forest
            regressors so repeated runs produce the same fits. Defaults to
            42, the same seed split_data uses.

    Returns:
        A ``(models, param_grids)`` pair of dicts keyed by display name.
        Models that have no entry in ``param_grids`` are meant to be fitted
        with their constructor settings only.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Polynomial Regression': Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression())]),
        'Lasso Regression': Lasso(),
        # Raised iteration cap: with scikit-learn's default the solver can stop
        # before converging (a ConvergenceWarning was emitted in this notebook).
        # NOTE(review): LogisticRegression is a classifier, so it only fits when
        # 'output' holds discrete labels - confirm it belongs in this comparison.
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Bayesian Linear Regression': BayesianRidge(),
        'Support Vector Regression': SVR(),
        # Seeded so tree construction / bootstrapping is repeatable.
        'Decision Tree Regression': DecisionTreeRegressor(random_state=random_state),
        'Gaussian Process Regression': GaussianProcessRegressor(),
        'Random Forest Regression': RandomForestRegressor(random_state=random_state),
        'KNN Regression': KNeighborsRegressor()
    }

    param_grids = {
        'Lasso Regression': {'alpha': [0.01, 0.1, 1, 10]},
        'Support Vector Regression': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'Decision Tree Regression': {'max_depth': [3, 5, 10, None]},
        'Random Forest Regression': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10, None]},
        'KNN Regression': {'n_neighbors': [3, 5, 10]},
        'Gaussian Process Regression': {'alpha': [1e-10, 1e-5, 1e-2]},
    }

    return models, param_grids

# Function to evaluate a model and return its best estimator and metrics
def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None):
    """Fit one estimator (optionally via grid search) and score it on the test set.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data used for fitting / cross-validated search.
        X_test, y_test: Held-out data used for the reported metrics.
        param_grid: Optional grid for GridSearchCV. When falsy, ``model`` is
            fitted directly (in place) and no search is run.

    Returns:
        ``(fitted_model, best_params, rmse, r2, mae)``; ``best_params`` is
        None when no grid search was performed.
    """
    if not param_grid:
        # No grid supplied: fit the estimator exactly as it was constructed.
        model.fit(X_train, y_train)
        fitted, chosen_params = model, None
    else:
        # 10-fold search, selecting by negated mean squared error.
        search = GridSearchCV(model, param_grid, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
        search.fit(X_train, y_train)
        fitted, chosen_params = search.best_estimator_, search.best_params_

    predictions = fitted.predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)

    return fitted, chosen_params, rmse, r2, mae

# Function to evaluate all models and collect results
def evaluate_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run evaluate_model on every entry of ``models`` and gather the metrics.

    Args:
        models: Mapping of display name -> estimator.
        param_grids: Mapping of display name -> hyperparameter grid; names
            missing from it are evaluated without a grid.
        X_train, y_train, X_test, y_test: Forwarded unchanged to evaluate_model.

    Returns:
        A list with one dict per model, keyed 'Model', 'Best Params', 'RMSE',
        'R^2' and 'MAE'. The fitted estimator is not kept.
    """
    collected = []
    for label, estimator in models.items():
        print(f"Evaluating {label}...")

        grid = param_grids.get(label)
        _, chosen_params, rmse, r2, mae = evaluate_model(
            estimator, X_train, y_train, X_test, y_test, grid
        )

        record = {'Model': label, 'Best Params': chosen_params}
        record.update({'RMSE': rmse, 'R^2': r2, 'MAE': mae})
        collected.append(record)

    return collected

# Function to summarize and print the results
def summarize_results(results):
    """Print each model's metrics, then the mean and std dev of every metric.

    Args:
        results: Records with the keys 'Model', 'Best Params', 'RMSE', 'R^2'
            and 'MAE' (the dicts evaluate_models builds).

    An empty ``results`` prints a short notice and returns; previously the
    metric-column lookups raised KeyError on the resulting empty frame.
    """
    results_df = pd.DataFrame(results)
    if results_df.empty:
        print("No results to summarize.")
        return

    # Print metrics for each model
    for _, row in results_df.iterrows():
        print("\n----------------------------------------")
        print(f"Model: {row['Model']}")
        print(f"Best Params: {row['Best Params']}")
        print(f"RMSE: {row['RMSE']:.4f}")
        print(f"R^2: {row['R^2']:.4f}")
        print(f"MAE: {row['MAE']:.4f}")
        print("----------------------------------------")

    # Print summary of metrics (same order as before: RMSE, R^2, MAE)
    print("\nSummary of Metrics:")
    for metric in ('RMSE', 'R^2', 'MAE'):
        print(f"Mean {metric}: {results_df[metric].mean():.4f}")
        print(f"Std Dev {metric}: {results_df[metric].std():.4f}")

# Main function to execute the workflow
def main(filepath='history_roberta.csv'):
    """Run the load -> split -> evaluate -> summarize workflow on one dataset.

    Args:
        filepath: CSV handed to load_data, which expects an 'output' column.
            Defaults to 'history_roberta.csv', the file this cell previously
            hard-coded, so ``main()`` behaves as before while other datasets
            can be run with ``main(path)`` instead of duplicating the cell.
    """
    # Load data
    X, y = load_data(filepath)

    # Split data
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Define models and parameters
    models, param_grids = define_models()

    # Evaluate models
    results = evaluate_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Summarize results
    summarize_results(results)

if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...
Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 0.7770
R^2: 0.1217
MAE: 0.3885
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 0.7082
R^2: 0.2704
MAE: 0.3541
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.1}
RMSE: 1.4327
R^2: -1.9858
MAE: 1.3654
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 1.5000
R^2: -2.2727
MAE: 0.7500
---------

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# Function to load the dataset
def load_data(filepath):
    """Read a CSV and separate the 'output' column from everything else.

    Args:
        filepath: Location of the CSV file, passed straight to pd.read_csv.

    Returns:
        A ``(X, y)`` pair: ``X`` is the frame without the 'output' column and
        ``y`` is the 'output' column itself.
    """
    frame = pd.read_csv(filepath)
    features = frame.drop(labels=['output'], axis=1)
    target = frame['output']
    return features, target

# Function to split data into training and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and target with scikit-learn's train_test_split.

    Args:
        X: Feature data.
        y: Target data.
        test_size: Forwarded to train_test_split (default 0.2).
        random_state: Seed forwarded to train_test_split (default 42).

    Returns:
        The train_test_split result for ``(X, y)``, which callers unpack as
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

# Function to define models and their respective hyperparameters
def define_models(random_state=42):
    """Build the estimators to compare and the grids to search for them.

    Args:
        random_state: Seed given to the decision-tree and random-forest
            regressors so repeated runs produce the same fits. Defaults to
            42, the same seed split_data uses.

    Returns:
        A ``(models, param_grids)`` pair of dicts keyed by display name.
        Models that have no entry in ``param_grids`` are meant to be fitted
        with their constructor settings only.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Polynomial Regression': Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression())]),
        'Lasso Regression': Lasso(),
        # Raised iteration cap: with scikit-learn's default the solver can stop
        # before converging (a ConvergenceWarning was emitted in this notebook).
        # NOTE(review): LogisticRegression is a classifier, so it only fits when
        # 'output' holds discrete labels - confirm it belongs in this comparison.
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Bayesian Linear Regression': BayesianRidge(),
        'Support Vector Regression': SVR(),
        # Seeded so tree construction / bootstrapping is repeatable.
        'Decision Tree Regression': DecisionTreeRegressor(random_state=random_state),
        'Gaussian Process Regression': GaussianProcessRegressor(),
        'Random Forest Regression': RandomForestRegressor(random_state=random_state),
        'KNN Regression': KNeighborsRegressor()
    }

    param_grids = {
        'Lasso Regression': {'alpha': [0.01, 0.1, 1, 10]},
        'Support Vector Regression': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'Decision Tree Regression': {'max_depth': [3, 5, 10, None]},
        'Random Forest Regression': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 10, None]},
        'KNN Regression': {'n_neighbors': [3, 5, 10]},
        'Gaussian Process Regression': {'alpha': [1e-10, 1e-5, 1e-2]},
    }

    return models, param_grids

# Function to evaluate each model with GridSearchCV where applicable
def evaluate_model(model, X_train, y_train, X_test, y_test, param_grid=None):
    """Fit one estimator (optionally via grid search) and score it on the test set.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Data used for fitting / cross-validated search.
        X_test, y_test: Held-out data used for the reported metrics.
        param_grid: Optional grid for GridSearchCV. When falsy, ``model`` is
            fitted directly (in place) and no search is run.

    Returns:
        ``(fitted_model, best_params, rmse, r2, mae)``; ``best_params`` is
        None when no grid search was performed.
    """
    if not param_grid:
        # No grid supplied: fit the estimator exactly as it was constructed.
        model.fit(X_train, y_train)
        fitted, chosen_params = model, None
    else:
        # 10-fold search, selecting by negated mean squared error.
        search = GridSearchCV(model, param_grid, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
        search.fit(X_train, y_train)
        fitted, chosen_params = search.best_estimator_, search.best_params_

    predictions = fitted.predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)

    return fitted, chosen_params, rmse, r2, mae

# Function to evaluate all models and collect results
def evaluate_models(models, param_grids, X_train, y_train, X_test, y_test):
    """Run evaluate_model on every entry of ``models`` and gather the metrics.

    Args:
        models: Mapping of display name -> estimator.
        param_grids: Mapping of display name -> hyperparameter grid; names
            missing from it are evaluated without a grid.
        X_train, y_train, X_test, y_test: Forwarded unchanged to evaluate_model.

    Returns:
        A list with one dict per model, keyed 'Model', 'Best Params', 'RMSE',
        'R^2' and 'MAE'. The fitted estimator is not kept.
    """
    collected = []
    for label, estimator in models.items():
        print(f"Evaluating {label}...")

        grid = param_grids.get(label)
        _, chosen_params, rmse, r2, mae = evaluate_model(
            estimator, X_train, y_train, X_test, y_test, grid
        )

        record = {'Model': label, 'Best Params': chosen_params}
        record.update({'RMSE': rmse, 'R^2': r2, 'MAE': mae})
        collected.append(record)

    return collected

# Function to summarize and print the results
def summarize_results(results):
    """Print each model's metrics, then the mean and std dev of every metric.

    Args:
        results: Records with the keys 'Model', 'Best Params', 'RMSE', 'R^2'
            and 'MAE' (the dicts evaluate_models builds).

    An empty ``results`` prints a short notice and returns; previously the
    metric-column lookups raised KeyError on the resulting empty frame.
    """
    results_df = pd.DataFrame(results)
    if results_df.empty:
        print("No results to summarize.")
        return

    # Print metrics for each model
    for _, row in results_df.iterrows():
        print("\n----------------------------------------")
        print(f"Model: {row['Model']}")
        print(f"Best Params: {row['Best Params']}")
        print(f"RMSE: {row['RMSE']:.4f}")
        print(f"R^2: {row['R^2']:.4f}")
        print(f"MAE: {row['MAE']:.4f}")
        print("----------------------------------------")

    # Print summary of metrics (same order as before: RMSE, R^2, MAE)
    print("\nSummary of Metrics:")
    for metric in ('RMSE', 'R^2', 'MAE'):
        print(f"Mean {metric}: {results_df[metric].mean():.4f}")
        print(f"Std Dev {metric}: {results_df[metric].std():.4f}")

# Main function to execute the workflow
def main(filepath='network_roberta.csv'):
    """Run the load -> split -> evaluate -> summarize workflow on one dataset.

    Args:
        filepath: CSV handed to load_data, which expects an 'output' column.
            Defaults to 'network_roberta.csv', the file this cell previously
            hard-coded, so ``main()`` behaves as before while other datasets
            can be run with ``main(path)`` instead of duplicating the cell.
    """
    # Load data
    X, y = load_data(filepath)

    # Split data
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Define models and parameters
    models, param_grids = define_models()

    # Evaluate models
    results = evaluate_models(models, param_grids, X_train, y_train, X_test, y_test)

    # Summarize results
    summarize_results(results)

if __name__ == "__main__":
    main()


Evaluating Linear Regression...
Evaluating Polynomial Regression...
Evaluating Lasso Regression...
Evaluating Logistic Regression...
Evaluating Bayesian Linear Regression...
Evaluating Support Vector Regression...
Evaluating Decision Tree Regression...
Evaluating Gaussian Process Regression...
Evaluating Random Forest Regression...
Evaluating KNN Regression...

----------------------------------------
Model: Linear Regression
Best Params: None
RMSE: 1.3757
R^2: 0.6972
MAE: 1.1928
----------------------------------------

----------------------------------------
Model: Polynomial Regression
Best Params: None
RMSE: 1.5498
R^2: 0.6157
MAE: 1.3559
----------------------------------------

----------------------------------------
Model: Lasso Regression
Best Params: {'alpha': 0.1}
RMSE: 1.6319
R^2: 0.5739
MAE: 1.4316
----------------------------------------

----------------------------------------
Model: Logistic Regression
Best Params: None
RMSE: 0.0000
R^2: 1.0000
MAE: 0.0000
-----------