## Training regression models on RoBERTa embeddings

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

# Function to load and preprocess data
def load_and_preprocess_data(url):
    """Read a CSV and separate it into a feature matrix and a target vector.

    Args:
        url: Anything ``pd.read_csv`` accepts (path, URL or file-like object).

    Returns:
        A ``(X, y)`` tuple of NumPy arrays. ``X`` holds every column except
        ``'output'``; ``y`` holds the ``'output'`` column.

    Raises:
        KeyError: If the CSV has no ``'output'`` column.
    """
    frame = pd.read_csv(url)
    target = frame['output'].values
    features = frame.drop(columns=['output']).values
    return features, target

# Function to standardize features
def standardize_features(X):
    """Return ``X`` transformed by a freshly fitted ``StandardScaler``.

    The scaler is created, fitted on ``X`` and discarded inside this call, so
    it cannot be reused later to transform other data with the same
    statistics.
    """
    return StandardScaler().fit_transform(X)

# Function to initialize models
def initialize_models():
    """Build the untuned estimators to compare, keyed by display name.

    All estimators use their default settings. The polynomial model is a
    three-step pipeline (scaling, degree-2 polynomial expansion, linear
    regression) so that its ``poly__degree`` step can be tuned by name.

    Returns:
        dict mapping a display name to an unfitted estimator.
    """
    polynomial_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression()),
    ])

    return {
        "Linear Regression": LinearRegression(),
        "Polynomial Regression": polynomial_pipeline,
        "Lasso Regression": Lasso(),
        "Bayesian Linear Regression": BayesianRidge(),
        "Support Vector Regression": SVR(),
        "Decision Tree Regression": DecisionTreeRegressor(),
        "Gaussian Process Regression": GaussianProcessRegressor(),
        "Random Forest Regression": RandomForestRegressor(),
        "KNN Regression": KNeighborsRegressor(),
    }

# Function to define hyperparameter grids for each model
def define_param_grids():
    """Return the hyperparameter search grid for each model name.

    Keys are model display names; values are parameter grids in the form
    ``GridSearchCV`` expects. An empty grid means "fit once with the
    estimator's current settings".
    """
    grids = {}

    # Linear models.
    grids["Linear Regression"] = {}
    grids["Polynomial Regression"] = {'poly__degree': [2, 3, 4]}
    grids["Lasso Regression"] = {'alpha': [0.01, 0.1, 1, 10]}
    grids["Bayesian Linear Regression"] = {
        'alpha_1': [1e-6, 1e-5, 1e-4],
        'alpha_2': [1e-6, 1e-5, 1e-4],
    }

    # Kernel, tree, ensemble and neighbour-based models.
    grids["Support Vector Regression"] = {
        'C': [0.1, 1, 10],
        'epsilon': [0.01, 0.1, 0.5],
    }
    grids["Decision Tree Regression"] = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    }
    grids["Gaussian Process Regression"] = {'alpha': [1e-2, 1e-3, 1e-4]}
    grids["Random Forest Regression"] = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    }
    grids["KNN Regression"] = {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
    }
    return grids

# Function to perform GridSearchCV
def grid_search_model(model, param_grid, X_train, y_train):
    """Tune ``model`` over ``param_grid`` with cross-validated grid search.

    The search uses ``cv=10``, scores candidates by negative mean squared
    error and runs on all available cores (``n_jobs=-1``).

    Returns:
        ``(best_estimator, best_params)`` from the fitted search.
    """
    search = GridSearchCV(
        model,
        param_grid,
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_

# Function to calculate accuracy based on threshold
def calculate_accuracy(y_pred, y_test, threshold=0.1):
    """Return the percentage of predictions within a relative tolerance.

    A prediction counts as correct when ``|y_pred - y_test|`` is at most
    ``threshold * |y_test|``. For a true value of exactly 0 the tolerance is
    therefore 0, so only an exact prediction counts as correct.

    Args:
        y_pred: Predicted values (any array-like, including plain lists).
        y_test: True values, same shape as ``y_pred``.
        threshold: Allowed relative error, e.g. ``0.1`` for 10%.

    Returns:
        Percentage in ``[0, 100]``, or NaN when the inputs are empty.
    """
    # Coerce to arrays so lists and pandas objects are compared positionally.
    predicted = np.asarray(y_pred, dtype=float)
    actual = np.asarray(y_test, dtype=float)

    within_tolerance = np.abs(predicted - actual) <= threshold * np.abs(actual)
    if within_tolerance.size == 0:
        # np.mean of an empty array warns and yields NaN; return NaN quietly.
        return float('nan')
    return np.mean(within_tolerance) * 100

# Main function to execute model training and evaluation
def train_and_evaluate_models(url):
    """Tune every model on a CSV dataset and print a ranking by accuracy.

    The data is loaded with ``load_and_preprocess_data``, split 80/20
    (``random_state=42``), and each model from ``initialize_models`` is tuned
    with ``grid_search_model`` on the training part. The tuned model is then
    scored on the held-out part with ``calculate_accuracy``.

    A model that raises during tuning or prediction is reported and skipped
    so that one failure does not abort the whole comparison.

    Args:
        url: Location of the CSV, passed through to
            ``load_and_preprocess_data``.
    """
    # Load and preprocess data
    X, y = load_and_preprocess_data(url)

    # Split the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize models and hyperparameter grids
    models = initialize_models()
    param_grids = define_param_grids()

    results = []

    for model_name, model in models.items():
        print(f"Running GridSearchCV for {model_name}...")

        try:
            # Perform grid search for hyperparameter tuning
            best_model, best_params = grid_search_model(model, param_grids.get(model_name, {}), X_train, y_train)

            # Score the tuned model on the held-out test set
            y_pred = best_model.predict(X_test)
            accuracy = calculate_accuracy(y_pred, y_test)

            results.append({
                'Model': model_name,
                'Best Params': best_params,
                'Accuracy': accuracy
            })
        except Exception as e:
            # Deliberately broad: keep comparing the remaining models.
            print(f"Model: {model_name} - Error: {str(e)}")

    if not results:
        # An empty DataFrame has no "Accuracy" column, so sorting by it would
        # raise KeyError; report the situation instead.
        print("No model completed successfully; nothing to report.")
        return

    # Print results sorted by accuracy
    results_df = pd.DataFrame(results)
    print(results_df.sort_values(by="Accuracy", ascending=False))

# Run the model training and evaluation.
# The CSV path is relative to the working directory; the file must contain an
# 'output' column, which load_and_preprocess_data() uses as the target.
train_and_evaluate_models('roberta_embeddings_input_output_custom_columns.csv')


Running GridSearchCV for Linear Regression...
Running GridSearchCV for Polynomial Regression...


20 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-pa

Running GridSearchCV for Lasso Regression...
Running GridSearchCV for Logistic Regression...
Running GridSearchCV for Bayesian Linear Regression...
Running GridSearchCV for Support Vector Regression...
Running GridSearchCV for Decision Tree Regression...
Running GridSearchCV for Gaussian Process Regression...
Running GridSearchCV for Random Forest Regression...
Running GridSearchCV for KNN Regression...
                         Model  \
6     Decision Tree Regression   
3          Logistic Regression   
9               KNN Regression   
0            Linear Regression   
7  Gaussian Process Regression   
1        Polynomial Regression   
8     Random Forest Regression   
4   Bayesian Linear Regression   
5    Support Vector Regression   
2             Lasso Regression   

                                         Best Params   Accuracy  
6        {'max_depth': None, 'min_samples_split': 2}  99.376947  
3                   {'C': 10, 'solver': 'liblinear'}  98.130841  
9          {'n_neigh

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

# Initialize results storage for additional metrics.
# evaluate_models() appends one dict per model to this module-level list, and
# calculate_metrics_summary() / print_results() read from it. Nothing clears
# it, so re-running evaluate_models() without re-running this line
# accumulates duplicate rows.
additional_results = []

# Function to calculate RMSE
def calculate_rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Function to calculate R^2
def calculate_r2(y_true, y_pred):
    """Return the coefficient of determination (R^2) via ``r2_score``."""
    coefficient = r2_score(y_true, y_pred)
    return coefficient

# Function to calculate MAE
def calculate_mae(y_true, y_pred):
    """Return the mean absolute error via ``mean_absolute_error``."""
    error = mean_absolute_error(y_true, y_pred)
    return error

# Function to calculate MAPE
def calculate_mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Samples whose true value is exactly 0 are excluded: the relative error is
    undefined there, and dividing by zero would turn the whole result into
    inf/NaN and emit a RuntimeWarning.

    Args:
        y_true: True values (any array-like).
        y_pred: Predicted values, broadcastable to ``y_true``.

    Returns:
        MAPE over the samples with a non-zero true value, or NaN when there
        is no such sample.
    """
    actual, predicted = np.broadcast_arrays(
        np.asarray(y_true, dtype=float),
        np.asarray(y_pred, dtype=float),
    )

    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')

    relative_error = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
    return np.mean(relative_error) * 100

# Function to calculate accuracy based on threshold
def calculate_accuracy(y_true, y_pred, threshold=0.1):
    """Return the percentage of predictions within a relative tolerance.

    A prediction counts as correct when ``|y_pred - y_true|`` is at most
    ``threshold * |y_true|``; for a true value of exactly 0 only an exact
    prediction counts.

    Note the argument order here is ``(y_true, y_pred)``. ``threshold``
    defaults to 0.1 so that two-argument calls keep working, matching the
    default of the earlier ``calculate_accuracy`` definition in this notebook.

    Args:
        y_true: True values (any array-like, including plain lists).
        y_pred: Predicted values, same shape as ``y_true``.
        threshold: Allowed relative error, e.g. ``0.1`` for 10%.

    Returns:
        Percentage in ``[0, 100]``, or NaN when the inputs are empty.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    within_tolerance = np.abs(predicted - actual) <= threshold * np.abs(actual)
    if within_tolerance.size == 0:
        # np.mean of an empty array warns and yields NaN; return NaN quietly.
        return float('nan')
    return np.mean(within_tolerance) * 100  # Percentage of correct predictions

# Function to perform grid search and evaluation on models
def evaluate_models(models, param_grids, X, y, threshold):
    """Tune each model with grid search and record its metrics.

    For every entry in ``models`` a ``GridSearchCV`` (``cv=10``, negative MSE
    scoring) is fitted on ``X``/``y``. The best estimator then predicts on
    the same ``X`` it was fitted on, so the recorded metrics are in-sample
    rather than held-out estimates.

    One dict per model is appended to the module-level
    ``additional_results`` list; nothing is returned.

    Args:
        models: Mapping of display name to unfitted estimator.
        param_grids: Mapping of display name to parameter grid; a missing
            name falls back to an empty grid.
        X: Feature matrix.
        y: Target values.
        threshold: Relative tolerance forwarded to ``calculate_accuracy``.
    """
    global additional_results

    for model_name, estimator in models.items():
        print(f"Evaluating {model_name}...")

        # Grid search with cv=10 over this model's grid (empty if unlisted).
        search = GridSearchCV(
            estimator,
            param_grids.get(model_name, {}),
            cv=10,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
        )
        search.fit(X, y)

        predictions = search.best_estimator_.predict(X)

        # Metrics are computed in a fixed order, then merged into one record.
        scores = {
            'RMSE': calculate_rmse(y, predictions),
            'R^2': calculate_r2(y, predictions),
            'MAE': calculate_mae(y, predictions),
            'MAPE': calculate_mape(y, predictions),
        }
        accuracy = calculate_accuracy(y, predictions, threshold)

        additional_results.append({
            'Model': model_name,
            'Best Params': search.best_params_,
            'Accuracy': accuracy,
            **scores,
        })

# Function to calculate summary statistics for metrics
def calculate_metrics_summary():
    """Summarise RMSE, R^2, MAE and MAPE across all recorded models.

    Reads the module-level ``additional_results`` list and returns a dict
    with a ``'Mean <metric>'`` and a ``'Std Dev <metric>'`` entry for each of
    the four metrics, in that order.
    """
    summary = {}
    for metric in ('RMSE', 'R^2', 'MAE', 'MAPE'):
        values = [entry[metric] for entry in additional_results]
        summary[f'Mean {metric}'] = np.mean(values)
        summary[f'Std Dev {metric}'] = np.std(values)
    return summary

# Function to print the evaluation results
def print_results():
    """Print the per-model metrics table followed by the aggregate summary.

    Reads the module-level ``additional_results`` list. The table is sorted
    by accuracy, best first; the summary comes from
    ``calculate_metrics_summary``. When no results have been recorded yet a
    short notice is printed instead.
    """
    if not additional_results:
        # An empty DataFrame has no "Accuracy" column, so sort_values would
        # raise KeyError; report the situation instead.
        print("No evaluation results to display.")
        return

    # Convert results to a DataFrame for display
    additional_results_df = pd.DataFrame(additional_results)

    # Calculate summary of metrics
    metrics_summary = calculate_metrics_summary()

    # Print individual model results sorted by accuracy
    print(additional_results_df.sort_values(by="Accuracy", ascending=False))

    # Print summary of metrics
    print("\nSummary of Metrics:")
    for metric, value in metrics_summary.items():
        print(f"{metric}: {value}")


Evaluating Linear Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100
  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Polynomial Regression...


20 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-pa

Evaluating Lasso Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Logistic Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100
  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Bayesian Linear Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Support Vector Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Decision Tree Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Gaussian Process Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Random Forest Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating KNN Regression...
                         Model  \
6     Decision Tree Regression   
3          Logistic Regression   
9               KNN Regression   
0            Linear Regression   
7  Gaussian Process Regression   
1        Polynomial Regression   
8     Random Forest Regression   
4   Bayesian Linear Regression   
5    Support Vector Regression   
2             Lasso Regression   

                                         Best Params   Accuracy      RMSE  \
6          {'max_depth': 30, 'min_samples_split': 2}  99.376947  0.078934   
3                   {'C': 10, 'solver': 'liblinear'}  98.130841  0.249610   
9          {'n_neighbors': 7, 'weights': 'distance'}  86.604361  0.078934   
0                                                 {}  66.355140  0.078934   
7                                  {'alpha': 0.0001}  64.174455  0.078935   
1                                {'poly__degree': 2}  61.682243  0.181429   
8  {'max_depth': 10, 'min_samples_split': 5, 'n_e...  28.

  mape = np.mean(np.abs((y - y_pred) / y)) * 100
  mape = np.mean(np.abs((y - y_pred) / y)) * 100


## Testing RoBERTa Q1

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

# Function to load dataset
def load_data(file_path):
    """Read a CSV and split it into features and target.

    The last column is taken as the target; every other column is a feature.

    Returns:
        ``(X, y)`` where ``X`` is a DataFrame and ``y`` is a Series.
    """
    frame = pd.read_csv(file_path)
    features, target = frame.iloc[:, :-1], frame.iloc[:, -1]
    return features, target

# Function to split dataset
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test parts.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # train_test_split returns a list; callers receive a 4-tuple.
    train_X, test_X, train_y, test_y = parts
    return train_X, test_X, train_y, test_y

# Function to define models
def define_models():
    """Return the estimators to compare, keyed by display name.

    Every estimator uses scikit-learn defaults except:

    * the polynomial pipeline, which expands the features to degree 2 before
      a linear regression, and
    * ``LogisticRegression``, whose ``max_iter`` is raised to 1000 because the
      default iteration limit was reached before the solver converged
      ("TOTAL NO. of ITERATIONS REACHED LIMIT" warning).

    ``LogisticRegression`` is a classifier, so unlike the regressors it can
    only predict target values it saw during training.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Polynomial Regression (Degree 2)": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
        "Lasso Regression": Lasso(),
        # Higher max_iter so the solver can converge on unscaled features.
        "Logistic Regression (used for binary targets)": LogisticRegression(max_iter=1000),
        "Bayesian Linear Regression": BayesianRidge(),
        "Support Vector Regression": SVR(),
        "Decision Tree Regression": DecisionTreeRegressor(),
        "Gaussian Process Regression": GaussianProcessRegressor(),
        "Random Forest Regression": RandomForestRegressor(),
        "KNN Regression": KNeighborsRegressor()
    }
    return models

# Function to train and evaluate models
def train_and_evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit each model and print its first five predictions next to the truth.

    A model that raises while fitting, predicting or printing is reported
    with its error message and the loop moves on to the next model.

    Args:
        models: Mapping of display name to an object with ``fit``/``predict``.
        X_train, y_train: Training features and targets.
        X_test: Features to predict on.
        y_test: True targets; must support ``[:5].values`` (a pandas Series).
    """
    separator = "-" * 40

    for label, estimator in models.items():
        try:
            estimator.fit(X_train, y_train)
            predictions = estimator.predict(X_test)

            # Show only the first five rows so the output stays readable.
            print(f"Model: {label}")
            print(f"Predicted Marks: {predictions[:5]}")
            print(f"Actual Marks: {y_test[:5].values}")
            print(separator)
        except Exception as exc:
            print(f"Model: {label} - Error: {exc!s}")
            print(separator)

# Main function to execute the workflow
def main(file_path):
    """Run the full workflow for one CSV: load, split, fit and report.

    Args:
        file_path: Location of the CSV, passed to ``load_data``.
    """
    features, target = load_data(file_path)

    # Hold out a test set using split_data's default arguments.
    train_X, test_X, train_y, test_y = split_data(features, target)

    train_and_evaluate_models(define_models(), train_X, train_y, test_X, test_y)


# Run the workflow on the Q1 dataset (path relative to the working directory).
main("Q1_roberta.csv")


Model: Linear Regression
Predicted Marks: [1.0289363  5.75741562 2.14466826 1.35928119 4.69795946]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Polynomial Regression (Degree 2)
Predicted Marks: [0.35320928 5.68454073 1.96005016 1.06915436 4.45912791]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Lasso Regression
Predicted Marks: [1.33864018 2.96700039 2.30919169 1.40855422 3.02428506]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Logistic Regression (used for binary targets)
Predicted Marks: [0 5 1 0 5]
Actual Marks: [0 4 2 1 5]
----------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Bayesian Linear Regression
Predicted Marks: [1.13523047 5.26043009 1.90027108 0.87873819 4.46230376]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Support Vector Regression
Predicted Marks: [0.8408823  3.14526147 1.885041   0.90686789 3.04718351]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Decision Tree Regression
Predicted Marks: [0. 4. 2. 0. 5.]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Gaussian Process Regression
Predicted Marks: [ 1.92924038e-03  3.99802120e+00  1.38951095e+00 -1.71662313e-03
  4.37649848e+00]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Random Forest Regression
Predicted Marks: [1.14 4.35 1.44 1.17 4.4 ]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: KNN Regression
Predicted Marks: [0.2 4.4 2.6 0.8 4.2]
Actual Marks: [0 4 2 1 5]
----------------------------------------


## Testing RoBERTa Q2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Function to load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

# Function to handle missing values
def handle_missing_values(data):
    """Return ``data`` without the rows whose target (last column) is NaN.

    Only the target column is checked; missing values in feature columns are
    left in place. The input frame is not modified.
    """
    target_column = data.columns[-1]
    cleaned = data.dropna(subset=[target_column])
    return cleaned

# Function to split dataset into features and target
def split_data(data):
    """Split a frame into features (all but last column) and target (last).

    Returns:
        ``(X, y)`` where ``X`` is a DataFrame and ``y`` is a Series.
    """
    features, target = data.iloc[:, :-1], data.iloc[:, -1]
    return features, target

# Function to split data into training and test sets
def train_test_split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test parts.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # train_test_split returns a list; callers receive a 4-tuple.
    train_X, test_X, train_y, test_y = parts
    return train_X, test_X, train_y, test_y

# Function to define models
def define_models():
    """Return the estimators to compare, keyed by display name.

    All estimators use their default settings; the polynomial entry is a
    pipeline that expands the features to degree 2 before a linear
    regression.
    """
    degree_two_regression = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())

    return {
        "Linear Regression": LinearRegression(),
        "Polynomial Regression (Degree 2)": degree_two_regression,
        "Lasso Regression": Lasso(),
        "Logistic Regression (used for binary targets)": LogisticRegression(),
        "Bayesian Linear Regression": BayesianRidge(),
        "Support Vector Regression": SVR(),
        "Decision Tree Regression": DecisionTreeRegressor(),
        "Gaussian Process Regression": GaussianProcessRegressor(),
        "Random Forest Regression": RandomForestRegressor(),
        "KNN Regression": KNeighborsRegressor(),
    }

# Function to train and evaluate models
def train_and_evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit each model and print its first five predictions next to the truth.

    Any exception raised for a model is caught and reported with its message,
    and the loop continues with the next model.

    Args:
        models: Mapping of display name to an object with ``fit``/``predict``.
        X_train, y_train: Training features and targets.
        X_test: Features to predict on.
        y_test: True targets; must support ``[:5].values`` (a pandas Series).
    """
    rule = "-" * 40

    for name, candidate in models.items():
        try:
            candidate.fit(X_train, y_train)
            predicted = candidate.predict(X_test)

            # Only a five-row preview is printed, not a metric.
            print(f"Model: {name}")
            print(f"Predicted Marks: {predicted[:5]}")
            print(f"Actual Marks: {y_test[:5].values}")
            print(rule)
        except Exception as error:
            print(f"Model: {name} - Error: {error!s}")
            print(rule)

# Main function to execute the workflow
def main(file_path):
    """Run the full workflow for one CSV: load, clean, split, fit and report.

    Args:
        file_path: Location of the CSV, passed to ``load_data``.
    """
    # Load, then drop the rows that have no target value.
    cleaned = handle_missing_values(load_data(file_path))

    features, target = split_data(cleaned)
    train_X, test_X, train_y, test_y = train_test_split_data(features, target)

    train_and_evaluate_models(define_models(), train_X, train_y, test_X, test_y)

# Run the workflow on the Q2 dataset (path relative to the working directory).
main("Q2_roberta.csv")


Model: Linear Regression
Predicted Marks: [4.41583839 0.70612459 4.41583839 0.58907309 5.        ]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Polynomial Regression (Degree 2)
Predicted Marks: [4.35432203 0.65807689 4.35432203 0.62780126 5.        ]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Lasso Regression
Predicted Marks: [2.38095238 2.38095238 2.38095238 2.38095238 2.38095238]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Logistic Regression (used for binary targets)
Predicted Marks: [5. 0. 5. 0. 5.]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Bayesian Linear Regression
Predicted Marks: [4.41856281 0.64074635 4.41856281 0.61533833 4.99999996]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Support Vector Regression
Predicted Marks: [2.8728303  1.70818516 2.8728303  1.58295119 3.04024308]
Actual Marks: [4. 0. 4. 0. 

## Testing RoBERTa Q3

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Function to load dataset
def load_data(file_path):
    """Load the CSV at ``file_path`` into a DataFrame and return it."""
    frame = pd.read_csv(file_path)
    return frame

# Function to handle missing values
def handle_missing_values(data):
    """Drop the rows whose target value (last column) is missing.

    Feature columns are not inspected, so NaNs there are kept. A new frame
    is returned; ``data`` itself is left unchanged.
    """
    last_column = data.columns[-1]
    return data.dropna(subset=[last_column])

# Function to split dataset into features and target
def split_data(data):
    """Return ``(X, y)``: all columns but the last, and the last column."""
    target = data.iloc[:, -1]
    features = data.iloc[:, :-1]
    return features, target

# Function to split data into training and test sets
def train_test_split_data(X, y, test_size=0.2, random_state=42):
    """Hold out part of the data for testing.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # train_test_split returns a list; convert so callers get a 4-tuple.
    return tuple(
        train_test_split(X, y, test_size=test_size, random_state=random_state)
    )

# Function to define models
def define_models():
    """Return the estimators to compare, keyed by display name.

    Every estimator uses scikit-learn defaults except:

    * the polynomial pipeline, which expands the features to degree 2 before
      a linear regression, and
    * ``LogisticRegression``, whose ``max_iter`` is raised to 1000 because the
      default iteration limit was reached before the solver converged
      ("TOTAL NO. of ITERATIONS REACHED LIMIT" warning).

    ``LogisticRegression`` is a classifier, so unlike the regressors it can
    only predict target values it saw during training.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Polynomial Regression (Degree 2)": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
        "Lasso Regression": Lasso(),
        # Higher max_iter so the solver can converge on unscaled features.
        "Logistic Regression (used for binary targets)": LogisticRegression(max_iter=1000),
        "Bayesian Linear Regression": BayesianRidge(),
        "Support Vector Regression": SVR(),
        "Decision Tree Regression": DecisionTreeRegressor(),
        "Gaussian Process Regression": GaussianProcessRegressor(),
        "Random Forest Regression": RandomForestRegressor(),
        "KNN Regression": KNeighborsRegressor()
    }
    return models

# Function to train and evaluate models
def train_and_evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit each model and print its first five predictions as whole marks.

    Predictions and true values are rounded to the nearest integer before
    printing. (Casting with ``astype(int)`` alone truncates toward zero, so a
    prediction of 2.9 would be shown as 2.) Exact halves follow NumPy's
    round-half-to-even rule.

    A model that raises is reported with its error message and the loop
    continues with the next model.

    Args:
        models: Mapping of display name to an object with ``fit``/``predict``.
        X_train, y_train: Training features and targets.
        X_test: Features to predict on.
        y_test: True targets (a pandas Series).
    """
    for model_name, model in models.items():
        try:
            # Fit the model
            model.fit(X_train, y_train)
            # Predict on the test set
            y_pred = model.predict(X_test)

            # Round to the nearest whole mark, then convert to integers
            y_pred_int = y_pred.round().astype(int)
            y_test_int = y_test.round().astype(int)

            # tolist() yields plain Python ints, so the printed list looks
            # the same regardless of how NumPy formats its scalar types.
            print(f"Model: {model_name}")
            print(f"Predicted Marks: {y_pred_int[:5].tolist()}")  # First 5 predictions
            print(f"Actual Marks: {y_test_int[:5].tolist()}")  # First 5 actual values
            print("-" * 40)
        except Exception as e:
            print(f"Model: {model_name} - Error: {str(e)}")
            print("-" * 40)

# Main function to execute the workflow
def main(file_path):
    """Run the full workflow for one CSV: load, clean, split, fit and report.

    Args:
        file_path: Location of the CSV, passed to ``load_data``.
    """
    raw = load_data(file_path)
    usable = handle_missing_values(raw)  # rows without a target are dropped

    features, target = split_data(usable)
    train_X, test_X, train_y, test_y = train_test_split_data(features, target)

    candidates = define_models()
    train_and_evaluate_models(candidates, train_X, train_y, test_X, test_y)

# Run the workflow on the Q3 dataset (path relative to the working directory).
main("Q3_roberta.csv")


Model: Linear Regression
Predicted Marks: [3, 2, 0, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Polynomial Regression (Degree 2)
Predicted Marks: [3, 2, 0, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Lasso Regression
Predicted Marks: [2, 2, 1, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Logistic Regression (used for binary targets)
Predicted Marks: [4, 1, 1, 1, 3]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Bayesian Linear Regression
Predicted Marks: [3, 2, 0, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Support Vector Regression
Predicted Marks: [2, 2, 1, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Decision Tree Regression
Predicted Marks: [3, 4, 1, 5, 1]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Gaussian Process Regression
Predicted Marks: [0, 0, 0, 0, 0]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Random Forest Regression
Predicted Marks: [2, 2, 1, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: KNN Regression
Predicted Marks: [1, 1, 0, 1, 1]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
