## GPT2 training 

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, BayesianRidge, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

def load_data(url):
    """
    Read a CSV file and separate it into a feature matrix and a target vector.

    Parameters
    ----------
    url : str or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X, y)`` as NumPy arrays, where ``y`` is the ``'output'`` column
        and ``X`` is every other column of the file.
    """
    frame = pd.read_csv(url)
    # Everything except the target column is treated as a feature.
    features = frame.drop(columns=['output']).values
    targets = frame['output'].values
    return features, targets

def initialize_models():
    """
    Build the candidate estimators, keyed by display name.

    Returns
    -------
    dict
        Maps each model name to a fresh, unfitted scikit-learn estimator.
        The names are the same keys used by ``initialize_param_grids``.
    """
    return {
        "Linear Regression": LinearRegression(),
        # Scale first, then expand to polynomial terms, then fit a linear model.
        # The step name 'poly' is what the 'poly__degree' grid entry refers to.
        "Polynomial Regression": Pipeline([
            ('scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=2)),
            ('linear', LinearRegression()),
        ]),
        "Lasso Regression": Lasso(),
        # NOTE(review): LogisticRegression is a classifier, so it treats every
        # distinct target value as a class - confirm that is intended here.
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Bayesian Linear Regression": BayesianRidge(),
        "Support Vector Regression": SVR(),
        "Decision Tree Regression": DecisionTreeRegressor(),
        "Gaussian Process Regression": GaussianProcessRegressor(),
        "Random Forest Regression": RandomForestRegressor(),
        "KNN Regression": KNeighborsRegressor(),
    }

def initialize_param_grids():
    """
    Build the hyperparameter search space for every model.

    Returns
    -------
    dict
        Maps each model name to the parameter grid handed to GridSearchCV.
        "Linear Regression" has an empty grid (nothing to tune).
    """
    grids = {}

    grids["Linear Regression"] = {}

    # 'poly__degree' addresses the 'degree' parameter of the pipeline step
    # named 'poly'.
    # NOTE(review): the recorded run reports "20 fits failed out of a total of
    # 30" for this model - presumably the degree 3 and 4 expansions; confirm
    # before keeping them in the grid.
    grids["Polynomial Regression"] = {'poly__degree': [2, 3, 4]}

    grids["Lasso Regression"] = {'alpha': [0.01, 0.1, 1, 10]}
    grids["Logistic Regression"] = {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear'],
    }
    grids["Bayesian Linear Regression"] = {
        'alpha_1': [1e-6, 1e-5, 1e-4],
        'alpha_2': [1e-6, 1e-5, 1e-4],
    }
    grids["Support Vector Regression"] = {
        'C': [0.1, 1, 10],
        'epsilon': [0.01, 0.1, 0.5],
    }
    grids["Decision Tree Regression"] = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    }
    grids["Gaussian Process Regression"] = {'alpha': [1e-2, 1e-3, 1e-4]}
    grids["Random Forest Regression"] = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    }
    grids["KNN Regression"] = {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
    }

    return grids

def run_grid_search(models, param_grids, X, y, threshold=0.1):
    """
    Tune every model with GridSearchCV and score the refitted best estimator.

    For each entry of ``models`` a 10-fold grid search, scored by negative
    mean squared error, is run over the matching entry of ``param_grids``
    (an empty grid is used when the model has no entry). The best estimator
    then predicts on ``X``; a prediction counts as correct when its absolute
    error is at most ``threshold * |y|``.

    Returns
    -------
    list of dict
        One dict per model with the keys 'Model', 'Best Params' and
        'Accuracy' (percentage of correct predictions).
    """
    summary = []

    for name, estimator in models.items():
        print(f"Running GridSearchCV for {name}...")

        grid = param_grids.get(name, {})
        search = GridSearchCV(estimator, grid, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
        search.fit(X, y)

        # NOTE(review): predictions are made on the same X the search was
        # fitted on, so the accuracy below is an in-sample figure.
        predictions = search.best_estimator_.predict(X)

        # Relative tolerance: the allowed error scales with the size of the target.
        abs_error = np.abs(predictions - y)
        tolerance = threshold * np.abs(y)
        hit_rate = np.mean(abs_error <= tolerance) * 100

        summary.append({
            'Model': name,
            'Best Params': search.best_params_,
            'Accuracy': hit_rate
        })

    return summary

def main():
    """
    Run the tuning workflow end to end and print a leaderboard.

    Loads the embeddings dataset, tunes every model returned by
    ``initialize_models`` over the grids from ``initialize_param_grids``,
    and prints the per-model results sorted by accuracy, best first.
    """
    data_path = 'gpt2_embeddings_input_output_custom_columns'
    X, y = load_data(data_path)

    # One row per model: name, best hyperparameters and accuracy.
    leaderboard = pd.DataFrame(
        run_grid_search(initialize_models(), initialize_param_grids(), X, y)
    )
    print(leaderboard.sort_values(by="Accuracy", ascending=False))


# Execute the workflow when this cell/script is run directly
if __name__ == "__main__":
    main()


Running GridSearchCV for Linear Regression...
Running GridSearchCV for Polynomial Regression...


20 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-pa

Running GridSearchCV for Lasso Regression...
Running GridSearchCV for Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Running GridSearchCV for Bayesian Linear Regression...
Running GridSearchCV for Support Vector Regression...
Running GridSearchCV for Decision Tree Regression...
Running GridSearchCV for Gaussian Process Regression...
Running GridSearchCV for Random Forest Regression...
Running GridSearchCV for KNN Regression...
                         Model  \
3          Logistic Regression   
6     Decision Tree Regression   
9               KNN Regression   
7  Gaussian Process Regression   
0            Linear Regression   
1        Polynomial Regression   
8     Random Forest Regression   
4   Bayesian Linear Regression   
2             Lasso Regression   
5    Support Vector Regression   

                                         Best Params   Accuracy  
3                       {'C': 10, 'solver': 'lbfgs'}  98.753894  
6          {'max_depth': 20, 'min_samples_split': 5}  90.342679  
9          {'n_neighbors': 7, 'weights': 'distance'}  86.292835  
7                                    {'alpha': 

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def _mean_absolute_percentage_error(y_true, y_pred):
    """
    Return the MAPE (in percent) over the samples whose true value is non-zero.

    Zero targets are left out because their percentage error is undefined;
    ``nan`` is returned when there is no non-zero target at all.
    Both inputs are expected to have the same shape.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan
    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100


def evaluate_model_performance(models, param_grids, X, y, threshold=0.1):
    """
    Tune every model with GridSearchCV and report several error metrics.

    Parameters
    ----------
    models : dict
        Maps a model name to an unfitted estimator.
    param_grids : dict
        Maps a model name to its parameter grid; models without an entry are
        searched over an empty grid.
    X, y : array-like
        Features and target used both for the search and for the metrics.
    threshold : float, default 0.1
        Relative tolerance: a prediction is correct when its absolute error
        is at most ``threshold * |y|``.

    Returns
    -------
    list of dict
        One dict per model with the keys 'Model', 'Best Params', 'Accuracy',
        'RMSE', 'R^2', 'MAE' and 'MAPE'. MAPE is computed over the non-zero
        targets only and is ``nan`` when every target is zero.
    """
    additional_results = []

    for model_name, model in models.items():
        print(f"Evaluating {model_name}...")

        # Apply GridSearchCV with 10-fold cross-validation
        grid_search = GridSearchCV(model, param_grids.get(model_name, {}), cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
        grid_search.fit(X, y)

        # NOTE(review): predictions are made on the same X the search was
        # fitted on, so every metric below is an in-sample figure.
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X)

        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        r2 = r2_score(y, y_pred)
        mae = mean_absolute_error(y, y_pred)
        # Dividing by y directly produced inf/nan (and a RuntimeWarning)
        # whenever a target was 0, so zero targets are skipped.
        mape = _mean_absolute_percentage_error(y, y_pred)

        # Calculate accuracy based on threshold (within threshold * |y| of the actual value)
        correct_predictions = np.abs(y_pred - y) <= threshold * np.abs(y)
        accuracy = np.mean(correct_predictions) * 100  # Percentage of correct predictions

        # Store the results
        additional_results.append({
            'Model': model_name,
            'Best Params': grid_search.best_params_,
            'Accuracy': accuracy,
            'RMSE': rmse,
            'R^2': r2,
            'MAE': mae,
            'MAPE': mape
        })

    return additional_results


def summarize_metrics(additional_results):
    """
    Print the mean and standard deviation of each error metric across models.

    ``additional_results`` is a list of per-model dicts that contain the keys
    'RMSE', 'R^2', 'MAE' and 'MAPE'. Nothing is returned.
    """
    metrics_summary = {}
    for metric in ('RMSE', 'R^2', 'MAE', 'MAPE'):
        # Collect this metric once and derive both statistics from it.
        per_model = [entry[metric] for entry in additional_results]
        metrics_summary[f'Mean {metric}'] = np.mean(per_model)
        metrics_summary[f'Std Dev {metric}'] = np.std(per_model)

    print("\nSummary of Metrics:")
    for label, figure in metrics_summary.items():
        print(f"{label}: {figure}")


def main(models=None, param_grids=None, X=None, y=None):
    """
    Evaluate every model, print the per-model table and the metric summary.

    Parameters
    ----------
    models : dict, optional
        Estimators keyed by name. Built with ``initialize_models()`` when omitted.
    param_grids : dict, optional
        Search grids keyed by name. Built with ``initialize_param_grids()``
        when omitted.
    X, y : array-like, optional
        Features and target. When either is omitted, both are loaded with
        ``load_data`` from the training dataset.
    """
    # These inputs used to be read as globals, but the training cell only
    # creates them as locals of its own main(), so a fresh top-to-bottom run
    # raised NameError here. They are now built on demand instead.
    if models is None:
        models = initialize_models()
    if param_grids is None:
        param_grids = initialize_param_grids()
    if X is None or y is None:
        # NOTE(review): this needs the training cell's load_data, which
        # returns (X, y); later cells rebind the name load_data to a function
        # that returns a DataFrame, so run this cell before those.
        X, y = load_data('gpt2_embeddings_input_output_custom_columns')

    # Evaluate models and get the results
    additional_results = evaluate_model_performance(models, param_grids, X, y)

    # Convert results to DataFrame for display
    additional_results_df = pd.DataFrame(additional_results)

    # Print individual model results sorted by accuracy
    print(additional_results_df.sort_values(by="Accuracy", ascending=False))

    # Summarize and print the metrics
    summarize_metrics(additional_results)


# Call main function to run the code
if __name__ == "__main__":
    main()


Evaluating Linear Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Polynomial Regression...


20 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-packages\sklearn\pipeline.py", line 471, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Jyoshitha\anaconda\Lib\site-pa

Evaluating Lasso Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Bayesian Linear Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Support Vector Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Decision Tree Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100
  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Gaussian Process Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100
  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating Random Forest Regression...


  mape = np.mean(np.abs((y - y_pred) / y)) * 100


Evaluating KNN Regression...
                         Model  \
3          Logistic Regression   
9               KNN Regression   
6     Decision Tree Regression   
7  Gaussian Process Regression   
0            Linear Regression   
1        Polynomial Regression   
8     Random Forest Regression   
4   Bayesian Linear Regression   
2             Lasso Regression   
5    Support Vector Regression   

                                         Best Params   Accuracy      RMSE  \
3                       {'C': 10, 'solver': 'lbfgs'}  98.753894  0.176501   
9          {'n_neighbors': 7, 'weights': 'distance'}  86.292835  0.078934   
6          {'max_depth': 10, 'min_samples_split': 5}  67.912773  0.241037   
7                                    {'alpha': 0.01}  65.109034  0.083274   
0                                                 {}  64.174455  0.078934   
1                                {'poly__degree': 2}  57.943925  0.822433   
8  {'max_depth': None, 'min_samples_split': 2, 'n...  25.

  mape = np.mean(np.abs((y - y_pred) / y)) * 100
  mape = np.mean(np.abs((y - y_pred) / y)) * 100


## GPT2 testing Q1

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Read the CSV at ``file_path`` and return an 80/20 train/test split.

    The last column is used as the target and all preceding columns as
    features. The return value is whatever ``train_test_split`` yields for
    ``(X, y)`` with ``test_size=0.2`` and ``random_state=42``.
    """
    frame = pd.read_csv(file_path)
    features, target = frame.iloc[:, :-1], frame.iloc[:, -1]
    return train_test_split(features, target, test_size=0.2, random_state=42)

# Define models
def define_models():
    """Return the candidate estimators keyed by display name.

    Every estimator is unfitted and uses its default settings, except
    LogisticRegression, which gets a larger iteration budget.
    """
    return {
        "Linear Regression": LinearRegression(),
        "Polynomial Regression (Degree 2)": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
        "Lasso Regression": Lasso(),
        # The default iteration budget hit its limit in the recorded run
        # ("TOTAL NO. of ITERATIONS REACHED LIMIT"); 1000 matches the value
        # used for the same model in the training cell.
        "Logistic Regression (used for binary targets)": LogisticRegression(max_iter=1000),
        "Bayesian Linear Regression": BayesianRidge(),
        "Support Vector Regression": SVR(),
        "Decision Tree Regression": DecisionTreeRegressor(),
        "Gaussian Process Regression": GaussianProcessRegressor(),
        "Random Forest Regression": RandomForestRegressor(),
        "KNN Regression": KNeighborsRegressor()
    }

# Train and evaluate models
def train_and_evaluate_models(models, X_train, X_test, y_train, y_test):
    """Fit each model and print a short test-set report for it.

    For every ``name -> estimator`` pair in ``models`` the estimator is
    fitted on the training data and used to predict ``X_test``. The report
    shows the first five predictions, the first five actual values and the
    mean squared error. A failure in one model is printed and does not stop
    the remaining models. Nothing is returned.
    """
    divider = "-" * 40

    for label, estimator in models.items():
        try:
            estimator.fit(X_train, y_train)
            predictions = estimator.predict(X_test)
            error = mean_squared_error(y_test, predictions)

            print(f"Model: {label}")
            print(f"Predicted Marks: {predictions[:5]}")
            print(f"Actual Marks: {y_test[:5].values}")
            print(f"Mean Squared Error: {error}")
            print(divider)
        except Exception as exc:
            # Best effort: report the problem and move on to the next model.
            print(f"Model: {label} - Error: {exc!s}")
            print(divider)

# Main function to execute the workflow
def main():
    """Load ``Q1_gpt2.csv``, split it, and print every model's test-set report."""
    # Dataset path; change this to point at a different file.
    splits = load_and_preprocess_data("Q1_gpt2.csv")
    X_train, X_test, y_train, y_test = splits
    train_and_evaluate_models(define_models(), X_train, X_test, y_train, y_test)


# Execute the workflow when this cell/script is run directly
if __name__ == "__main__":
    main()


Model: Linear Regression
Predicted Marks: [-9.02109172  3.97427566  5.19716771  3.40920753  3.73993152]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Polynomial Regression (Degree 2)
Predicted Marks: [-4.40951141  3.50546143  5.25976675  3.98902191  2.72837765]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Lasso Regression
Predicted Marks: [2.69582647 2.87930595 2.68850737 1.49693401 3.74163735]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Logistic Regression (used for binary targets)
Predicted Marks: [1 5 1 0 5]
Actual Marks: [0 4 2 1 5]
----------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Bayesian Linear Regression
Predicted Marks: [2.31159477 3.48581566 2.23789143 1.12024983 4.12170988]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Support Vector Regression
Predicted Marks: [2.18245617 2.46824315 2.7105891  1.48971462 2.88688946]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Decision Tree Regression
Predicted Marks: [1. 5. 2. 2. 2.]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Gaussian Process Regression
Predicted Marks: [4.60639633e-105 1.29752176e-003 3.99446846e-015 0.00000000e+000
 2.20763019e-042]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: Random Forest Regression
Predicted Marks: [0.98 4.1  1.92 0.84 3.74]
Actual Marks: [0 4 2 1 5]
----------------------------------------
Model: KNN Regression
Predicted Marks: [2.4 3.4 2.8 0.4 2.8]
Actual Marks: [0 4 2 1 5]
----------------------------------------


## GPT2 testing Q2

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Function to load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Function to handle missing values
def handle_missing_values(data):
    """Return ``data`` without the rows whose last-column (target) value is NaN.

    Missing values in the other columns are left untouched and the input
    frame itself is not modified.
    """
    target_column = data.columns[-1]
    cleaned = data.dropna(subset=[target_column])
    return cleaned

# Function to split dataset into features and target
def split_data(data):
    """Split ``data`` column-wise into ``(X, y)``.

    ``y`` is the last column and ``X`` is every column before it.
    """
    return data.iloc[:, :-1], data.iloc[:, -1]

# Function to split the dataset into training and test sets
def train_test_split_data(X, y, test_size=0.2, random_state=42):
    """Split ``X`` and ``y`` into train and test parts.

    Thin wrapper around ``train_test_split``; its result is returned
    unchanged. ``test_size`` and ``random_state`` are forwarded as given.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    return train_test_split(X, y, **split_options)

# Function to define models
def define_models():
    """Return the candidate estimators keyed by display name.

    Every estimator is unfitted and uses its default settings, except
    LogisticRegression, which gets a larger iteration budget.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Polynomial Regression (Degree 2)": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
        "Lasso Regression": Lasso(),
        # The default iteration budget hit its limit in the recorded run
        # ("TOTAL NO. of ITERATIONS REACHED LIMIT"); 1000 matches the value
        # used for the same model in the training cell.
        "Logistic Regression (used for binary targets)": LogisticRegression(max_iter=1000),
        "Bayesian Linear Regression": BayesianRidge(),
        "Support Vector Regression": SVR(),
        "Decision Tree Regression": DecisionTreeRegressor(),
        "Gaussian Process Regression": GaussianProcessRegressor(),
        "Random Forest Regression": RandomForestRegressor(),
        "KNN Regression": KNeighborsRegressor()
    }
    return models

# Function to train and evaluate models
def train_and_evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit each model and print its first five predictions next to the truth.

    For every ``name -> estimator`` pair in ``models`` the estimator is
    fitted on the training data and used to predict ``X_test``. A failure in
    one model is printed and does not stop the remaining models. Nothing is
    returned.
    """
    divider = "-" * 40

    for label, estimator in models.items():
        try:
            estimator.fit(X_train, y_train)
            predictions = estimator.predict(X_test)

            print(f"Model: {label}")
            print(f"Predicted Marks: {predictions[:5]}")
            print(f"Actual Marks: {y_test[:5].values}")
            print(divider)
        except Exception as exc:
            # Best effort: report the problem and move on to the next model.
            print(f"Model: {label} - Error: {exc!s}")
            print(divider)

# Main function to execute the workflow
def main(file_path):
    """Run the whole workflow for the CSV at ``file_path``.

    Steps: load the file, drop rows with a missing target, separate features
    from the target, make the train/test split, then fit and report every
    model from ``define_models``.
    """
    cleaned = handle_missing_values(load_data(file_path))
    X, y = split_data(cleaned)
    X_train, X_test, y_train, y_test = train_test_split_data(X, y)
    train_and_evaluate_models(define_models(), X_train, y_train, X_test, y_test)

# Run the workflow
# Guarded like the earlier cells so that importing this code does not start
# a training run as a side effect.
if __name__ == "__main__":
    main("Q2_gpt2.csv")

Model: Linear Regression
Predicted Marks: [2.11834685 0.42194213 2.11834685 0.39638417 5.        ]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Polynomial Regression (Degree 2)
Predicted Marks: [2.40540706 0.20690958 2.40540706 0.39260843 5.        ]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Lasso Regression
Predicted Marks: [2.31823172 0.28457469 2.31823172 1.61368492 4.53090613]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Logistic Regression (used for binary targets)
Predicted Marks: [0. 0. 0. 0. 5.]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Bayesian Linear Regression
Predicted Marks: [2.11834682 0.42194212 2.11834682 0.39638423 4.99999992]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Support Vector Regression
Predicted Marks: [1.85976259 1.79568274 1.85976259 1.89435007 1.98567768]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Decision Tree Regression
Predicted Marks: [1. 2. 1. 5. 5.]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Gaussian Process Regression
Predicted Marks: [1.00640397e-54 9.75237075e-94 1.00640397e-54 1.74700958e-54
 5.00000000e+00]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: Random Forest Regression
Predicted Marks: [1.22 1.25 1.22 1.12 5.  ]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------
Model: KNN Regression
Predicted Marks: [3.4 1.4 3.4 0.4 5. ]
Actual Marks: [4. 0. 4. 0. 5.]
----------------------------------------


## GPT2 testing Q3

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Function to load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a DataFrame."""
    return pd.read_csv(file_path)

# Function to handle missing values
def handle_missing_values(data):
    """Return ``data`` without the rows whose last-column (target) value is NaN.

    Missing values in the other columns are left untouched and the input
    frame itself is not modified. Rows are dropped rather than imputed;
    filling the target with its mean would be the alternative.
    """
    target_column = data.columns[-1]
    return data.dropna(subset=[target_column])

# Function to split dataset into features and target
def split_data(data):
    """Split ``data`` column-wise into ``(X, y)``.

    ``y`` is the last column and ``X`` is every column before it.
    """
    feature_frame = data.iloc[:, :-1]
    target_series = data.iloc[:, -1]
    return feature_frame, target_series

# Function to split data into training and test sets
def train_test_split_data(X, y, test_size=0.2, random_state=42):
    """Split ``X`` and ``y`` into train and test parts.

    Wraps ``train_test_split`` and returns the four pieces as the tuple
    ``(X_train, X_test, y_train, y_test)``.
    """
    pieces = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = pieces
    return (X_train, X_test, y_train, y_test)

# Function to define models
def define_models():
    """Return the candidate estimators keyed by display name.

    Every estimator is unfitted and uses its default settings, except
    LogisticRegression, which gets a larger iteration budget.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Polynomial Regression (Degree 2)": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
        "Lasso Regression": Lasso(),
        # The default iteration budget hit its limit in the recorded run
        # ("TOTAL NO. of ITERATIONS REACHED LIMIT"); 1000 matches the value
        # used for the same model in the training cell.
        "Logistic Regression (used for binary targets)": LogisticRegression(max_iter=1000),
        "Bayesian Linear Regression": BayesianRidge(),
        "Support Vector Regression": SVR(),
        "Decision Tree Regression": DecisionTreeRegressor(),
        "Gaussian Process Regression": GaussianProcessRegressor(),
        "Random Forest Regression": RandomForestRegressor(),
        "KNN Regression": KNeighborsRegressor()
    }
    return models

# Function to train and evaluate models
def train_and_evaluate_models(models, X_train, y_train, X_test, y_test):
    """Fit each model and print its first five whole-mark predictions.

    For every ``name -> estimator`` pair in ``models`` the estimator is
    fitted on the training data and used to predict ``X_test``. Predictions
    and actual values are rounded to the nearest integer before printing.
    A failure in one model is printed and does not stop the remaining
    models. Nothing is returned.

    ``predict`` is expected to return a NumPy array and ``y_test`` to be a
    pandas Series (both provide ``round`` and ``astype``).
    """
    for model_name, model in models.items():
        try:
            # Fit the model
            model.fit(X_train, y_train)
            # Predict on the test set
            y_pred = model.predict(X_test)

            # Round to the nearest whole mark. A bare astype(int) truncates
            # toward zero, so a prediction of 3.97 was displayed as 3.
            y_pred_int = y_pred.round().astype(int)
            y_test_int = y_test.round().astype(int)

            # tolist() yields plain Python ints, so the printed lists look the
            # same regardless of how NumPy renders its scalar types.
            print(f"Model: {model_name}")
            print(f"Predicted Marks: {y_pred_int[:5].tolist()}")  # First 5 predictions
            print(f"Actual Marks: {y_test_int[:5].tolist()}")  # First 5 actual values
            print("-" * 40)
        except Exception as e:
            # Best effort: report the problem and move on to the next model.
            print(f"Model: {model_name} - Error: {str(e)}")
            print("-" * 40)

# Main function to execute the workflow
def main(file_path):
    """Run the whole workflow for the CSV at ``file_path``.

    Steps: load the file, drop rows with a missing target, separate features
    from the target, make the train/test split, then fit and report every
    model from ``define_models``.
    """
    raw = load_data(file_path)
    usable = handle_missing_values(raw)

    features, target = split_data(usable)
    X_train, X_test, y_train, y_test = train_test_split_data(features, target)

    candidates = define_models()
    train_and_evaluate_models(candidates, X_train, y_train, X_test, y_test)

# Run the workflow
# Guarded like the earlier cells so that importing this code does not start
# a training run as a side effect.
if __name__ == "__main__":
    # NOTE(review): this section is titled GPT2 but reads "Q3_roberta.csv",
    # while the Q1/Q2 cells read "Q1_gpt2.csv" / "Q2_gpt2.csv" - confirm the
    # intended file.
    main("Q3_roberta.csv")


Model: Linear Regression
Predicted Marks: [3, 2, 0, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Polynomial Regression (Degree 2)
Predicted Marks: [3, 2, 0, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Lasso Regression
Predicted Marks: [2, 2, 1, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Logistic Regression (used for binary targets)
Predicted Marks: [4, 1, 1, 1, 3]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Bayesian Linear Regression
Predicted Marks: [3, 2, 0, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Support Vector Regression
Predicted Marks: [2, 2, 1, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Decision Tree Regression
Predicted Marks: [0, 4, 1, 4, 3]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Gaussian Process Regression
Predicted Marks: [0, 0, 0, 0, 0]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: Random Forest Regression
Predicted Marks: [2, 2, 1, 2, 2]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
Model: KNN Regression
Predicted Marks: [1, 1, 0, 1, 1]
Actual Marks: [1, 1, 1, 5, 0]
----------------------------------------
