# In [6]:
#Import all libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler

# In [7]:
# Load dataset
def load_csv_dataset(file_path):
    """
    Read a CSV file from disk and return its contents as a Pandas DataFrame.

    Args:
        file_path (str): Location of the CSV file to read.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    # Parsing is delegated entirely to pandas with its default options.
    return pd.read_csv(file_path)

# In [8]:
# Create target variable and predictor variables
def create_target_predictors(data: pd.DataFrame, target: str = "estimated_stock_pct"):
    """
    Separate a DataFrame into predictor columns and a single target column.

    The resulting pair is the usual (X, y) input for fitting a supervised
    machine learning model. The input DataFrame itself is left unmodified.

    Args:
        data (pd.DataFrame): The input dataset.
        target (str): Name of the column to predict.

    Returns:
        X (pd.DataFrame): Every column of ``data`` except ``target``.
        y (pd.Series): The ``target`` column.
    """
    # The tuple is evaluated left to right: predictors first, then the target.
    return data.drop(columns=[target]), data[target]


# In [14]:
# Train algorithm
def train_algorithm_with_cross_validation(X: pd.DataFrame, y: pd.Series, n_splits: int = 5) -> float:
    """
    Train and evaluate a Random Forest Regressor using K-fold cross-validation.

    For every fold a fresh scaler and a fresh model are created, the scaler
    is fitted on the training portion only, and the mean absolute error (MAE)
    on the held-out portion is printed. The average MAE over all folds is
    printed at the end and returned.

    The fold assignment is shuffled with a fixed seed (``random_state=42``),
    so the same rows land in the same folds on every run. The forest itself
    is created with default settings and no ``random_state``, so the reported
    MAE values may still vary slightly from run to run.

    Args:
        X (pd.DataFrame): Predictor variables.
        y (pd.Series): Target variable, aligned row-for-row with ``X``.
        n_splits (int): Number of folds for cross-validation.

    Returns:
        float: The average MAE across all folds (lower is better).
    """
    # MAE of each fold; these are error values, not accuracies.
    fold_maes = []

    # Initialize the KFold cross-validator
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Loop through each fold, numbering them from 1 for display
    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # Split the data into training and testing sets by position
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training fold only, so statistics of the
        # held-out fold never leak into training; reuse it for the test fold.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Train a new model for this fold
        model = RandomForestRegressor()
        model.fit(X_train, y_train)

        # Generate predictions on the held-out sample
        y_pred = model.predict(X_test)

        # Score the fold using mean absolute error
        mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
        fold_maes.append(mae)
        print(f"Fold {fold}: MAE = {mae:.3f}")

    # Compute the average MAE across all folds
    avg_mae = sum(fold_maes) / len(fold_maes)
    print(f"Average MAE: {avg_mae:.2f}")
    return avg_mae

# Example usage
# file_path = "path/to/your/csvfile.csv"
# df = load_csv_dataset(file_path)
# X, y = create_target_predictors(df, target="estimated_stock_pct")
# avg_mae = train_algorithm_with_cross_validation(X, y)

