In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
import os

In [2]:
def load_data(file_path):
    """Load remaining features from a pickle file."""
    return pd.read_pickle(file_path)
    
# # Load the remaining features
# q1_features = load_data('Datasave/Q1_remaining_features.pkl')
# q2_features = load_data('Datasave/Q2_remaining_features.pkl')

# display(q1_features)


In [3]:
def batch_impute(data, batch_size=100000):
    """Perform batch imputation, excluding time columns."""
    # Identify columns that are not of datetime.time type
    non_time_columns = [col for col in data.columns if pd.api.types.is_numeric_dtype(data[col])]

    # Select only non-time columns for imputation
    data_to_impute = data[non_time_columns]

    imputer = SimpleImputer(strategy='mean')
    imputed_data = []
    for start in range(0, len(data_to_impute), batch_size):
        end = min(start + batch_size, len(data_to_impute))
        imputed_batch = imputer.fit_transform(data_to_impute.iloc[start:end])
        imputed_data.append(imputed_batch)

    # Combine imputed data with original time columns
    imputed_data = np.vstack(imputed_data)
    imputed_df = pd.DataFrame(imputed_data, columns=non_time_columns)
    return pd.concat([imputed_df, data[data.columns.difference(non_time_columns)]], axis=1)



def evaluate_model(selected_indices, X, y):
    """Evaluate model based on selected feature indices."""
    # Select features based on passed indices
    top_features = X.iloc[:, selected_indices]
    X_train, X_test, y_train, y_test = train_test_split(top_features, y, test_size=0.2, random_state=42)

    model = GradientBoostingRegressor(random_state=42, n_estimators=50, max_depth=2)
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1)

    # Extract and sort feature importances
    feature_importances = model.feature_importances_
    sorted_idx = np.argsort(feature_importances)[::-1]

    return np.mean(np.abs(scores)), sorted_idx

def find_optimal_features(data, y_column, feature_count_range):
    sampled_data = data.sample(frac=0.1).dropna(subset=[y_column])
    X_sampled_imputed = batch_impute(sampled_data.drop(columns=[y_column]))
    y_sampled = sampled_data[y_column].values

    # Ensure that X and y have the same number of rows
    min_length = min(len(X_sampled_imputed), len(y_sampled))
    X_sampled_imputed = X_sampled_imputed[:min_length]
    y_sampled = y_sampled[:min_length]

    best_score = float('inf')
    best_n = 0
    best_features = None
    for n in feature_count_range:
        start_time = time.time()
        feature_indices = np.arange(n)
        score, sorted_idx = evaluate_model(feature_indices, X_sampled_imputed, y_sampled)
        if score < best_score:
            best_score = score
            best_n = n
            best_features = sorted_idx[:n]
        end_time = time.time()
        print(f"Evaluating {n} features took {end_time - start_time:.2f} seconds.")

    return best_n, best_score, best_features


# # Apply the modified functions
# best_n_y1, best_score_y1, best_features_y1 = find_optimal_features(q1_features, 'Y1', range(5, 10))
# print(f"Best number of features for Y1: {best_n_y1} with RMSE: {best_score_y1}")
# print(f"Selected features for Y1: {q1_features.columns[best_features_y1].tolist()}")

# best_n_y2, best_score_y2, best_features_y2 = find_optimal_features(q2_features, 'Y2', range(5, 10))
# print(f"Best number of features for Y2: {best_n_y2} with RMSE: {best_score_y2}")
# print(f"Selected features for Y2: {q2_features.columns[best_features_y2].tolist()}")

In [4]:
def main(y_var, data_path, feature_count_range):
    data = load_data(data_path)
    best_n, best_score, best_features = find_optimal_features(data, y_var, feature_count_range)
    print(f"Best number of features for {y_var}: {best_n} with RMSE: {best_score}")
    selected_features = data.columns[best_features].tolist()
    print(f"Selected features for {y_var}: {selected_features}")

    # Data with selected features and target variable
    selected_data = data[[y_var] + selected_features]
    display(selected_data)

    # Save the selected data DataFrame
    selected_data.to_pickle(f"Datasave/{y_var}_Selected_GBR.pkl")


if __name__ == "__main__":
    # Create Datasave directory if it doesn't exist
    if not os.path.exists("Datasave"):
        os.makedirs("Datasave")

    # Run for Y1 and Y2 variables
    main('Y1', 'Datasave/Q1_remaining_features.pkl', range(5, 10))
    main('Y2', 'Datasave/Q2_remaining_features.pkl', range(5, 10))

Evaluating 5 features took 44.45 seconds.
Evaluating 6 features took 52.45 seconds.
Evaluating 7 features took 61.20 seconds.
Evaluating 8 features took 69.77 seconds.
Evaluating 9 features took 73.62 seconds.
Best number of features for Y1: 6 with RMSE: 12.369761838431375
Selected features for Y1: ['X10', 'X106', 'X100', 'X1', 'X102', 'X104']


Unnamed: 0_level_0,Y1,X10,X106,X100,X1,X102,X104
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-03 09:45:01,-4.967191e+01,-0.440099,0.223038,0.044736,-1.985130,-0.453612,-0.267032
2022-01-03 09:45:02,-4.889528e+01,-0.380986,-0.737723,-0.854499,-0.933646,-0.622912,-0.177605
2022-01-03 09:45:03,-4.706744e+01,-0.180915,-0.777266,-0.854372,-0.579249,-0.320432,0.135138
2022-01-03 09:45:04,-4.759041e+01,-0.028344,-0.314707,-0.044026,-0.658292,-0.057465,0.516732
2022-01-03 09:45:05,-4.811086e+01,-0.100419,0.404281,0.273158,-1.331419,0.373589,0.562226
...,...,...,...,...,...,...,...
2023-02-28 15:59:56,-8.803647e+00,,,,,,
2023-02-28 15:59:57,-8.175329e+00,,,,,,
2023-02-28 15:59:58,-5.661267e+00,-1.136721,-0.615870,,-1.213168,-1.743618,-1.879111
2023-02-28 15:59:59,-3.774891e+00,,,,,,


Evaluating 5 features took 34.66 seconds.
Evaluating 6 features took 39.74 seconds.
Evaluating 7 features took 45.99 seconds.
Evaluating 8 features took 48.77 seconds.
Evaluating 9 features took 53.36 seconds.
Best number of features for Y2: 9 with RMSE: 16.246717165880288
Selected features for Y2: ['X119', 'X121', 'X120', 'X10', 'X118', 'X1', 'X123', 'X105', 'X122']


Unnamed: 0_level_0,Y2,X119,X121,X120,X10,X118,X1,X123,X105,X122
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-03 09:45:01,-61.707756,0.0,-0.248765,0.000000,-0.440099,0.0,-1.985130,-0.166139,-0.358980,0.000000
2022-01-03 09:45:02,-62.025402,0.0,-0.541802,0.132091,-0.380986,0.0,-0.933646,-0.954241,-1.275239,-0.775565
2022-01-03 09:45:03,-57.324017,0.0,-0.247763,0.949864,-0.180915,0.0,-0.579249,0.000000,-1.292093,0.000000
2022-01-03 09:45:04,-58.769417,0.0,0.000000,1.211052,-0.028344,0.0,-0.658292,0.000000,-0.481662,0.000000
2022-01-03 09:45:05,-59.984489,0.0,0.000000,1.127890,-0.100419,0.0,-1.331419,0.045356,-0.157728,0.236589
...,...,...,...,...,...,...,...,...,...,...
2023-02-28 15:59:56,-5.074302,,,,,,,,,
2023-02-28 15:59:57,-4.039149,,,,,,,,,
2023-02-28 15:59:58,-1.761056,,,,,,,,,
2023-02-28 15:59:59,-1.346745,,,,,,,,,
