In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor


# Directory that holds the input CSVs; the trailing slash matters because file
# names are appended by plain string concatenation.
data_dir = "/data3/lsf/Pein/Power-Prediction/data/"

# Read the train and test splits, in that order, from the same directory.
train_data, test_data = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ("train_data_92.csv", "test_data_92.csv")
)

In [None]:
# Define features and target: every column except "time", "lead_hour" and the
# target "power" is treated as a candidate predictor.
features = [
    col for col in train_data.columns if col not in ["time", "lead_hour", "power"]
]
X = train_data[features]
y = train_data["power"]

# Candidate numbers of features to keep. The search is currently pinned to a
# single value; use range(30, len(features) + 1) to sweep the full range.
k_values = [81]


def seeded_mutual_info(X_in, y_in):
    """Score each feature by mutual information with the target.

    mutual_info_regression adds random noise to continuous variables, so its
    scores change from call to call unless random_state is fixed. Pinning the
    seed makes the selected feature subset reproducible across runs.
    """
    return mutual_info_regression(X_in, y_in, random_state=42)


# Initialize variables to store the best score, its k, and the fitted selector
best_k = 0
best_score = -np.inf
best_selector = None

# Use a LGBMRegressor to judge how well each candidate feature subset predicts
model = LGBMRegressor(
    n_estimators=5,
    random_state=42,
    n_jobs=20,
)

# Iterate over k values to find the optimal number of features.
# NOTE(review): the selector is fitted on all rows before cross-validation, so
# the CV scores below are optimistic; they are only used to rank k values.
for k in k_values:
    print("Evaluating k =", k)
    mi_selector = SelectKBest(seeded_mutual_info, k=k)
    X_k_best = mi_selector.fit_transform(X, y)

    # Perform cross-validation and compute the average score
    # (negated MSE, so larger is better).
    scores = cross_val_score(
        model, X_k_best, y, cv=5, scoring="neg_mean_squared_error", n_jobs=20
    )
    average_score = np.mean(scores)

    # Update the best score and k if the current score is better, and keep the
    # fitted selector so the final feature list is exactly the one evaluated.
    if average_score > best_score:
        best_score = average_score
        best_k = k
        best_selector = mi_selector

# Fail loudly instead of silently selecting zero features when no candidate
# produced a comparable score (empty k_values or NaN cross-validation scores).
if best_selector is None:
    raise ValueError("No k value produced a valid cross-validation score")

# Reuse the selector fitted for the best k rather than re-fitting a new one
mi_selector = best_selector

selected_mi_features = np.array(features)[mi_selector.get_support()]

In [None]:
# Show the names of the features that survived selection.
print(selected_mi_features)

# Show the chosen k followed by its mean cross-validation score.
selection_summary = (best_k, best_score)
print(*selection_summary)

In [None]:
# Create new train and test data with selected features. The column order is
# the two bookkeeping columns, then the selected features, then the target.
selected_columns = ["time", "lead_hour"] + selected_mi_features.tolist() + ["power"]

# .copy() makes each result an independent frame, so the column assignment
# below cannot be mistaken for a write into a slice of the source data
# (which pandas may report as SettingWithCopyWarning).
train_data_selected = train_data[selected_columns].copy()
test_data_selected = test_data[selected_columns].copy()

# Ensure 'power' is non-negative by raising negative values to 0
train_data_selected["power"] = train_data_selected["power"].clip(lower=0)
test_data_selected["power"] = test_data_selected["power"].clip(lower=0)


print(train_data_selected.shape)
print(test_data_selected.shape)

In [None]:
# Write both selected splits as CSV next to the inputs, without the index.
for selected_frame, csv_name in (
    (train_data_selected, "train_data_84.csv"),
    (test_data_selected, "test_data_84.csv"),
):
    selected_frame.to_csv(data_dir + csv_name, index=False)

### Feature Importance Selection