In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor


data_dir = "/data/Pein/Pytorch/Wind-Power-Prediction/data/"
train_data = pd.read_csv(data_dir + "train_farm_92.csv")
test_data = pd.read_csv(data_dir + "test_farm_92.csv")

In [2]:
print(f'shape of train data: {train_data.shape}, shape of test data: {test_data.shape}')

shape of train data: (14592, 92), shape of test data: (2880, 92)


In [3]:
# Define features and target
features = [
    col for col in train_data.columns if col not in ["time", "lead_hour", "power"]
]
X = train_data[features]
y = train_data["power"]

# Define a range of k values to search over
k_values = range(60, len(features) + 1)
# k_values = [81]

# Initialize variables to store the best score and corresponding k
best_k = 0
best_score = -np.inf

# Use a LGBMRegressor for evaluating feature importance
model = LGBMRegressor(
    n_estimators=5,
    random_state=42,
    n_jobs=16,
)

# Iterate over k values to find the optimal number of features
for k in k_values:
    print("Evaluating k =", k)
    mi_selector = SelectKBest(mutual_info_regression, k=k)
    X_k_best = mi_selector.fit_transform(X, y)

    # Perform cross-validation and compute the average score
    scores = cross_val_score(
        model, X_k_best, y, cv=5, scoring="neg_mean_squared_error", n_jobs=20
    )
    average_score = np.mean(scores)

    # Update the best score and k if the current score is better
    if average_score > best_score:
        best_score = average_score
        best_k = k

# Select features using the best k
mi_selector = SelectKBest(mutual_info_regression, k=best_k)
mi_selector.fit(X, y)

selected_mi_features = np.array(features)[mi_selector.get_support()]

Evaluating k = 60
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15300
[LightGBM] [Info] Total Bins 15300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15300
[LightGBM] [Info] Number of data points in the train set: 11674, number of used features: 60
[LightGBM] [Info] Start training from score 564.184079
[LightGBM] [Info] Number of data points in the

In [4]:
print(selected_mi_features)
print(best_k, best_score)

['u100' 'v100' 'fg10' 'u10' 'v10' 'u200' 'v200' 'd2m' 't2m' 'bld' 'cdir'
 'degm10l' 'dsrp' 'ewss' 'fdir' 'flsr' 'gwd' 'hwbt0' 'hwbt1' 'i10fg'
 'ishf' 'lblt' 'lmlt' 'lspf' 'ltlt' 'mld' 'mn2t' 'mx2t' 'nsss' 'parcs'
 'par' 'skt' 'slhf' 'sshf' 'ssrc' 'ssrdc' 'ssrd' 'ssr' 'sst' 'stl1' 'stl2'
 'stl3' 'stl4' 'strc' 'strdc' 'strd' 'str' 'sund' 'tcw' 'tcwv' 'tisr'
 'tsrc' 'tsr' 'ttrc' 'ttr' 'u10n' 'uvb' 'v10n' 'vimd' 'viwvn' 'ws200'
 'ws100' 'ws10']
63 -131951.70994668955


In [5]:
# Create new train and test data with selected features
train_data_selected = train_data[
    ["time", "lead_hour"] + selected_mi_features.tolist() + ["power"]
]
test_data_selected = test_data[
    ["time", "lead_hour"] + selected_mi_features.tolist() + ["power"]
]

# Ensure 'power' is non-zero, setting negative values to 0
train_data_selected["power"] = train_data_selected["power"].apply(lambda x: max(x, 0))
test_data_selected["power"] = test_data_selected["power"].apply(lambda x: max(x, 0))


print(train_data_selected.shape)
print(test_data_selected.shape)

(14592, 66)
(2880, 66)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected["power"] = train_data_selected["power"].apply(lambda x: max(x, 0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_selected["power"] = test_data_selected["power"].apply(lambda x: max(x, 0))


In [6]:
# Save the resulting DataFrames
train_data_selected.to_csv(data_dir + "train_farm_66.csv", index=False)
test_data_selected.to_csv(data_dir + "test_farm_66.csv", index=False)

### Feature Importance Selection