In [1]:
import pandas as pd
import numpy as np


data_dir = "/data3/lsf/Pein/Power-Prediction/data/"
file_name_common = 'farm_65_withTime.csv'
train_data = pd.read_csv(data_dir + f"train_{file_name_common}")
test_data = pd.read_csv(data_dir + f"test_{file_name_common}")

In [2]:
print(f'train_data shape: {train_data.shape}, test_data shape: {test_data.shape}')

train_data shape: (14592, 65), test_data shape: (2880, 65)


In [3]:
# Find the index of the 'power' column
power_col_index = train_data.columns.get_loc('power')

# Separate columns after 'power'
train_data_after_power = train_data.iloc[:, power_col_index + 1:]
test_data_after_power = test_data.iloc[:, power_col_index + 1:]

# Drop columns after 'power' in both train and test data
train_data = train_data.iloc[:, :power_col_index + 1]
test_data = test_data.iloc[:, :power_col_index + 1]

In [4]:
features = [
    col for col in train_data.columns if col not in ["time", "lead_hour", "power"]
]
X = train_data[features]
y = train_data["power"]

In [26]:
# Calculate the correlation matrix
corr_matrix = X.corr().abs()

# Set the threshold for correlation
threshold = 0.8

# Identify groups of features with high correlation
groups = {}
for i in range(len(corr_matrix.columns)):
    feature = corr_matrix.columns[i]
    if feature not in groups:
        groups[feature] = {feature}
    for j in range(i):
        if corr_matrix.iloc[i, j] > threshold:
            correlated_feature = corr_matrix.columns[j]
            print(f"{feature} and {correlated_feature} are highly correlated with value {corr_matrix.iloc[i, j]}")
            groups[feature].add(correlated_feature)
            groups[correlated_feature] = groups[feature]

slhf and ishf are highly correlated with value 0.8936591761972728
kx and tcw are highly correlated with value 0.8308641050047784
u100 and viwve are highly correlated with value 0.8098364902149452
dsrp and ssrd are highly correlated with value 0.919063091411602
strd and tcw are highly correlated with value 0.8458816551036464
hwbt0 and tcw are highly correlated with value 0.9090302271242071
altitude and ssrd are highly correlated with value 0.8086281593963618
mld and d2m are highly correlated with value 0.8478418432129948
msl and d2m are highly correlated with value 0.8787024188073399
cape and capes are highly correlated with value 0.8066370774486115
strdc and tcw are highly correlated with value 0.888548745123214
strdc and strd are highly correlated with value 0.8507278880825903
strdc and hwbt0 are highly correlated with value 0.8160927922664164
strdc and d2m are highly correlated with value 0.9411586699247788
strdc and mld are highly correlated with value 0.833853698810103
strdc and ms

In [27]:
# Load feature importance data
feat_imp = pd.read_csv('/data3/lsf/Pein/Power-Prediction/feature_selection/farm_important_features.csv')

# Filter out features not in corr_matrix
feat_imp = feat_imp[feat_imp['Feature'].isin(corr_matrix.columns)]

# Sort features by importance
feat_imp_sorted = feat_imp.sort_values(by='Importance', ascending=False)

# Select the most important feature from each group
selected_features = set()
processed_features = set()

for _, row in feat_imp_sorted.iterrows():
    feature = row['Feature']
    if feature not in processed_features:
        selected_features.add(feature)
        # Mark all features in the same group as processed
        processed_features.update(groups[feature])

# Print selected features
print("Selected features:", selected_features)


Selected features: {'p3020', 'lspf', 'litoti', 'viwvn', 'fg10', 'viwve', 'ishf', 'kx', 'strc', 'gwd', 'mcc', 'ttrc', 'totalx', 'ewss', 'deg0l', 'hcc', 'bld', 'tcc', 'altitude', 'degm10l', 'trpp', 'lcc', 'str', 'ttr', 'azimuth', 'mudlp', 'sst', 'mld', 'vimd', 'ilspf', 'tcsw', 'capes', 'lgws', 'mgws', 'dsrp', 'sund', 'blh'}


In [28]:
# Ensure selected_features is a list
selected_features = list(selected_features)

# Create new DataFrames with the selected features before 'power'
train_data_selected = train_data[['time'] + selected_features + ['power']]
test_data_selected = test_data[['time'] + selected_features + ['power']]

print(f'shape of train_data_selected: {train_data_selected.shape}, shape of test_data_selected: {test_data_selected.shape}')

shape of train_data_selected: (14592, 39), shape of test_data_selected: (2880, 39)


In [29]:
# Concatenate the columns after 'power' back to the DataFrames
train_data_final = pd.concat([train_data_selected, train_data_after_power], axis=1)
test_data_final = pd.concat([test_data_selected, test_data_after_power], axis=1)

print(f'train_data_final shape: {train_data_final.shape}, test_data_final shape: {test_data_final.shape}')

train_data_final shape: (14592, 51), test_data_final shape: (2880, 51)


In [30]:
file_name_common = 'farm_51_withTime.csv'

# Export the new DataFrames to CSV files
train_data_final.to_csv(f'/data3/lsf/Pein/Power-Prediction/data/train_{file_name_common}', index=False)
test_data_final.to_csv(f'/data3/lsf/Pein/Power-Prediction/data/test_{file_name_common}', index=False)
