In [15]:
import pandas as pd
import numpy as np


data_dir = "/data3/lsf/Pein/Power-Prediction/data/"
file_name_common = 'farm_98_withTime.csv'
train_data = pd.read_csv(data_dir + f"train_{file_name_common}")
test_data = pd.read_csv(data_dir + f"test_{file_name_common}")

In [16]:
print(f'train_data shape: {train_data.shape}, test_data shape: {test_data.shape}')

train_data shape: (14592, 98), test_data shape: (2880, 98)


In [17]:
# Find the index of the 'power' column
power_col_index = train_data.columns.get_loc('power')

# Separate columns after 'power'
train_data_after_power = train_data.iloc[:, power_col_index + 1:]
test_data_after_power = test_data.iloc[:, power_col_index + 1:]

# Drop columns after 'power' in both train and test data
train_data = train_data.iloc[:, :power_col_index + 1]
test_data = test_data.iloc[:, :power_col_index + 1]

In [18]:
features = [
    col for col in train_data.columns if col not in ["time", "lead_hour", "power"]
]
X = train_data[features]
y = train_data["power"]

In [19]:
# Calculate the correlation matrix
corr_matrix = X.corr().abs()

# Set the threshold for correlation
threshold = 0.95

# Identify groups of features with high correlation
groups = {}
for i in range(len(corr_matrix.columns)):
    feature = corr_matrix.columns[i]
    if feature not in groups:
        groups[feature] = {feature}
    for j in range(i):
        if corr_matrix.iloc[i, j] > threshold:
            correlated_feature = corr_matrix.columns[j]
            print(f"{feature} and {correlated_feature} are highly correlated with value {corr_matrix.iloc[i, j]}")
            groups[feature].add(correlated_feature)
            groups[correlated_feature] = groups[feature]

u200 and u100 are highly correlated with value 0.994829574591168
ws10 and fg10 are highly correlated with value 0.9753618887508309
ws10 and flsr are highly correlated with value 0.9620513696418376
u10 and u100 are highly correlated with value 0.9914585818093155
u10 and u200 are highly correlated with value 0.9828139569086124
ws100 and ws10 are highly correlated with value 0.9632489030801169
ws100 and ws200 are highly correlated with value 0.9856607569397507
i10fg and fg10 are highly correlated with value 0.9890530880488151
i10fg and flsr are highly correlated with value 0.9589701782536405
i10fg and ws10 are highly correlated with value 0.9917406597777553
mlcape50 and mlcape100 are highly correlated with value 0.9669548077154803
u10n and u100 are highly correlated with value 0.9847691582599196
u10n and u200 are highly correlated with value 0.9758922721185141
u10n and u10 are highly correlated with value 0.9977510542481862
hwbt1 and hwbt0 are highly correlated with value 0.99082441962169

In [20]:
# Load feature importance data
feat_imp = pd.read_csv('/data3/lsf/Pein/Power-Prediction/feature_selection/farm_important_features.csv')

# Filter out features not in corr_matrix
feat_imp = feat_imp[feat_imp['Feature'].isin(corr_matrix.columns)]

# Sort features by importance
feat_imp_sorted = feat_imp.sort_values(by='Importance', ascending=False)

# Select the most important feature from each group
selected_features = set()
processed_features = set()

for _, row in feat_imp_sorted.iterrows():
    feature = row['Feature']
    if feature not in processed_features:
        selected_features.add(feature)
        # Mark all features in the same group as processed
        processed_features.update(groups[feature])

# Print selected features
print("Selected features:", selected_features)


Selected features: {'p3020', 'mgws', 'tcc', 'deg0l', 'sund', 'v200', 'blh', 'ssrd', 'totalx', 'trpp', 'tcw', 'bld', 'ishf', 'ilspf', 'viwve', 'ttr', 'mcc', 'vimd', 'slhf', 'lgws', 'mudlp', 'kx', 'tcsw', 'u100', 'azimuth', 'ttrc', 'dsrp', 'strd', 'nsss', 'hwbt0', 'altitude', 'd2m', 'mld', 'msl', 'capes', 'fg10', 'str', 'litoti', 'lspf', 'degm10l', 'cape', 'strdc', 'gwd', 'ewss', 'sst', 'strc', 'cdir', 'viwvn', 'hcc', 'ws200', 'lcc'}


In [21]:
# Ensure selected_features is a list
selected_features = list(selected_features)

# Create new DataFrames with the selected features before 'power'
train_data_selected = train_data[['time'] + selected_features + ['power']]
test_data_selected = test_data[['time'] + selected_features + ['power']]

print(f'shape of train_data_selected: {train_data_selected.shape}, shape of test_data_selected: {test_data_selected.shape}')

shape of train_data_selected: (14592, 53), shape of test_data_selected: (2880, 53)


In [22]:
# Concatenate the columns after 'power' back to the DataFrames
train_data_final = pd.concat([train_data_selected, train_data_after_power], axis=1)
test_data_final = pd.concat([test_data_selected, test_data_after_power], axis=1)

print(f'train_data_final shape: {train_data_final.shape}, test_data_final shape: {test_data_final.shape}')

train_data_final shape: (14592, 65), test_data_final shape: (2880, 65)


In [23]:
file_name_common = 'farm_65_withTime.csv'

# Export the new DataFrames to CSV files
train_data_final.to_csv(f'/data3/lsf/Pein/Power-Prediction/data/train_{file_name_common}', index=False)
test_data_final.to_csv(f'/data3/lsf/Pein/Power-Prediction/data/test_{file_name_common}', index=False)
