In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Input locations: the feature table and the power (target) table are stored
# in separate CSV files for each split.
train_input_path = 'train_input_merge.csv'
train_power_farm_path = 'train_power_farm.csv'

# The test features must come from the merged-input file. Pointing this path at
# 'test_power_farm.csv' (as before) loaded the target table twice and left
# `test_input` without any feature columns.
test_input_path = 'test_input_merge.csv'
test_power_farm_path = 'test_power_farm.csv'

# Load the training split.
train_input = pd.read_csv(train_input_path)
train_power_farm = pd.read_csv(train_power_farm_path)

# Load the test split.
test_input = pd.read_csv(test_input_path)
test_power_farm = pd.read_csv(test_power_farm_path)

In [3]:
# Display the column index of the training inputs.
train_input.columns

Index(['initial_time', 'time', 'lead_hour', 'altitude', 'azimuth', 'u100',
       'v100', 'fg10', 'u10', 'v10',
       ...
       'uvb', 'v10n', 'vimd', 'p3020', 'viwve', 'viwvn', 'zust', 'ws200',
       'ws100', 'ws10'],
      dtype='object', length=131)

In [4]:
from sklearn.preprocessing import StandardScaler


def compute_vif_table(features):
    """Compute the variance inflation factor (VIF) of every column.

    Parameters
    ----------
    features : pandas.DataFrame
        Candidate predictors, one column per feature.

    Returns
    -------
    pandas.DataFrame
        Two columns, "Feature" (the column name) and "VIF", listed in the
        column order of `features`.
    """
    # Always prepend an intercept column. With the default has_constant='skip',
    # add_constant returns the data unchanged when it already contains a
    # constant column; the `position + 1` offset below would then be shifted by
    # one and run past the last column.
    design = sm.add_constant(features, has_constant='add').values
    vif_values = [
        variance_inflation_factor(design, position + 1)  # +1 skips the intercept
        for position in range(features.shape[1])
    ]
    return pd.DataFrame({"Feature": list(features.columns), "VIF": vif_values})


# Ignore the first 3 columns
X = train_input.iloc[:, 3:]

# Standardised copy of the features.
# NOTE(review): X_scaled is not consumed by the VIF loop below, which works on
# the raw X so that later steps still see the original feature scales. Confirm
# whether the scaled copy is still needed.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Iterative VIF selection: repeatedly drop the single feature with the highest
# VIF until every remaining feature is at or below the threshold.
vif_threshold = 50
max_vif = float('inf')
iteration = 1

while max_vif > vif_threshold:
    print(f"Iteration {iteration}:")
    vif_data = compute_vif_table(X)

    max_vif = vif_data["VIF"].max()
    print(vif_data)

    if max_vif > vif_threshold:
        feature_to_remove = vif_data.loc[vif_data["VIF"].idxmax(), "Feature"]
        print(f"Removing feature with highest VIF: {feature_to_remove} (VIF={max_vif})")
        X = X.drop(columns=[feature_to_remove])
    else:
        print(f"All features have VIF below the threshold of {vif_threshold}.")

    iteration += 1

# Final selected features
print("Final selected features:")
print(X.columns.tolist())

Iteration 1:
      Feature          VIF
0    altitude    10.168717
1     azimuth     2.101530
2        u100  1544.342219
3        v100  2970.205947
4        fg10    84.307642
..        ...          ...
123     viwvn    11.392286
124      zust   843.203633
125     ws200   317.243789
126     ws100  1108.036903
127      ws10   816.853848

[128 rows x 2 columns]
Removing feature with highest VIF: lblt (VIF=677071394.9670656)
Iteration 2:
      Feature          VIF
0    altitude    10.151434
1     azimuth     2.101048
2        u100  1543.813641
3        v100  2970.061382
4        fg10    84.096957
..        ...          ...
122     viwvn    11.392268
123      zust   843.093398
124     ws200   317.218837
125     ws100  1107.694582
126      ws10   816.604539

[127 rows x 2 columns]
Removing feature with highest VIF: lmlt (VIF=290696749.7014346)
Iteration 3:
      Feature          VIF
0    altitude    10.150900
1     azimuth     2.100685
2        u100  1543.774904
3        v100  2968.718596
4 

In [6]:
print(f"Final selected features: with shape {X.shape}")
print(X.columns.tolist())
# Save the surviving feature names to CSV under a single 'feat' column.
# The column must be named when the frame is built: passing columns=['feat']
# to to_csv only *selects* existing columns, and the unnamed frame has just
# the default column 0, so the previous call failed with a KeyError.
saved = pd.DataFrame(X.columns.tolist(), columns=['feat'])
saved.to_csv('1-vif_82_feat.csv', index=False)

Final selected features: with shape (14592, 82)
['altitude', 'azimuth', 'u200', 'd2m', 'bld', 'blh', 'cape', 'capes', 'chnk', 'cp', 'crr', 'csfr', 'deg0l', 'degm10l', 'dsrp', 'es', 'ewss', 'fal', 'fdir', 'flsr', 'fsr', 'fzra', 'gwd', 'hcc', 'hwbt1', 'iews', 'ilspf', 'inss', 'istl1', 'istl2', 'istl3', 'istl4', 'kx', 'lcc', 'lgws', 'licd', 'lict', 'litoti', 'lspf', 'lsp', 'lssfr', 'mcc', 'mgws', 'mlcape100', 'mlcape50', 'mld', 'mudlp', 'mxtpr', 'nsss', 'uoe', 'von', 'pev', 'totalx', 'ro', 'sf', 'slhf', 'smlt', 'sp', 'sro', 'sshf', 'ssro', 'str', 'sund', 'tcc', 'tciw', 'tclw', 'tco3', 'tcrw', 'tcslw', 'tcsw', 'tcwv', 'tisr', 'trpp', 'tsn', 'ttrc', 'ttr', 'v10n', 'vimd', 'p3020', 'viwve', 'viwvn', 'ws200']


In [25]:
from sklearn.feature_selection import VarianceThreshold

# Filter out low-variance features; 0.01 is a demonstration threshold.
variance_selector = VarianceThreshold(threshold=0.01).fit(X)
X_variance_selected = variance_selector.transform(X)

# Map the selector's boolean support mask back to column names.
kept_mask = variance_selector.get_support()
selected_features_after_variance = X.columns[kept_mask]

n_kept = len(selected_features_after_variance)
print("Features selected after applying VarianceThreshold:", n_kept)
print(selected_features_after_variance.tolist())

Features selected after applying VarianceThreshold: 52
['altitude', 'azimuth', 'u200', 'd2m', 'bld', 'blh', 'cape', 'capes', 'deg0l', 'degm10l', 'dsrp', 'ewss', 'fdir', 'flsr', 'gwd', 'hcc', 'hwbt1', 'ilspf', 'kx', 'lcc', 'lgws', 'litoti', 'lspf', 'mcc', 'mgws', 'mlcape100', 'mlcape50', 'mld', 'mudlp', 'nsss', 'uoe', 'von', 'totalx', 'slhf', 'sp', 'sshf', 'str', 'sund', 'tcc', 'tcrw', 'tcsw', 'tcwv', 'tisr', 'trpp', 'ttrc', 'ttr', 'v10n', 'vimd', 'p3020', 'viwve', 'viwvn', 'ws200']


In [29]:
# Write the variance-filtered feature names to CSV as a single 'feat' column.
saved = pd.DataFrame({'feat': selected_features_after_variance.tolist()})
saved.to_csv(path_or_buf='2-var_thres_52_feat.csv', index=False)

In [28]:
import pandas as pd
# Locations of the raw inputs/targets for each split ...
train_input_path = 'train_input_merge.csv'
train_power_farm_path = 'train_power_farm.csv'

test_input_path = 'test_input_merge.csv'
test_power_farm_path = 'test_power_farm.csv'

# ... and of the feature list written by the variance-threshold step.
selected_features_path = '2-var_thres_52_feat.csv'

# Read all four tables.
train_input = pd.read_csv(train_input_path)
train_power_farm = pd.read_csv(train_power_farm_path)
test_input = pd.read_csv(test_input_path)
test_power_farm = pd.read_csv(test_power_farm_path)

# Read the feature names and normalise them (trim whitespace, lower-case).
selected_features_df = pd.read_csv(selected_features_path)
selected_features = [
    name.strip().lower() for name in selected_features_df['feat'].tolist()
]

# Keep the 'time' and 'lead_hour' columns alongside the features so they are
# available for later checks.
selected_features.extend(['time', 'lead_hour'])


In [29]:
# Show the column index of the train inputs, then of the test inputs.
print(train_input.columns, test_input.columns, sep='\n')


Index(['initial_time', 'time', 'lead_hour', 'altitude', 'azimuth', 'u100',
       'v100', 'fg10', 'u10', 'v10',
       ...
       'uvb', 'v10n', 'vimd', 'p3020', 'viwve', 'viwvn', 'zust', 'ws200',
       'ws100', 'ws10'],
      dtype='object', length=131)
Index(['initial_time', 'time', 'lead_hour', 'altitude', 'azimuth', 'u100',
       'v100', 'fg10', 'u10', 'v10',
       ...
       'uvb', 'v10n', 'vimd', 'p3020', 'viwve', 'viwvn', 'zust', 'ws200',
       'ws100', 'ws10'],
      dtype='object', length=131)


In [31]:
# Restrict both input tables to the selected columns.
train_input_filtered = train_input.loc[:, selected_features]
test_input_filtered = test_input.loc[:, selected_features]

# Report the filtered shapes as a sanity check.
print("Filtered train_input shape:", train_input_filtered.shape)
print("Filtered test_input shape:", test_input_filtered.shape)



# Training design matrix (selected columns) and training target ('power').
train_X = train_input.loc[:, selected_features]
train_y = train_power_farm['power']

Filtered train_input shape: (14592, 54)
Filtered test_input shape: (2880, 54)


In [32]:

# Test design matrix (selected columns) and test target ('power').
test_X = test_input.loc[:, selected_features]
test_y = test_power_farm['power']

# Compare the 'time' column of the training inputs with that of the training targets.
train_time_identical = train_input.loc[:, 'time'].equals(train_power_farm.loc[:, 'time'])
print(f"Are 'time' columns identical in the training data? {train_time_identical}")


# Same comparison for the test inputs and test targets.
test_time_identical = test_input.loc[:, 'time'].equals(test_power_farm.loc[:, 'time'])
print(f"Are 'time' columns identical in the testing data? {test_time_identical}")


Are 'time' columns identical in the training data? True
Are 'time' columns identical in the testing data? True


In [34]:

# Place each split's target column next to its selected features.
train_merged = pd.concat((train_X, train_y), axis='columns')
test_merged = pd.concat((test_X, test_y), axis='columns')

print("Merged training and testing data with selected features:")
print(train_merged.shape, test_merged.shape, sep='\n')

Merged training and testing data with selected features:
(14592, 55)
(2880, 55)


In [35]:
# Write the merged train/test tables to CSV without the index column.
train_merged.to_csv(path_or_buf='3-train_merged_55.csv', index=False)
test_merged.to_csv(path_or_buf='3-test_merged_55.csv', index=False)

## Prepare the full data

In [2]:
import pandas as pd

# Source files for the unfiltered (all-column) dataset.
train_input_path = 'train_input_merge.csv'
train_power_farm_path = 'train_power_farm.csv'

test_input_path = 'test_input_merge.csv'
test_power_farm_path = 'test_power_farm.csv'

# Read inputs and targets for each split.
train_X = pd.read_csv(train_input_path)
train_y = pd.read_csv(train_power_farm_path)

test_X = pd.read_csv(test_input_path)
test_y = pd.read_csv(test_power_farm_path)

# Append the 'power' column of the target table to the full input table.
train = pd.concat((train_X, train_y['power']), axis='columns')
test = pd.concat((test_X, test_y['power']), axis='columns')

print(f'shape of train is {train.shape}, shape of test is {test.shape}')

shape of train is (14592, 132), shape of test is (2880, 132)


In [3]:
# Write the full train/test tables to CSV without the index column.
train.to_csv(path_or_buf='3-train_full_132.csv', index=False)
test.to_csv(path_or_buf='3-test_full_132.csv', index=False)

## Tree Selection
Continue in the next Jupyter notebook.