In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest,  mutual_info_regression
from sklearn.model_selection import train_test_split
import pickle

# Display options: lift pandas' row and column limits so printed frames
# are shown in full.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Input location and the target columns that features are selected for.
pickle_file_path = './data/train_df.pickle'
mean_columns = [
    'X4_mean', 'X11_mean', 'X18_mean',
    'X50_mean', 'X26_mean', 'X3112_mean',
]

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(pickle_file_path, 'rb') as pickle_handle:
    df = pickle.load(pickle_handle)

# Candidate features: every column except the first one and the last 18.
# NOTE(review): an earlier comment here stated that the last 6 columns are
# the targets, which does not match the -18 slice bound -- confirm the
# column layout of the pickled frame.
column_names = df.columns[1:-18].tolist()
X = df[column_names]

# Number of features kept per target: half of the candidate columns.
_, _candidate_count = X.shape
num_select = _candidate_count // 2

# Per-feature counter (starts at zero) and an initially column-less frame,
# indexed by feature name, that collects the raw scores per target.
feature_scores = dict.fromkeys(column_names, 0)
feature_scores_df = pd.DataFrame(index=column_names)

# mutual_info_regression is a randomized estimator (see its `random_state`
# parameter). Without a fixed seed the scores, and therefore the selected
# features, can differ between runs of this cell, so pin the seed.
MI_RANDOM_STATE = 42


def _seeded_mutual_info(features, target_values):
    """Return mutual-information scores computed with a fixed random seed.

    Has the (X, y) -> scores signature that SelectKBest expects from
    ``score_func``.
    """
    return mutual_info_regression(
        features, target_values, random_state=MI_RANDOM_STATE
    )


for target in mean_columns:
    print(f"Selecting features for target: {target}")

    y = df[target]
    # Keep the `num_select` highest-scoring features for this target.
    selector = SelectKBest(score_func=_seeded_mutual_info, k=num_select)
    selector.fit(X, y)

    # Names of the features the selector kept for this target.
    selected_features = X.columns[selector.get_support()]

    # Each selected feature earns one vote per target that selected it.
    for feature in selected_features:
        feature_scores[feature] += 1

    # Store the raw score of every candidate feature (not only the selected
    # ones) in a column named after the target.
    feature_scores_df[target] = pd.Series(selector.scores_, index=X.columns)
    
# Show the raw scores, one column per target. Scores are stored for every
# candidate feature, so fillna(0) only matters if a score is missing.
print("\nFeature scores for each target:")
_scores_for_display = feature_scores_df.fillna(0)
print(_scores_for_display)

# Rank features by how many targets selected them, highest count first.
# sorted() is stable, so features with equal counts keep their original order.
sorted_features = sorted(
    feature_scores.items(),
    key=lambda name_and_votes: name_and_votes[1],
    reverse=True,
)

# Keep the better-ranked half of the features.
_half_count = len(sorted_features) // 2
_top_half = sorted_features[:_half_count]
top_features = [name for name, _votes in _top_half]

# Print the kept features together with their vote counts.
top_features_df = pd.DataFrame(_top_half, columns=['Feature', 'Score'])
top_features_df = top_features_df.set_index('Feature')
print("\nTop features selected across all targets:")
print(top_features_df)

# Persist the list of kept feature names.
selected_features_pickle_path = './data/selected_features_list.pickle'
with open(selected_features_pickle_path, 'wb') as out_file:
    pickle.dump(top_features, out_file)




Selecting features for target: X4_mean
Selecting features for target: X11_mean
Selecting features for target: X18_mean
