# Feature selection by filter method

In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold

# load data

In [None]:
# Read the full feature table, using the "group.cmp" column as the row index.
df = pd.read_csv("../processed_data/allfeatures.csv", index_col="group.cmp")
# Load the list of selected IDs.
used_id = joblib.load("../data/selected_1008id.pkl")
# Keep only the rows whose index value appears in the selected IDs.
is_selected_row = df.index.isin(used_id)
df = df.loc[is_selected_row]

## scaling

In [None]:
# Standardize every column of df; the fitted scaler stays available as `ss`.
ss = preprocessing.StandardScaler()
scaled_values = ss.fit_transform(df)
# Re-attach the original row index and column names to the scaled array.
X_scaled = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
# Persist both the unscaled and the scaled feature tables.
df.to_csv("../processed_data/X.csv")
X_scaled.to_csv("../processed_data/X_scaled.csv")

## feature selection: variance > 0

In [None]:
# Variance threshold; it is also embedded in the output file name further down.
th = 0
# fit() returns the fitted selector itself, so construction and fitting chain.
th_var = VarianceThreshold(threshold=th).fit(df)
# Boolean mask over the columns of df that the selector keeps.
kept_column_mask = th_var.get_support()
X_var_over0 = df.loc[:, kept_column_mask]

## Function to search for highly correlated variables

In [None]:
# https://github.com/hkaneko1985/variable_selection_and_clustering_based_on_r/blob/master/variable_selection_based_on_r.py
def search_highly_correlated_variables(x, threshold_of_r):
    """
    Search variables whose absolute correlation coefficient is at least threshold_of_r.

    Repeatedly takes the pair of variables with the largest absolute
    correlation. From that pair, the variable whose summed absolute correlation
    with all remaining variables is larger is marked for deletion (the second
    variable of the pair on a tie). The search stops once the largest remaining
    absolute correlation is below ``threshold_of_r`` or no non-zero correlation
    is left.

    Parameters
    ----------
    x : numpy.array or pandas.DataFrame
        The input data for which to find highly correlated features
        (rows are samples, columns are variables). All columns are expected
        to be numeric.
    threshold_of_r : float
        The threshold of the correlation coefficient to identify highly correlated features.

    Returns
    -------
    highly_correlated_variable_numbers : list of int
        The positional (0-based) column indices of variables that should be
        deleted, in the order they were selected. They are valid for
        ``x.iloc[:, ...]`` / ``x[:, ...]`` regardless of the column labels.
    """
    # Accept plain arrays as well as DataFrames.
    x = pd.DataFrame(x)

    # Work on a private, writable NumPy copy and address variables by position.
    # This keeps the result independent of column labels and avoids writing
    # through DataFrame.values, which may be read-only.
    r_in_x = x.corr().abs().to_numpy(dtype=float, copy=True)
    highly_correlated_variable_numbers = []
    if r_in_x.size == 0:
        return highly_correlated_variable_numbers

    np.fill_diagonal(r_in_x, 0)
    # Undefined correlations (NaN) are treated as "not correlated".
    r_in_x[np.isnan(r_in_x)] = 0

    while True:
        r_max = r_in_x.max()
        # The `r_max == 0` guard guarantees termination for thresholds <= 0.
        if r_max < threshold_of_r or r_max == 0:
            break
        # First column that attains the global maximum, and its partner row.
        variable_number_1 = int(r_in_x.max(axis=0).argmax())
        variable_number_2 = int(r_in_x[:, variable_number_1].argmax())
        r_sum_1 = r_in_x[:, variable_number_1].sum()
        r_sum_2 = r_in_x[:, variable_number_2].sum()

        delete_x_number = variable_number_1 if r_sum_1 > r_sum_2 else variable_number_2
        print(f"r_max: {r_max}, delete {x.columns[delete_x_number]}")
        highly_correlated_variable_numbers.append(delete_x_number)
        # Zero out the deleted variable so it is never selected again.
        r_in_x[:, delete_x_number] = 0
        r_in_x[delete_x_number, :] = 0

    return highly_correlated_variable_numbers

In [None]:
# Apply the correlation filter to the variance-filtered feature table.
train_x_both_tmp = X_var_over0.copy()
# Alternative threshold list left in the original cell: [0.99, 0.95, 0.9, 0.85, 0.8]
for COR in [0.99]:
    # Positional indices of the columns to delete at this threshold.
    highly_correlated_variable_nums = search_highly_correlated_variables(train_x_both_tmp, COR)
    dropped_columns = train_x_both_tmp.iloc[:, highly_correlated_variable_nums].columns
    # True for every column that is kept.
    low_correlated_variables_TF = ~train_x_both_tmp.columns.isin(dropped_columns)
    train_x_both_var_cor_tmp = train_x_both_tmp.loc[:, low_correlated_variables_TF]
    # Persist the surviving column names for this (variance, correlation) setting.
    kept_feature_names = list(train_x_both_var_cor_tmp.columns)
    joblib.dump(kept_feature_names,
                f"../processed_data/tsfresh_features_var_{th}_r_{COR}.pkl")
    # The next threshold (if any) starts from the already-reduced table.
    train_x_both_tmp = train_x_both_var_cor_tmp
    print(f"r<{COR}, features = {train_x_both_tmp.shape[1]}")