### ANOVA (F-test)

In [None]:
import pandas as pd
# Load data
aml = pd.read_csv('./cancer/aml/combined_data.csv', sep=",", on_bad_lines="skip")
# Transposer les données et mettre en forme le DataFrame
data = aml.transpose()
data.columns = data.iloc[0]
data = data[1:]
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)
print(data.head())

Unnamed: 0 X..100130426 X..100133144 X..100134869  X..26823 X..340602  \
0             -16.60964     1.074992    -0.510026 -0.510026  1.074992   
1             -16.60964     3.096162     3.226726 -0.360657  0.639334   
2             -16.60964    -16.60964     2.798177  0.682672 -16.60964   
3             -16.60964     2.061434     1.932594  1.298019  0.883003   
4             -16.60964     5.038085     5.083975 -0.313818  0.686173   

Unnamed: 0 X..388795 X..391343 X..391714 X..441362 X..652919  ... hsa.mir.942  \
0           2.659972 -16.60964  1.490062 -0.510026  2.075023  ...    4.788536   
1            2.63935 -16.60964  1.224293 -0.360657    6.8941  ...    6.017075   
2           3.004629 -16.60964 -0.902362 -16.60964  5.272259  ...    4.970429   
3            2.10538 -16.60964 -0.702022 -16.60964  5.881016  ...    5.223956   
4           0.686173  3.773639  2.686167 -16.60964  7.297051  ...    5.318328   

Unnamed: 0 hsa.mir.943 hsa.mir.944 hsa.mir.95 hsa.mir.96 hsa.mir.98  \
0  

In [7]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

# Load data
X = data.iloc[:, :-2]
y = data.iloc[:, -1]

# Use ANOVA F-test
k = 1000
f_selector = SelectKBest(score_func=f_classif, k=k)
X_selected = f_selector.fit_transform(X, y)

# Get the selected feature indices, F-scores and p-values
selected_indices = f_selector.get_support(indices=True)
f_scores = f_selector.scores_[selected_indices]
p_values = f_selector.pvalues_[selected_indices]

# Convert to DataFrame
selected_features = pd.DataFrame({
    "Feature_Index": selected_indices,
    "F_Score": f_scores,
    "P_Value": p_values
}).sort_values(by="F_Score", ascending=False)

# Visuliaze the top 5 important features
print("Top 5 important features based on F-test (ANOVA):")
print(selected_features.head())

Top 5 important features based on F-test (ANOVA):
     Feature_Index    F_Score   P_Value
634           7954  24.481149  0.000002
870          10197  19.873582  0.000016
520           6940  18.582996  0.000029
986          11490  18.468042  0.000030
779           9310  17.993534  0.000038


  f = msb / msw


In [8]:
selected_columns = X.columns[selected_indices]
X_selected_df = X[selected_columns]
final_df = pd.concat([X_selected_df, y], axis=1)
print("Final dataset:", final_df.head())

Final dataset:   X..100130426 X..391343 ABCA17P.650655 ACTL9.284382 ACY3.91703 ADAMTS3.9508  \
0    -16.60964 -16.60964      -16.60964    -16.60964    6.55614    -16.60964   
1    -16.60964 -16.60964       1.224293    -16.60964   7.952238     9.706789   
2    -16.60964 -16.60964      -16.60964    -16.60964   9.001615     2.267659   
3    -16.60964 -16.60964      -16.60964    -16.60964   6.406532    -16.60964   
4    -16.60964  3.773639       1.686169    -16.60964   5.112438     4.386612   

  AFAP1L1.134265   AGT.183 ALDH1A2.8854 ANGPTL3.27329  ... hsa.mir.598  \
0       1.490062 -0.510026       2.2974      0.490067  ...    3.110465   
1        7.50972   2.63935     1.639376       3.22432  ...    1.373224   
2      -16.60964 -16.60964     1.419652     -16.60964  ...    3.107934   
3       5.155988 -16.60964    -0.702022      2.467934  ...     3.80892   
4       4.078491 -0.313818    -16.60964      1.686169  ...    1.926013   

  hsa.mir.600 hsa.mir.656 hsa.mir.660 hsa.mir.665 hsa.mir.7

In [9]:
# Save the selected features to a CSV file
final_df.to_csv("./reduced_dataset/aml_anova.csv", index=False)

In [10]:
df = pd.read_csv("./reduced_dataset/aml_anova.csv")
print(df.head())

   X..100130426  X..391343  ABCA17P.650655  ACTL9.284382  ACY3.91703  \
0     -16.60964 -16.609640      -16.609640     -16.60964    6.556140   
1     -16.60964 -16.609640        1.224293     -16.60964    7.952238   
2     -16.60964 -16.609640      -16.609640     -16.60964    9.001615   
3     -16.60964 -16.609640      -16.609640     -16.60964    6.406532   
4     -16.60964   3.773639        1.686169     -16.60964    5.112438   

   ADAMTS3.9508  AFAP1L1.134265    AGT.183  ALDH1A2.8854  ANGPTL3.27329  ...  \
0    -16.609640        1.490062  -0.510026      2.297400       0.490067  ...   
1      9.706789        7.509720   2.639350      1.639376       3.224320  ...   
2      2.267659      -16.609640 -16.609640      1.419652     -16.609640  ...   
3    -16.609640        5.155988 -16.609640     -0.702022       2.467934  ...   
4      4.386612        4.078491  -0.313818    -16.609640       1.686169  ...   

   hsa.mir.598  hsa.mir.600  hsa.mir.656  hsa.mir.660  hsa.mir.665  \
0     3.110465  

# END