# ANOVA for continuous variable feature selection


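ANOVA (analysis of variance) compares the means of two or more groups. For feature selection, the ANOVA F-test checks whether the mean of a continuous feature differs significantly between the classes of the target.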

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_classif, f_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile



In [14]:
# Load the pulsar training data and strip leading/trailing whitespace from the
# column names so the columns can be addressed by their clean labels.
data = 'pulsar_data_train.csv'
df = pd.read_csv(data)
df.columns = df.columns.str.strip()
# Drop every row that contains a missing value.
df = df.dropna()

# Select the target by name and treat every other column as a feature, so the
# split does not silently depend on the target being the last column.
y_train = df["target_class"]
X_train = df.drop(columns=["target_class"])

print(X_train)


       Mean of the integrated profile  \
0                          121.156250   
1                           76.968750   
2                          130.585938   
4                           84.804688   
7                          109.406250   
...                               ...   
12522                      124.828125   
12523                      124.312500   
12525                      116.031250   
12526                      135.664062   
12527                      120.726562   

       Standard deviation of the integrated profile  \
0                                         48.372971   
1                                         36.175557   
2                                         53.229534   
4                                         36.117659   
7                                         55.912521   
...                                             ...   
12522                                     50.586731   
12523                                     53.179053   
12525       

# Initial data pruning.
As in the feature selection book, we will first remove any constant, quasi-constant, and correlated features from our feature set.


In [25]:
# Fit a variance filter with a 0.01 cut-off on the training features
# (fit() returns the fitted selector itself, so it can be chained).
const_feature = VarianceThreshold(threshold=0.01).fit(X_train, y_train)
X_train_new = const_feature.transform(X_train)

# Shapes before and after filtering, one per line.
print(X_train.shape, X_train_new.shape, sep="\n")

# Identical shapes mean the filter removed nothing: the dataset has no
# constant or quasi-constant features.


(9273, 8)
(9273, 8)


In [22]:
# duplicated() flags each ROW that exactly repeats an earlier row; a sum of 0
# means there are no duplicate rows.
#
# To look for duplicated FEATURES (columns) instead, transpose first, e.g.:
#   dup = X_train.T.duplicated()
#   features_to_keep = [name for name, is_dup in dup.items() if not is_dup]
# which yields the list of unique feature names to select from the dataframe.
duplicate_row_mask = X_train.duplicated()
duplicate_row_mask.sum()

0

### Applying the F-test : ANOVA


In [27]:
# ANOVA F-test of every retained feature against the class labels;
# f_classif returns the F-statistics and their p-values, in feature order.
anova_result = f_classif(X_train_new, y_train)
fscore, pvalue = anova_result
for statistic in (fscore, pvalue):
    print(statistic)

[ 7794.21864257  1454.21624065 15482.64597577  9148.07850795
  1841.09440076  2979.42155187  1666.61111384   663.47197755]
[0.00000000e+000 1.04699406e-295 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 0.00000000e+000 2.26971319e-141]


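Every feature is highly significant here (all p-values are effectively zero), so the ANOVA F-test cannot discriminate between features on this data set. We therefore switch to a new data set.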

In [31]:
# Read the Pima Indians diabetes CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="pima-indians-diabetes.csv")
df.head(n=5)

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Initial Pruning... Again


In [43]:
# Separate the target by name and treat every other column as a feature, so
# the split does not depend on "class" being the last column of the frame.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["class"]), df["class"], test_size=0.33, random_state=1
)

# Fit the variance filter (0.01 cut-off) on the training portion only, then
# apply it to the training features.
const_feature = VarianceThreshold(threshold=0.01)
const_feature.fit(X_train, y_train)
X_train_new = const_feature.transform(X_train)

# Shapes before and after filtering; equal shapes mean nothing was removed.
print(X_train.shape)
print(X_train_new.shape)

(514, 8)
(514, 8)


Again, no features are eliminated by the initial screening.


In [44]:
# ANOVA F-test of every retained feature against the class labels.
fscore, pvalue = f_classif(X_train_new, y_train)

# Label the p-values with the columns that actually survived the variance
# filter. get_support() is the boolean mask of kept features, so the labels
# stay aligned with X_train_new even if the filter drops a column (labelling
# with every feature column of df would then fail with a length mismatch).
kept_columns = X_train.columns[const_feature.get_support()]
pvalue = pd.Series(pvalue, index=kept_columns)

print(fscore)
# Smallest p-value first, i.e. the most significant feature at the top.
print(pvalue.sort_values(ascending=True))

[1.65273845e+01 1.31325562e+02 4.23711464e-02 1.41521551e+00
 1.27789661e+01 4.92095231e+01 1.33771423e+01 2.51264397e+01]
glucose                     3.189502e-27
bmi                         7.317218e-12
age                         7.409770e-07
pregnancies                 5.549782e-05
diabetespedigreefunction    2.810699e-04
insulin                     3.837293e-04
skinthickness               2.347439e-01
bloodpressure               8.369957e-01
dtype: float64


Now we write a KNN classifier to see whether the ANOVA analysis of the features yields better results when we keep only the k best features.