In [8]:
import pandas as pd
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import VarianceThreshold

In [3]:
# Read the processed candy dataset, report its dimensions, and preview the first rows.
processed_csv = '../data/processed/candy-data-processed.csv'
df = pd.read_csv(processed_csv)
print('data shape', df.shape)
df.head(n=5)

data shape (85, 13)


Unnamed: 0,CandyName,Chocolate,Fruity,Caramel,PeanutyAlmondy,Nougat,CrispedRiceWafer,Hard,Bar,PluriBus,SugarPercent,PricePercent,WinPercent
0,100 Grand,1,0,1,0,0,1,0,1,0,0.732,0.86,66.971725
1,3 Musketeers,1,0,0,0,1,0,0,1,0,0.604,0.511,67.602936
2,One dime,0,0,0,0,0,0,0,0,0,0.011,0.116,32.261086
3,One quarter,0,0,0,0,0,0,0,0,0,0.011,0.511,46.116505
4,Air Heads,0,1,0,0,0,0,0,0,0,0.906,0.511,52.341465


In [4]:
# Group the dataset's columns by role so later cells can refer to them by name.
Target = "WinPercent"  # column used as the regression target

# 0/1 flavor indicator columns
FlavorFeats = [
    "Chocolate",
    "Fruity",
    "Caramel",
    "PeanutyAlmondy",
    "Nougat",
    "CrispedRiceWafer",
]

# 0/1 form indicator columns
FormFeats = ["Hard", "Bar", "PluriBus"]

# Continuous-valued columns
ContinuousFeats = ["SugarPercent", "PricePercent"]

# Every predictor column, ordered flavor -> form -> continuous
Features = [*FlavorFeats, *FormFeats, *ContinuousFeats]

## Variance Threshold

Boolean features are Bernoulli random variables, and the variance of such variables is given by
$$\mathrm{Var}[X] = p(1 - p)$$
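so we can drop the boolean features that take the same value (0 or 1) in more than 80% of the samples by using the threshold $0.8 \times (1 - 0.8) = 0.16$.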

In [11]:
# Fit a variance filter on the boolean (flavor + form) columns and list the ones it keeps.
# The cut-off is the Bernoulli variance p * (1 - p) evaluated at p = 0.8.
var_sel = VarianceThreshold(threshold=0.8 * (1 - 0.8)).fit(df[FlavorFeats + FormFeats])
# get_support() is a boolean mask aligned with the fitted columns; use it to pick the kept names.
pd.Index(FlavorFeats + FormFeats)[var_sel.get_support()].tolist()

['Chocolate', 'Fruity', 'Bar', 'PluriBus']

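## P-value

Score each feature against the target with a univariate F-test and flag the features whose p-value is below 0.05.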

In [6]:
# Univariate F-test of every feature against the target, ordered by F-statistic
# (largest first), with a flag marking p-values below 0.05.
f_statistic, p_values = f_regression(df[Features], df[Target])
feature_p_score_df = (
    pd.DataFrame({'Feature': Features, 'F_statistic': f_statistic, 'p_value': p_values})
    .sort_values(by='F_statistic', ascending=False)
    .assign(p_value_significant=lambda scores: scores['p_value'] < 0.05)
)
feature_p_score_df

Unnamed: 0,Feature,F_statistic,p_value,p_value_significant
0,Chocolate,56.531812,5.859698e-11,True
7,Bar,18.820405,4.018635e-05,True
3,PeanutyAlmondy,16.400258,0.0001147208,True
1,Fruity,14.088953,0.0003222906,True
10,PricePercent,11.237827,0.001208829,True
5,CrispedRiceWafer,9.780646,0.002432112,True
6,Hard,8.848371,0.003840554,True
8,PluriBus,5.413605,0.02241804,True
9,SugarPercent,4.599871,0.0348979,True
2,Caramel,3.96076,0.04986443,True
