<a href="https://colab.research.google.com/github/Santanukolkata/Data_Science/blob/master/Models/Preprocessing/Wrapper_SFS_wine_quality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
df_wine =pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",sep=';')
df_wine.head(2)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5


In [2]:
X=df_wine.drop(columns="quality")
Y=df_wine["quality"]
Y_bin=df_wine["quality"].apply(lambda x:1 if x >7 else 0 )
Y_bin.head(2)

0    0
1    0
Name: quality, dtype: int64

In [3]:
import statsmodels.api as sm
def forward_selection(data, target, significance_level=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(list(initial_features))>0):
        remaining_features = list(set(initial_features)-set(best_features))
        print(remaining_features)
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.Logit(target, data[best_features+[new_column]]).fit()
            new_pval[new_column] = model.pvalues[new_column]
        print(new_pval)
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
            print('best_features',best_features)
        else:
            break
    return best_features

print(forward_selection(X,Y_bin,0.3))

['chlorides', 'volatile acidity', 'free sulfur dioxide', 'sulphates', 'alcohol', 'density', 'total sulfur dioxide', 'fixed acidity', 'pH', 'citric acid', 'residual sugar']
Optimization terminated successfully.
         Current function value: 0.059044
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.063758
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.096286
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.075455
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.070783
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.061624
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.088837
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.067458
         Iterations 8
Optimization

In [0]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LogisticRegression  
# Sequential Forward Selection(sfs)
sfs = SFS(LogisticRegression(),
           k_features=11,
           forward=True,
           floating=False,
           scoring = 'accuracy',
          verbose=2,
           cv = 0)

In [13]:
sfs.fit(X, Y_bin)
sfs.k_feature_names_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.1s finished

[2020-03-20 04:45:03] Features: 1/11 -- score: 0.9887429643527205[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished

[2020-03-20 04:45:03] Features: 2/11 -- score: 0.9887429643527205[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.1s finished

[2020-03-20 04:45:03] Features: 3/11 -- score: 0.9887429643527205[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

('fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol')

In [14]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
sfs1 = SFS(clf,
           k_features=11,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy')
sfs1 = sfs1.fit(X, Y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:   11.3s finished

[2020-03-20 04:52:09] Features: 1/11 -- score: 0.5209247648902822[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   12.6s finished

[2020-03-20 04:52:22] Features: 2/11 -- score: 0.48656543887147335[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   11.0s finished

[2020-03-20 04:52:33] Features: 3/11 -- score: 0.5196806426332288[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 

In [0]:
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)
df_wine.columns[feat_cols]