In [None]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

In [None]:
import pandas as pd

In [None]:
# Read the first 1000 rows of the semicolon-separated wine CSV
wine_data = pd.read_csv(
    'wine data.csv',
    sep=';',
    nrows=1000,
)

# Preview the first rows of the loaded frame
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,74,7,0,19,76,11,34,9978,351,56,94,5
1,78,88,0,26,98,25,67,9968,32,68,98,5
2,78,76,4,23,92,15,54,997,326,65,98,5
3,112,28,56,19,75,17,60,998,316,58,98,6
4,74,7,0,19,76,11,34,9978,351,56,94,5


In [None]:
# Predictors are every column except 'quality'; 'quality' itself is the target
x = wine_data.drop(columns='quality')
y = wine_data['quality']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit the scaler on the training split only, so no test-set statistics are used
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
 #fit classifier
rf_clf = RandomForestClassifier(n_estimators=10, random_state=42)
rf_clf.fit(X_train, y_train)

# Mean accuracy of the all-features model on the test split
rf_clf.score(X_test, y_test)

0.6633333333333333

In [None]:
#permutation feature importance
#classifier + features + labels -> prints the importance of every feature

def feature_importance(clf, X, y, top_limit=None, feature_names=None,
                       n_repeats=50, random_state=42):
    """Print features ranked by permutation importance, most important first.

    Parameters
    ----------
    clf : fitted estimator
        Passed unchanged to ``permutation_importance``.
    X : array-like
        Feature matrix whose columns are permuted.
    y : array-like
        Labels matching the rows of ``X``.
    top_limit : int, optional
        Print only this many top-ranked features. ``None`` prints all of them.
    feature_names : sequence of str, optional
        One name per column of ``X``. When omitted, ``X.columns`` is used if
        ``X`` has that attribute; otherwise the notebook-level frame ``x`` is
        consulted, which keeps existing calls with scaled arrays working.
    n_repeats : int, default 50
        Number of permutations per feature.
    random_state : int, default 42
        Seed forwarded to ``permutation_importance``.

    Returns
    -------
    None
        The ranking is only printed, so a cell ending in this call shows no
        extra output value.
    """
    bunch = permutation_importance(
        clf, X, y, n_repeats=n_repeats, random_state=random_state
    )
    imp_means = bunch.importances_mean
    imp_stds = bunch.importances_std

    if feature_names is None:
        # Scaled arrays carry no column labels, so fall back to the global frame.
        feature_names = X.columns if hasattr(X, "columns") else x.columns

    # Column indices ordered from highest to lowest mean importance
    ranked = np.argsort(imp_means)[::-1]
    if top_limit is not None:
        # A negative limit prints nothing, exactly as range(top_limit) did before.
        ranked = ranked[:max(top_limit, 0)]

    for i in ranked:
        name = feature_names[i]
        imp_score = imp_means[i]
        imp_std = imp_stds[i]
        print(f"Feature {name} with index {i} has an average importance score of {imp_score: .3f} +/- {imp_std: .3f}\n")

In [None]:
# Print the built-in documentation of permutation_importance (reference lookup only)
help(permutation_importance)

Help on function permutation_importance in module sklearn.inspection._permutation_importance:

permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, n_jobs=None, random_state=None, sample_weight=None, max_samples=1.0)
    Permutation importance for feature evaluation [BRE]_.
    
    The :term:`estimator` is required to be a fitted estimator. `X` can be the
    data set used to train the estimator or a hold-out set. The permutation
    importance of a feature is calculated as follows. First, a baseline metric,
    defined by :term:`scoring`, is evaluated on a (potentially different)
    dataset defined by the `X`. Next, a feature column from the validation set
    is permuted and the metric is evaluated again. The permutation importance
    is defined to be the difference between the baseline metric and metric from
    permutating the feature column.
    
    Read more in the :ref:`User Guide <permutation_importance>`.
    
    Parameters
    ----------
    estimator : 

In [None]:
# Rank every feature by permutation importance measured on the training split
feature_importance(clf=rf_clf, X=X_train, y=y_train)

Feature alcohol with index 10 has an average importance score of  0.245 +/-  0.012

Feature volatile acidity with index 1 has an average importance score of  0.159 +/-  0.011

Feature total sulfur dioxide with index 6 has an average importance score of  0.143 +/-  0.011

Feature sulphates with index 9 has an average importance score of  0.127 +/-  0.011

Feature density with index 7 has an average importance score of  0.071 +/-  0.006

Feature citric acid with index 2 has an average importance score of  0.061 +/-  0.006

Feature chlorides with index 4 has an average importance score of  0.059 +/-  0.007

Feature free sulfur dioxide with index 5 has an average importance score of  0.048 +/-  0.007

Feature pH with index 8 has an average importance score of  0.048 +/-  0.006

Feature fixed acidity with index 0 has an average importance score of  0.047 +/-  0.006

Feature residual sugar with index 3 has an average importance score of  0.035 +/-  0.005



In [None]:
# Same ranking, this time measured on the held-out test split
feature_importance(clf=rf_clf, X=X_test, y=y_test)

Feature alcohol with index 10 has an average importance score of  0.133 +/-  0.022

Feature total sulfur dioxide with index 6 has an average importance score of  0.078 +/-  0.016

Feature sulphates with index 9 has an average importance score of  0.070 +/-  0.015

Feature volatile acidity with index 1 has an average importance score of  0.065 +/-  0.014

Feature chlorides with index 4 has an average importance score of  0.036 +/-  0.012

Feature pH with index 8 has an average importance score of  0.028 +/-  0.013

Feature citric acid with index 2 has an average importance score of  0.021 +/-  0.012

Feature free sulfur dioxide with index 5 has an average importance score of  0.020 +/-  0.013

Feature density with index 7 has an average importance score of  0.017 +/-  0.013

Feature fixed acidity with index 0 has an average importance score of  0.006 +/-  0.012

Feature residual sugar with index 3 has an average importance score of  0.002 +/-  0.011



In [None]:
# Show the four highest-ranked features on each split (nothing is retrained here)
for heading, X_part, y_part in (
    ("On train split", X_train, y_train),
    ("On test split", X_test, y_test),
):
    print(heading)
    feature_importance(rf_clf, X_part, y_part, top_limit=4)

On train split
Feature alcohol with index 10 has an average importance score of  0.245 +/-  0.012

Feature volatile acidity with index 1 has an average importance score of  0.159 +/-  0.011

Feature total sulfur dioxide with index 6 has an average importance score of  0.143 +/-  0.011

Feature sulphates with index 9 has an average importance score of  0.127 +/-  0.011

On test split
Feature alcohol with index 10 has an average importance score of  0.133 +/-  0.022

Feature total sulfur dioxide with index 6 has an average importance score of  0.078 +/-  0.016

Feature sulphates with index 9 has an average importance score of  0.070 +/-  0.015

Feature volatile acidity with index 1 has an average importance score of  0.065 +/-  0.014



In [None]:
# Keep only the four highest-ranked columns (the same set on both splits):
# 1 = volatile acidity, 6 = total sulfur dioxide, 9 = sulphates, 10 = alcohol
X_train_top_features, X_test_top_features = (
    split[:, [1, 6, 9, 10]] for split in (X_train, X_test)
)


In [None]:
# Refit the same forest configuration using only the four selected columns
rf_clf = RandomForestClassifier(
    n_estimators=10,
    random_state=42,
).fit(X=X_train_top_features, y=y_train)

In [None]:
# Mean accuracy of the four-feature model on the test split
rf_clf.score(X=X_test_top_features, y=y_test)

0.6533333333333333

In [None]:
# Keep three columns: 6 = total sulfur dioxide, 9 = sulphates, 10 = alcohol.
# NOTE(review): this is the top three of the test-split ranking; the train-split
# ranking puts volatile acidity (1) ahead of sulphates (9). Confirm this is intended.
X_train_top3_features, X_test_top3_features = (
    split[:, [6, 9, 10]] for split in (X_train, X_test)
)

In [None]:
# Refit the same forest configuration using only the three selected columns
rf_clf = RandomForestClassifier(
    n_estimators=10,
    random_state=42,
).fit(X=X_train_top3_features, y=y_train)

In [None]:
# Mean accuracy of the three-feature model on the test split
rf_clf.score(X=X_test_top3_features, y=y_test)

0.6633333333333333

In [None]:
# Keep two columns: 6 = total sulfur dioxide, 10 = alcohol
# (the two highest-ranked features on the test split)
X_train_top2_features, X_test_top2_features = (
    split[:, [6, 10]] for split in (X_train, X_test)
)

# Refit the same forest configuration on just those two columns
rf_clf = RandomForestClassifier(
    n_estimators=10,
    random_state=42,
).fit(X=X_train_top2_features, y=y_train)

# Mean accuracy of the two-feature model on the test split
rf_clf.score(X=X_test_top2_features, y=y_test)

0.6