import numpy as np
import pandas as pd
from scipy import sparse
from scipy.linalg import eigh
from ec_feature_selection.utils import check_array
from ec_feature_selection._ecfs_functions import get_fisher_score, get_mutual_information, build_kernel, build_sigma
from typing import Optional, Union
Data = Union[np.ndarray, pd.DataFrame, pd.Series, sparse.spmatrix]
class ECFS:
"""
    Feature Ranking and Selection via Eigenvector Centrality is a graph-based
    feature selection method that maps features onto an affinity graph and
    ranks them by their eigenvector centrality, identifying the most important ones.
References
    ----------
Based on the algorithm as introduced in:
Roffo, G. and Melzi, S., 2017, July. Ranking to learn: Feature ranking and selection via eigenvector centrality.
In New Frontiers in Mining Complex Patterns: 5th International Workshop,
NFMCP 2016, Held in Conjunction with ECML-PKDD 2016, Riva del Garda, Italy, September 19, 2016,
Revised Selected Papers (Vol. 10312, p. 19). Springer.
% @InProceedings{RoffoECML16,
% author={G. Roffo and S. Melzi},
% booktitle={Proceedings of New Frontiers in Mining Complex Patterns (NFMCP 2016)},
% title={Features Selection via Eigenvector Centrality},
% year={2016},
% keywords={Feature selection;ranking;high dimensionality;data mining},
% month={Oct}}
This Python implementation is inspired by the 'Feature Ranking and Selection via Eigenvector Centrality'
MATLAB implementation that can be found in the Feature Selection Library (FSLib) by Giorgio Roffo.
Many more Feature Selection methods are also available in the Feature Selection Library:
https://www.mathworks.com/matlabcentral/fileexchange/56937-feature-selection-library
Parameters
----------
    n_features : int or None (default=None)
        Number of features to select; must be > 0 and at most the number of
        original features (m_features).
        If n_features is set to None, all features are kept and a ranked
        dataset is returned.
    epsilon : int or float >= 0 (default=1e-5)
        A small number used to avoid division by zero.
Attributes
----------
n_features : int
Number of features to select.
    epsilon : float >= 0 (default=1e-5)
        A small number used to avoid division by zero.
    alpha : int or float ∈ [0, 1]
        Loading coefficient.
        The adjacency matrix A is given by: A = (alpha * kernel) + (1 − alpha) * sigma
    positive_class : int, float, or str (default=1)
        Label of the positive class.
    negative_class : int, float, or str (default=-1)
        Label of the negative class.
    fisher_score : numpy array, shape (m_features,)
        The Fisher score of each feature.
    mutual_information : numpy array, shape (m_features,)
        Mutual information between each feature of X and y.
    A : numpy array, shape (m_features, m_features)
        The adjacency matrix.
    eigenvalues : numpy array, shape (m_features,)
        The eigenvalues of the adjacency matrix.
    eigenvectors : numpy array, shape (m_features, m_features)
        The eigenvectors of the adjacency matrix.
    ranking : numpy array, shape (m_features,)
        Feature indices ordered by importance (the first entry is the index of
        the most important feature, the second of the 2nd most important, etc.).
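
    Examples
    --------
    A minimal, illustrative sketch on synthetic data (the shapes, labels, and
    alpha value below are arbitrary choices, not taken from the reference):

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> X = rng.rand(100, 10)
    >>> y = np.where(rng.rand(100) > 0.5, 1, -1)
    >>> ecfs = ECFS(n_features=5)
    >>> X_reduced = ecfs.fit_transform(X, y, alpha=0.5, positive_class=1,
    ...                                negative_class=-1)
    >>> X_reduced.shape
    (100, 5)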
"""
def __init__(self, n_features: Optional[int] = None, epsilon: float = 1e-5) -> None:
self.n_features = n_features
self.epsilon = epsilon
    def fit(self, X: Data, y: Data, alpha: Union[int, float], positive_class: Union[int, float, str],
            negative_class: Union[int, float, str]) -> "ECFS":
"""
Computes the feature ranking from the training data.
Parameters
----------
X : array-like, shape (n_samples, m_features)
Training data to compute the feature importance scores from.
y : array-like, shape (n_samples,)
Training labels.
        alpha : int or float ∈ [0, 1]
            Loading coefficient.
            The adjacency matrix A is given by: A = (alpha * kernel) + (1 − alpha) * sigma
positive_class : int, float, or str
Label of the positive class.
negative_class : int, float, or str
Label of the negative class.
Returns
-------
self : object
Returns the instance itself.
"""
X = check_array(X)
y = check_array(y)
assert X.shape[0] == y.shape[0], 'X and y should have the same number of samples. {} != {}'.format(X.shape[0],
y.shape[0])
self.positive_class = positive_class
self.negative_class = negative_class
self.alpha = alpha
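        # Supervised relevance of each feature: Fisher score and per-feature
        # mutual information with the labels; both feed into the kernel matrix.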
self.fisher_score = get_fisher_score(X=X, y=y, negative_class=self.negative_class,
positive_class=self.positive_class, epsilon=self.epsilon)
self.mutual_information = np.apply_along_axis(get_mutual_information, 0, X, y, self.epsilon)
self.kernel = build_kernel(self.fisher_score, self.mutual_information)
self.sigma = build_sigma(X)
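        # Adjacency matrix of the feature graph: a convex combination (weighted
        # by alpha) of the supervised kernel and the sigma matrix.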
self.A = self.alpha * self.kernel + (1 - self.alpha) * self.sigma
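        # Eigenvector centrality: the eigenvector associated with the largest
        # eigenvalue of A scores each feature; sorting the absolute loadings in
        # descending order yields the ranking.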
self.eigenvalues, self.eigenvectors = eigh(self.A)
        self.ranking = np.abs(self.eigenvectors[:, self.eigenvalues.argmax()]).argsort()[::-1]
        return self
def transform(self, X: Data) -> np.ndarray:
"""
Reduces the feature set down to the top n_features.
Parameters
----------
        X : array-like, shape (n_samples, m_features)
            Data to perform feature ranking and selection on.
        Returns
        -------
        X_ranked : array-like, shape (n_samples, n_top_features)
            Reduced feature matrix.
"""
        if self.n_features is not None and self.n_features > X.shape[1]:
            raise ValueError(
                'Number of features to select is higher than the original number of features. {} > {}'.format(
                    self.n_features, X.shape[1]))
        # Reorder columns by importance, then keep only the top n_features (if set).
        X_ranked = X[:, self.ranking]
        if self.n_features is not None:
            X_ranked = X_ranked[:, :self.n_features]
        return X_ranked
def fit_transform(self, X: Data, y: Data, alpha: Union[int, float], positive_class: Union[int, float, str],
negative_class: Union[int, float, str]) -> np.ndarray:
"""
Computes the feature ranking from the training data, then reduces
the feature set down to the top n_features.
Parameters
----------
X : array-like, shape (n_samples, m_features)
Training data to compute the feature importance scores from
and to perform feature ranking and selection on.
        y : array-like, shape (n_samples,)
            Training labels.
        alpha : int or float ∈ [0, 1]
            Loading coefficient.
            The adjacency matrix A is given by: A = (alpha * kernel) + (1 − alpha) * sigma
        positive_class : int, float, or str
            Label of the positive class.
        negative_class : int, float, or str
            Label of the negative class.
Returns
-------
        X_ranked : array-like, shape (n_samples, n_top_features)
            Reduced feature matrix.
"""
self.fit(X, y, alpha, positive_class, negative_class)
return self.transform(X)