In [1]:
# Install the Library
#pip install collinearity==0.6.1

Collecting collinearity==0.6.1
  Downloading collinearity-0.6.1.tar.gz (5.0 kB)
Building wheels for collected packages: collinearity
  Building wheel for collinearity (setup.py): started
  Building wheel for collinearity (setup.py): finished with status 'done'
  Created wheel for collinearity: filename=collinearity-0.6.1-py3-none-any.whl size=4466 sha256=33e61a1a6649297b82355ae2ee644538e972ab0123f7aad6bc49654892fe8d07
  Stored in directory: c:\users\ujwal\appdata\local\pip\cache\wheels\4e\0f\c8\efd9c4ae0abdb392be9b9ce5a38b41ccd2d519b5367ecce570
Successfully built collinearity
Installing collected packages: collinearity
Successfully installed collinearity-0.6.1



## Objects in collinearity
The main object is SelectNonCollinear. It can be imported this way:

from collinearity import SelectNonCollinear
collinearity.SelectNonCollinear(correlation_threshold=0.4, scoring=f_classif)

Parameters:

correlation_threshold : float (between 0 and 1), default = 0.4

Only those features that produce a correlation matrix with off-diagonal elements that are, in absolute value, less than this threshold will be chosen.

scoring : callable, default=f_classif

The scoring function for supervised problems. It must be a function of the kind accepted by sklearn.feature_selection.SelectKBest (for example, f_classif or f_regression).

## Methods in collinearity
This object supports the main methods of scikit-learn Estimators:

fit(X,y=None)

Identifies the features to consider. For supervised problems, y is the target array and the algorithm is:

Sort the features by scoring descending
Take the most important feature (i.e. the first feature)
Take the next feature if it shows a linear correlation coefficient with the already selected feature that is, in absolute value, lower than the threshold
Keep adding features as long as the correlation constraint holds
For unsupervised problems, we have y = None and the algorithm is:

Take the pair of features that has the lowest absolute value of the linear correlation coefficient.
If it's lower than the threshold, consider these features
Keep adding features as long as the correlation matrix doesn't show off-diagonal elements whose absolute value is greater than the threshold.
transform(X)

Selects the features according to the result of fit. It must be called after fit.

fit_transform(X,y=None)

Calls fit and then transform

get_support()

Returns an array of True and False of size X.shape[1]. A feature is selected if the value on this array corresponding to its index is True, otherwise it's not selected.

## Examples

In [2]:
from collinearity import SelectNonCollinear

In [3]:
import numpy as np
from collinearity import SelectNonCollinear
from sklearn.datasets import load_diabetes
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Load the diabetes dataset as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)

## Unsupervised problem 

In [5]:
# Build the selector with a correlation threshold of 0.3.
# No scoring function is needed here because no target is used.
selector = SelectNonCollinear(correlation_threshold=0.3)

In [None]:
# Applying selection to the features and calculating the correlation matrix

In [6]:
# Fit the selector without a target, keep only the selected columns and
# show their correlation matrix (columns are the variables, hence rowvar=False).
# No off-diagonal element is greater than the threshold.
X_unsupervised = selector.fit_transform(X)
np.corrcoef(X_unsupervised, rowvar=False)

array([[1.        , 0.1737371 , 0.18508467, 0.26006082],
       [0.1737371 , 1.        , 0.0881614 , 0.03527682],
       [0.18508467, 0.0881614 , 1.        , 0.24977742],
       [0.26006082, 0.03527682, 0.24977742, 1.        ]])

## Supervised problems

In [7]:
# For a supervised problem the 'scoring' argument must be set in the constructor.
# The target here is continuous, so f_regression is passed instead of the
# default f_classif.
selector = SelectNonCollinear(
    correlation_threshold=0.4,
    scoring=f_regression,
)
selector.fit(X, y)


In [8]:
# Keep only the columns chosen by the fitted selector and show their
# correlation matrix (columns are the variables, hence rowvar=False).
# Again, no off-diagonal element is greater than the threshold in absolute value.
X_supervised = selector.transform(X)
np.corrcoef(X_supervised, rowvar=False)

array([[1.        , 0.1737371 , 0.18508467, 0.26006082],
       [0.1737371 , 1.        , 0.0881614 , 0.03527682],
       [0.18508467, 0.0881614 , 1.        , 0.24977742],
       [0.26006082, 0.03527682, 0.24977742, 1.        ]])

## Use in pipelines

It's possible to use 'SelectNonCollinear' inside a pipeline, if necessary.

In [None]:
# Chain the collinearity filter with a linear regression so that the feature
# selection step and the model form a single estimator.
pipeline = make_pipeline(
    SelectNonCollinear(correlation_threshold=0.4, scoring=f_regression),
    LinearRegression(),
)