<a href="https://colab.research.google.com/github/Priyo-prog/Statistics-and-Data-Science/blob/main/Feature%20Selection%20Complete/Filter%20Methods/quasi_constant_removal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Quasi-Constant Removal**

Quasi-constants are the values in the feature where almost all the values are same

First remove the constant features ann then using sklearn VarianceThreshold remove the quasi constant

In [1]:
# Import the important libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Import the dataset from google drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
filename = "/content/drive/MyDrive/Data Science/Feature Selection/dataset_1.csv"

In [4]:
df = pd.read_csv(filename)
df.head(5)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_292,var_293,var_294,var_295,var_296,var_297,var_298,var_299,var_300,target
0,0,0,0.0,0.0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
1,0,0,0.0,3.0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
2,0,0,0.0,5.88,0.0,0,0,0,0,0,...,0.0,0,0,3,0,0,0,0.0,67772.7216,0
3,0,0,0.0,14.1,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
4,0,0,0.0,5.76,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0


In [6]:
# Separate the data in train, test split
X = df.drop(labels="target", axis=1)
y = df["target"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [8]:
# Remove the constants first before eleminating the quasi-constants
constant_features = [feat for feat in X_train.columns if X_train[feat].std() == 0]
len(constant_features)

34

In [9]:
X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)
X_train.shape, X_test.shape

((35000, 266), (15000, 266))

In [10]:
# Create an instance of VarianceThreshold
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train)

In [12]:
# get_support is a boolean vector that indicates which features
# are retained, that is, which features have a higher variance than
# the threshold we indicated.
sel.get_support()

array([False, False,  True,  True,  True,  True, False,  True, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True, False,  True,
        True, False,  True,  True,  True,  True,  True, False, False,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True,  True,

In [16]:
sum(sel.get_support())

215

In [18]:
quasi_constant = X_train.columns[~sel.get_support()]
quasi_constant, len(quasi_constant)

(Index(['var_1', 'var_2', 'var_7', 'var_9', 'var_10', 'var_19', 'var_28',
        'var_36', 'var_43', 'var_45', 'var_53', 'var_56', 'var_59', 'var_66',
        'var_67', 'var_69', 'var_71', 'var_104', 'var_106', 'var_116',
        'var_133', 'var_137', 'var_141', 'var_146', 'var_177', 'var_187',
        'var_189', 'var_194', 'var_197', 'var_198', 'var_202', 'var_218',
        'var_219', 'var_223', 'var_233', 'var_234', 'var_235', 'var_245',
        'var_247', 'var_249', 'var_250', 'var_251', 'var_256', 'var_260',
        'var_267', 'var_274', 'var_282', 'var_285', 'var_287', 'var_289',
        'var_298'],
       dtype='object'),
 51)

In [20]:
# percentage of observations showing each of the different values
# of the variable
X_train["var_1"].value_counts() / np.float(len(X_train))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_train["var_1"].value_counts() / np.float(len(X_train))


0    0.999629
3    0.000200
6    0.000143
9    0.000029
Name: var_1, dtype: float64

In [25]:
X_train["var_2"].value_counts() / np.float(len(X_train))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X_train["var_2"].value_counts() / np.float(len(X_train))


0    0.999971
1    0.000029
Name: var_2, dtype: float64

We can now remove the quasi-constant using the transform of VarianceThreshold.
But this will return a Numpy array. If we want the pandas dataframe then we need to reconstitute it

In [26]:
# capture the feature names first and then remove the quasi constant
feat_names = X_train.columns[sel.get_support()]

In [27]:
# Remove the quasi-constants from the datasets
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

In [39]:
X_train[0:5] # it has now returned a numpy Array

array([[0.00000e+00, 2.79000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [0.00000e+00, 2.97000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [0.00000e+00, 2.79000e+00, 8.54352e+04, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [0.00000e+00, 5.70000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [41]:
# Re-constitute the array to pandas dataframe
X_train = pd.DataFrame(X_train, columns=feat_names)
X_test = pd.DataFrame(X_test, columns=feat_names)

In [42]:
X_train.head(5)

Unnamed: 0,var_3,var_4,var_5,var_6,var_8,var_11,var_12,var_13,var_14,var_15,...,var_286,var_288,var_290,var_291,var_292,var_293,var_295,var_296,var_299,var_300
0,0.0,2.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,2.79,85435.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,5.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Calculating the quasi-constant manually without the sklearn

In [43]:
# Once again separate the train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=0)

In [55]:
# First remove the constant values
constant_values = [feat for feat in X_train.columns if X_train[feat].std() == 0]
len(constant_values)

34

In [56]:
# Remove the constant features
X_train = X_train.drop(labels=constant_values, axis=1)
X_test.drop(labels=constant_values, axis=1, inplace=True)

In [58]:
# create a list of quasi-constant feature
quasi_constant_features = []

for feature in X_train.columns:

  # find the predominant values, that is the value
  # that is shared the most by the features
  # Also avoiding the 'np.float' for deprecated warning
  predominant = (X_train[feature].value_counts() /
                 float(len(X_train))).sort_values(ascending=False).values[0]

  if predominant > 0.998:

    # if 'yes' then add it to the list
    quasi_constant_features.append(feature)

len(quasi_constant_features)

108

In [59]:
# Now drop the quasi-constant features
X_train.drop(labels=quasi_constant_features, axis=1, inplace=True)
X_test.drop(labels=quasi_constant_features, axis=1, inplace=True)