In [1]:
# Required libraries 
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

### 1. Identify columns that contain a single value <a class="anchor" id="section1"></a>

In [2]:
# Function 1
def loadFile(text_file):
    '''Read comma-separated numeric text into a NumPy array.

    Parameters
    ----------
    text_file : object
        Whatever ``np.loadtxt`` accepts as its first argument (for example a
        file name or an open text stream); it is passed through untouched.

    Returns
    -------
    numpy.ndarray
        The values parsed by ``np.loadtxt`` using ``','`` as the delimiter
        and its defaults for everything else.
    '''
    parsed = np.loadtxt(text_file, delimiter=',')
    return parsed

# Function 2 
def colValTotal(data):
    '''Print ``{index: count}`` for every column of a 2-D array.

    ``count`` is the number of distinct values found in that column. All
    entries are written on a single line, each followed by one space, and
    nothing is returned.
    '''
    n_columns = data.shape[1]
    for col_idx in range(n_columns):
        distinct_values = np.unique(data[:, col_idx])
        print('{%i: %i}' % (col_idx, len(distinct_values)), end=' ')
        
# Function 3
def colValTotalLow(data, max_unique=5):
    '''Report the columns that hold only a handful of distinct values.

    For each column of ``data`` the number of distinct values is counted;
    a line with the column index and that count is printed when the count
    is less than or equal to ``max_unique``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; columns are read as ``data[:, i]``.
    max_unique : int, optional
        Inclusive upper bound on the number of distinct values a column may
        have to be reported. Defaults to 5, the cutoff that used to be
        hard-coded.

    Returns
    -------
    None
        The result is printed, one line per reported column.
    '''
    for col_idx in range(data.shape[1]):
        n_unique = len(np.unique(data[:, col_idx]))
        # Inclusive comparison: a column with exactly `max_unique` values counts.
        if n_unique <= max_unique:
            print('> Column index: %i Value(s) count: %i' % (col_idx, n_unique))

### 2. Identify columns with very few unique values

In [3]:
# Function 4
def colValPercentages(data):
    '''Print, per column, the distinct-value count and its share of the rows.

    Each column produces ``{index: count, percent%}`` where ``count`` is the
    number of distinct values in the column and ``percent`` is
    ``count / number_of_rows * 100`` shown with one decimal place. All
    entries are written on a single line, each followed by one space.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; columns are read as ``data[:, i]``.

    Returns
    -------
    None

    Raises
    ------
    ZeroDivisionError
        If ``data`` has at least one column but zero rows.
    '''
    n_rows = data.shape[0]
    for col_idx in range(data.shape[1]):
        # Count the distinct values once and reuse the result for both fields.
        n_unique = len(np.unique(data[:, col_idx]))
        percentage = float(n_unique) / n_rows * 100
        print('{%i: %i, %.1f%%}' % (col_idx, n_unique, percentage), end=' ')
        
# Function 5
def colValPercentagesLow(data, max_percent=5):
    '''Print the columns whose distinct values are a small share of the rows.

    For each column the percentage ``distinct_count / number_of_rows * 100``
    is computed. When it is less than or equal to ``max_percent`` the entry
    ``{index: percent%}`` is printed (one decimal place); the distinct count
    itself is not part of the output. All entries are written on a single
    line, each followed by one space.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; columns are read as ``data[:, i]``.
    max_percent : float, optional
        Inclusive upper bound on the percentage for a column to be
        reported. Defaults to 5, the cutoff that used to be hard-coded.

    Returns
    -------
    None

    Raises
    ------
    ZeroDivisionError
        If ``data`` has at least one column but zero rows.
    '''
    n_rows = data.shape[0]
    for col_idx in range(data.shape[1]):
        percentage = float(len(np.unique(data[:, col_idx]))) / n_rows * 100
        # Inclusive comparison: a column sitting exactly on the cutoff is shown.
        if percentage <= max_percent:
            print('{%i: %.1f%%}' % (col_idx, percentage), end=' ')

### 3. Identify columns with low variance

In [4]:
# Function 6
def varianceTransform(data, X, list):
    '''
    Counts the features that survive VarianceThreshold at several thresholds

    Thresholds from 0.0 up to (but excluding) 0.55 in steps of 0.05 are
    generated with np.arange. For each threshold a VarianceThreshold
    transform is built and applied to X with fit_transform; the number of
    columns in the transformed result is printed and appended to the
    supplied list (e.g. for plotting a graph later on).

    Parameters
    ----------
    data : object
        Unused. The argument is kept so existing calls keep working; the
        transformed array is held in a local variable instead.
    X : array-like
        the input data handed to VarianceThreshold.fit_transform
    list : list
        a list (typically empty) that receives one feature count per
        threshold, in threshold order. It is mutated in place. The name
        shadows the built-in but is kept for keyword-argument compatibility.

    Returns
    -------
    None
    '''
    thresholds = np.arange(0.0, 0.55, 0.05)
    for threshold in thresholds:
        transform = VarianceThreshold(threshold=threshold)
        transformed = transform.fit_transform(X)
        num_of_features = transformed.shape[1]
        print('> Threshold=%.2f, Number of features=%d' % (threshold, num_of_features))
        # Previously appended the undefined name `number_of_features` (NameError).
        list.append(num_of_features)

### 4. Visualisation

In [5]:
# Function 7
def duplicateRows(dataframe):
    '''Return a Styler that gives every duplicated row a yellow background.

    A row counts as duplicated when ``dataframe.duplicated(keep=False)``
    flags it, i.e. all members of a group of identical rows are marked,
    not only the repeats. Rows are matched by index label.
    '''
    is_duplicate = dataframe.duplicated(keep=False)
    duplicate_labels = is_duplicate[is_duplicate].index.values

    def _row_css(row):
        # One CSS entry per cell of the row; empty string means "no styling".
        css = 'background: yellow' if row.name in duplicate_labels else ''
        return [css] * len(row)

    return dataframe.style.apply(_row_css, axis=1)