In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import RFE

In [2]:
def read_and_get_data(train_path='./training.csv', test_path='./testing.csv',
                      target_col='Appliances', feature_cols=slice(2, 28)):
    """Load the train/test CSV files and split each into features and target.

    Called without arguments this behaves exactly like the original
    hard-coded version.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files to read with ``pd.read_csv``.
    target_col : str
        Name of the column returned as the regression target.
    feature_cols : slice or other NumPy column indexer
        Positional selection applied to the columns of ``frame.values`` to
        obtain the feature matrix.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` as NumPy arrays.
    """
    def _split(csv_path):
        # One place for the load/split logic so both files are handled alike.
        frame = pd.read_csv(csv_path)
        # The slice is taken on the value matrix of the *whole* frame, so the
        # dtype of the features is whatever NumPy derives for all columns
        # together (object when the frame mixes text and numeric columns).
        # NOTE(review): the features are chosen by position and the target by
        # name -- confirm `target_col` does not fall inside `feature_cols`.
        return frame.values[:, feature_cols], frame[target_col].values

    x_train, y_train = _split(train_path)
    x_test, y_test = _split(test_path)
    return x_train, y_train, x_test, y_test

# Load both splits once; the selection cells below reuse x_train / y_train.
x_train, y_train, x_test, y_test = read_and_get_data()

### by removing features with low variance

In [3]:
def fea_sel_variancethreshold(x_pre, threshold=.8 * (1 - .8)):
    """Filter out low-variance features with ``VarianceThreshold``.

    Parameters
    ----------
    x_pre : array-like
        Feature matrix to filter; no target is needed.
    threshold : float, optional
        Variance cut-off handed to ``VarianceThreshold``. The default keeps
        the value this notebook originally hard-coded.

    Returns
    -------
    The output of ``VarianceThreshold.fit_transform`` on ``x_pre``, i.e. the
    matrix restricted to the features that pass the cut-off.
    """
    # NOTE(review): .8 * (1 - .8) is the Bernoulli variance p(1 - p) at
    # p = 0.8, presumably taken from a boolean-feature example. For unscaled
    # continuous features the cut-off depends on each column's units, so
    # callers may want to pass their own value.
    selector = VarianceThreshold(threshold=threshold)
    return selector.fit_transform(x_pre)

### by selecting the best features based on univariate statistical tests

In [4]:

def Kbest_by_f_regressor(num,x_pre,y):
    """Keep the ``num`` features that ``f_regression`` scores highest.

    ``x_pre`` is the feature matrix and ``y`` the regression target; the
    result is whatever ``SelectKBest.fit_transform`` returns for them.
    """
    selector = SelectKBest(score_func=f_regression, k=num)
    return selector.fit_transform(x_pre, y)

def Kbest_by_mutual_info(num,x_pre,y, random_state=None):
    """Keep the ``num`` features with the highest mutual information with ``y``.

    Parameters
    ----------
    num : int
        Number of features to keep (passed to ``SelectKBest`` as ``k``).
    x_pre : array-like
        Feature matrix.
    y : array-like
        Regression target.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``mutual_info_regression``. That scorer is randomized,
        so the chosen features may differ between runs unless a seed is
        given. ``None`` (the default) keeps the original unseeded behaviour.

    Returns
    -------
    The output of ``SelectKBest.fit_transform`` on ``x_pre`` and ``y``.
    """
    from functools import partial

    # SelectKBest calls score_func(X, y) only, so the seed is bound up front.
    score_func = partial(mutual_info_regression, random_state=random_state)
    return SelectKBest(score_func, k=num).fit_transform(x_pre, y)

def perc_by_f_regressor(num,x_pre,y):
    """Keep the top ``num`` percent of features as scored by ``f_regression``.

    A percentage outside 0..100 is reported on stdout and ``None`` is
    returned instead of a matrix; otherwise the result is the output of
    ``SelectPercentile.fit_transform`` on ``x_pre`` and ``y``.
    """
    out_of_range = num < 0 or num > 100
    if out_of_range:
        print('wrong percentage: {}%'.format(num))
        return None

    selector = SelectPercentile(score_func=f_regression, percentile=num)
    return selector.fit_transform(x_pre, y)

def perc_by_mutual_info(num,x_pre,y, random_state=None):
    """Keep the top ``num`` percent of features ranked by mutual information.

    Parameters
    ----------
    num : int or float
        Percentage of features to keep; must lie in 0..100.
    x_pre : array-like
        Feature matrix.
    y : array-like
        Regression target.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``mutual_info_regression``. That scorer is randomized,
        so the chosen features may differ between runs unless a seed is
        given. ``None`` (the default) keeps the original unseeded behaviour.

    Returns
    -------
    The output of ``SelectPercentile.fit_transform`` on ``x_pre`` and ``y``,
    or ``None`` (after printing a message) when ``num`` is outside 0..100.
    """
    if num > 100 or num < 0:
        # Existing contract: report on stdout and hand back None, not raise.
        print('wrong percentage: {}%'.format(num))
        return None

    from functools import partial

    # SelectPercentile calls score_func(X, y) only, so bind the seed first.
    score_func = partial(mutual_info_regression, random_state=random_state)
    return SelectPercentile(score_func, percentile=num).fit_transform(x_pre, y)

### by an external estimator

In [5]:
def recursively_sel(regressor, num, x_pre, y):
    """Select ``num`` features by recursive feature elimination (RFE).

    ``regressor`` is the estimator RFE uses to rank features, with a step
    size of 1. The selector is fitted on ``x_pre`` and ``y`` and then used to
    transform ``x_pre``, whose reduced form is returned.
    """
    selector = RFE(estimator=regressor, n_features_to_select=num, step=1)
    return selector.fit(x_pre, y).transform(x_pre)

In [6]:
# Variance filtering looks at the feature matrix only; no target is passed.
x_transformed = fea_sel_variancethreshold(x_pre=x_train)
print(x_transformed)

[[30 19.89 47.5966666666667 ... 63.0 5.3 13.275433157104999]
 [30 19.89 46.6933333333333 ... 59.1666666666667 5.2 18.606194981839508]
 [30 19.89 46.3 ... 55.3333333333333 5.1 28.64266816759482]
 ...
 [0 25.5 46.5 ... 24.5 13.3 49.28293972043321]
 [10 25.5 46.99 ... 26.166666666666693 13.233333333333302
  6.322783650830388]
 [10 25.5 46.6 ... 27.0 13.2 34.11885058740153]]


### Below, the different functions are called to perform feature selection

In [7]:
# Keep the 2 features with the best f_regression scores.
x_transformed = Kbest_by_f_regressor(num=2, x_pre=x_train, y=y_train)
print(x_transformed)

[[30 92.0]
 [30 92.0]
 [30 92.0]
 ...
 [0 56.0]
 [10 56.6666666666667]
 [10 57.0]]


In [8]:
# Keep the 15 features with the highest mutual information with the target.
x_transformed = Kbest_by_mutual_info(num=15, x_pre=x_train, y=y_train)
print(x_transformed)



[[19.89 47.5966666666667 19.2 ... 17.033333333333303 45.53 733.5]
 [19.89 46.6933333333333 19.2 ... 17.0666666666667 45.56 733.6]
 [19.89 46.3 19.2 ... 17.0 45.5 733.7]
 ...
 [25.5 46.5 25.754 ... 23.2 46.79 755.2]
 [25.5 46.99 25.414 ... 23.2 46.8175 755.2]
 [25.5 46.6 25.264285714285695 ... 23.2 46.845 755.2]]


In [9]:
# Keep the top 80% of features as scored by f_regression.
x_transformed = perc_by_f_regressor(num=80, x_pre=x_train, y=y_train)
print(x_transformed)

[[30 19.89 47.5966666666667 ... 733.5 92.0 7.0]
 [30 19.89 46.6933333333333 ... 733.6 92.0 6.6666666666666705]
 [30 19.89 46.3 ... 733.7 92.0 6.3333333333333295]
 ...
 [0 25.5 46.5 ... 755.2 56.0 3.5]
 [10 25.5 46.99 ... 755.2 56.6666666666667 3.8333333333333295]
 [10 25.5 46.6 ... 755.2 57.0 4.0]]


In [10]:
# Keep the top 90% of features as scored by mutual information.
x_transformed = perc_by_mutual_info(num=90, x_pre=x_train, y=y_train)
print(x_transformed)



[[30 19.89 47.5966666666667 ... 733.5 92.0 5.3]
 [30 19.89 46.6933333333333 ... 733.6 92.0 5.2]
 [30 19.89 46.3 ... 733.7 92.0 5.1]
 ...
 [0 25.5 46.5 ... 755.2 56.0 13.3]
 [10 25.5 46.99 ... 755.2 56.6666666666667 13.233333333333302]
 [10 25.5 46.6 ... 755.2 57.0 13.2]]


In [11]:
from sklearn.ensemble import RandomForestRegressor
# RFE ranks features through the fitted estimator's `coef_` or
# `feature_importances_`, so the random forest below can be swapped for, e.g.,
# a linear regressor. An estimator that exposes neither attribute (such as a
# neural-network regressor) would also need RFE's `importance_getter`, which
# `recursively_sel` does not set.
# A fixed random_state makes the forest, and therefore the selected features,
# reproducible from one run of the notebook to the next.
regressor = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=0)
x_transformed = recursively_sel(regressor, 15, x_train, y_train)
print(x_transformed)

[[47.5966666666667 19.2 44.79 ... 45.53 733.5 92.0]
 [46.6933333333333 19.2 44.7225 ... 45.56 733.6 92.0]
 [46.3 19.2 44.6266666666667 ... 45.5 733.7 92.0]
 ...
 [46.5 25.754 42.08 ... 46.79 755.2 56.0]
 [46.99 25.414 43.036 ... 46.8175 755.2 56.6666666666667]
 [46.6 25.264285714285695 42.97142857142861 ... 46.845 755.2 57.0]]
