# Selección de variables filter

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

Los selectores de variables de tipo filter se basan en una métrica estadística para establecer un ranking entre las variables: en primer lugar quedan las más relevantes y en último lugar las menos relevantes. Después se escogen las k mejores.

Algunas de las métricas más usadas para establecer estos rankings son mutual information, ANOVA F-test y chi-cuadrado, todas ellas disponibles en la librería scikit-learn. Cabe mencionar que chi-cuadrado solo se puede usar cuando las variables toman valores no negativos.

Scikit-learn permite escoger las mejores k variables, o escoger en base a un percentil.

In [54]:
# Load the training set from a CSV path relative to the current working directory.
data = pd.read_csv('data/train.csv')
# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [55]:
# (rows, columns) of the loaded frame.
data.shape

(2000, 21)

In [56]:
# Column labels of the loaded frame.
data.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

Vamos a separar los datos que usaremos como predictores de los que usaremos como variable objetivo. En este caso escogeremos la última columna como variable objetivo.

In [57]:
# Predictors are every column except the last one; the target is the last column.
# Slicing relative to the end keeps X and y consistent without hard-coding the
# number of columns.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [58]:
# Display the predictor frame.
X

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,14,1222,1890,668,13,4,19,1,1,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,3,915,1965,2032,11,10,16,1,1,1
1997,1911,0,0.9,1,1,1,36,0.7,108,8,3,868,1632,3057,9,1,5,1,1,0
1998,1512,0,0.9,0,4,1,46,0.1,145,5,5,336,670,869,18,10,19,1,1,1


In [59]:
# Display the target column.
y

0       1
1       2
2       2
3       2
4       1
       ..
1995    0
1996    2
1997    3
1998    0
1999    3
Name: price_range, Length: 2000, dtype: int64

In [60]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [61]:
from sklearn.feature_selection import f_classif
# Configure the selector to score each predictor with f_classif and keep the 10 best.
bestfeatures = SelectKBest(score_func=f_classif, k=10)
# Keep the result of fitting in `fit`; the next cell reads the per-feature scores from `fit.scores_`.
fit = bestfeatures.fit(X, y)

In [62]:
# Pair each predictor name with the score computed by the fitted selector.
featureScores = pd.DataFrame({'vars': X.columns, 'scores': fit.scores_})
featureScores

Unnamed: 0,vars,scores
0,battery_power,31.598158
1,blue,0.476768
2,clock_speed,0.493708
3,dual_sim,0.428239
4,fc,0.772182
5,four_g,1.059525
6,int_memory,2.922996
7,m_dep,1.500682
8,mobile_wt,3.594318
9,n_cores,2.625415


Vamos a seleccionar los k mejores. En este caso vamos a establecer k=10. Filtramos usando las funciones de pandas y obtenemos las 10 variables más importantes según nuestro selector de variables.

In [63]:
# Show the 10 rows with the highest score, in descending order.
featureScores.nlargest(10, 'scores')

Unnamed: 0,vars,scores
13,ram,3520.110824
0,battery_power,31.598158
12,px_width,22.620882
11,px_height,19.484842
8,mobile_wt,3.594318
6,int_memory,2.922996
9,n_cores,2.625415
14,sc_h,2.225984
15,sc_w,1.671
16,talk_time,1.628811


In [64]:
# Names of the 10 best-scoring variables; reused in the final comparison cell.
list_filter = featureScores.nlargest(10, 'scores')['vars'].values
list_filter

array(['ram', 'battery_power', 'px_width', 'px_height', 'mobile_wt',
       'int_memory', 'n_cores', 'sc_h', 'sc_w', 'talk_time'], dtype=object)

# Selección de variables wrapper

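La librería scikit-learn proporcionaba un conjunto de datos (Boston Housing) con el que se podía trabajar libremente y sin tener que importar ningún CSV local. En este cuaderno ese cargador está comentado y los datos se leen desde el fichero local `data/HousingData.csv`.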

In [65]:
!pip install statsmodels



In [66]:
import pandas as pd
# The sklearn loader is left disabled; the housing data is read from a local CSV in the next cell.
# from sklearn.datasets import load_boston
import statsmodels.api as sm
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [67]:
# The housing data is read from a local CSV. An earlier version of this cell
# fetched it from http://lib.stat.cmu.edu/datasets/boston instead.
boston = pd.read_csv('data/HousingData.csv')

# Dataset dimensions, feature names and the target column (CRIM), one per line.
print(boston.shape, boston.columns, boston['CRIM'], sep='\n')

(506, 14)
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')
0      0.00632
1      0.02731
2      0.02729
3      0.03237
4      0.06905
        ...   
501    0.06263
502    0.04527
503    0.06076
504    0.10959
505    0.04741
Name: CRIM, Length: 506, dtype: float64


La variable objetivo es el crimen (columna `CRIM`).

In [68]:
# Summary statistics per column; a `count` below the row count reveals missing values.
boston.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,486.0,486.0,486.0,486.0,506.0,506.0,486.0,506.0,506.0,506.0,506.0,506.0,486.0,506.0
mean,3.611874,11.211934,11.083992,0.069959,0.554695,6.284634,68.518519,3.795043,9.549407,408.237154,18.455534,356.674032,12.715432,22.532806
std,8.720192,23.388876,6.835896,0.25534,0.115878,0.702617,27.999513,2.10571,8.707259,168.537116,2.164946,91.294864,7.155871,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.0819,0.0,5.19,0.0,0.449,5.8855,45.175,2.100175,4.0,279.0,17.4,375.3775,7.125,17.025
50%,0.253715,0.0,9.69,0.0,0.538,6.2085,76.8,3.20745,5.0,330.0,19.05,391.44,11.43,21.2
75%,3.560263,12.5,18.1,0.0,0.624,6.6235,93.975,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [69]:
# Data type of every column.
boston.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS       float64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX          int64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object

## Forward selection

De manera esquemática y sencilla:
1. Escoger un threshold de corte (nivel de significancia)
2. Ajustar todos los modelos posibles con una sola variable cada vez; en total, tantos modelos como variables tengamos
3. Ajustar todos los modelos posibles con la mejor variable encontrada en el paso anterior y una variable más
4. Repetir los pasos anteriores, añadiendo siempre una variable y comprobando su valor de significancia

In [70]:
# Drop every row that contains a missing value before modelling.
boston = boston.dropna()

# Predictors are all columns except the target. The `columns=` keyword is used
# because passing the axis positionally (`drop('CRIM', 1)`) is no longer
# accepted by pandas >= 2.0.
X = boston.drop(columns='CRIM')
y = boston['CRIM']

In [71]:
def forward_selection(data, target, significance_level=0.01):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from an empty model, each round fits one OLS regression (with an
    added constant) per remaining candidate column, on top of the features
    already selected, and keeps the candidate whose coefficient has the
    smallest p-value. The search stops as soon as that smallest p-value is not
    strictly below ``significance_level`` or no candidates are left.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns.
    target : array-like
        Response passed to ``sm.OLS`` as the dependent variable.
    significance_level : float, default 0.01
        A candidate is accepted only if its p-value is strictly below this.

    Returns
    -------
    list
        Selected column labels, in the order in which they were added.
    """
    candidates = data.columns.tolist()
    best_features = []
    while True:
        # Preserve the column order of `data` so that ties between equal
        # p-values are always broken the same way (a set has no stable order).
        remaining_features = [col for col in candidates if col not in best_features]
        if not remaining_features:
            break
        p_by_feature = {}
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            p_by_feature[new_column] = model.pvalues[new_column]
        # Explicit float dtype: a Series created without one can end up as
        # object dtype on recent pandas, where idxmin() is not supported.
        new_pval = pd.Series(p_by_feature, dtype=float)
        if new_pval.min() < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

In [72]:
# Run forward selection with the default significance level (0.01) and show the chosen features.
selection1 = forward_selection(X,y)
selection1

['RAD', 'LSTAT']

## Backward elimination

De manera esquemática:
1. Escoger un threshold
2. Usar un modelo usando todas las variables
3. Considerar la variable con el p-value más alto. Si ese p-value es mayor o igual que el nivel de significancia escogido, seguimos; si no, devolvemos la selección obtenida hasta el momento
4. Sacar la variable que estamos considerando del modelo
5. Ajustar el modelo sin esa variable y repetir desde el paso 3

In [73]:
def backward_elimination(data, target, significance_level = 0.01):
    """Backward feature elimination driven by OLS p-values.

    Starting from all columns of ``data``, each round fits an OLS regression
    (with an added constant) on the features still in play and removes the one
    with the largest p-value, as long as that p-value is greater than or equal
    to ``significance_level``. The search stops when every remaining feature
    is below the threshold or no features are left.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns.
    target : array-like
        Response passed to ``sm.OLS`` as the dependent variable.
    significance_level : float, default 0.01
        Features whose p-value is at or above this value are eliminated.

    Returns
    -------
    list
        Labels of the features that survived, in their original column order.
    """
    features = data.columns.tolist()
    while features:
        features_with_constant = sm.add_constant(data[features])
        fitted = sm.OLS(target, features_with_constant).fit()
        # Select the p-values by feature label rather than by position: this
        # leaves the intercept out wherever it sits, and never skips a real
        # feature when add_constant does not prepend a column.
        p_values = fitted.pvalues.loc[features]
        if p_values.max() >= significance_level:
            features.remove(p_values.idxmax())
        else:
            break
    return features

In [74]:
# Run backward elimination with the default significance level (0.01) and show the surviving features.
selection2 = backward_elimination(X, y)
selection2

['ZN', 'DIS', 'RAD', 'MEDV']

In [75]:
# Print the three selections side by side.
# NOTE(review): list_filter was computed on data/train.csv while selection1 and
# selection2 were computed on the housing data, so the lists name different columns.
print('Filter: ', list(list_filter))
print('W. F: ', selection1)
print('W. B: ', selection2)

Filter:  ['ram', 'battery_power', 'px_width', 'px_height', 'mobile_wt', 'int_memory', 'n_cores', 'sc_h', 'sc_w', 'talk_time']
W. F:  ['RAD', 'LSTAT']
W. B:  ['ZN', 'DIS', 'RAD', 'MEDV']


In [76]:
# Features chosen only by forward selection, then features chosen only by backward elimination.
print(set(selection1) - set(selection2))
print(set(selection2) - set(selection1))

{'LSTAT'}
{'ZN', 'DIS', 'MEDV'}


### Ejercicio 1

Aplica selección de variables de tipo wrapper (ambos forward y backward) sobre el dataset train.csv y compara los resultados con los obtenidos con la aproximación filter.


In [77]:
from sklearn.feature_selection import f_classif

### Ejercicio 2

Aplica selección de variables filter sobre el dataset de boston y compara los resultados con los obtenidos con la aproximación wrapper. Valora el uso de otras métricas diferentes a chi2