In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_excel('model_dataset.xlsx')
df = df.dropna()

In [3]:
df[['shell_color','model', 'time_description']] = df[['shell_color','model', 'time_description']].apply(lambda x: pd.factorize(x)[0])
df['hour'] = df['end_time'].dt.hour
df.drop(['start_time','end_time','t_stamp','stamp_date'],inplace=True, axis=1)

In [4]:
df.head(1)

Unnamed: 0,serial#,blister,shell_color,model,time_description,iso_press_avg,poly_press_avg,booth,iso_lbs_run_usage,poly_lbs_run_usage,spray_time,Avg Temp,Avg Dew Point,Avg Humidity,Avg Pressure,Total Precipitation,hour
0,281689,0,0,0,0,19.999573,1.0,4,46966.3,70449.4,0.908153,38.7,22.3,55.0,26.0,0.0,21


# Feature Selection

## Pearson Correlation
Filtering Method 1

In [5]:
X = df.loc[:, df.columns!='blister']
y = df['blister']

In [6]:
X.shape

(54221, 16)

In [7]:
num_feats = 10

In [8]:
def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = cor_selector(X,y,num_feats)
print(str(len(cor_feature)), 'selected features')
print(cor_feature)

10 selected features
['Avg Dew Point', 'booth', 'time_description', 'iso_press_avg', 'poly_press_avg', 'model', 'shell_color', 'iso_lbs_run_usage', 'serial#', 'poly_lbs_run_usage']


## Chi Squared 
Filtering Method 2

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')
print(chi_feature)

10 selected features
['serial#', 'shell_color', 'model', 'time_description', 'iso_press_avg', 'poly_press_avg', 'booth', 'iso_lbs_run_usage', 'poly_lbs_run_usage', 'Avg Temp']


## Recursive Feature Elimination
Wrapper Method 1

In [10]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')
print(rfe_feature)

Fitting estimator with 16 features.
10 selected features
['serial#', 'shell_color', 'model', 'time_description', 'iso_press_avg', 'poly_press_avg', 'iso_lbs_run_usage', 'poly_lbs_run_usage', 'Avg Temp', 'Avg Humidity']


## Lasso: SelectFromModel

Embedded Method 1

In [11]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')
print(embeded_lr_feature)

6 selected features
['serial#', 'shell_color', 'model', 'iso_press_avg', 'poly_press_avg', 'poly_lbs_run_usage']


## Tree-based: SelectFromModel

Embedded Method 2

In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
print(embeded_rf_feature)

6 selected features
['serial#', 'iso_press_avg', 'poly_press_avg', 'iso_lbs_run_usage', 'poly_lbs_run_usage', 'spray_time']


## LightGBM: SelectFromModel

Embedded Method 3

In [13]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
embeded_lgb_selector.fit(X, y)

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')
print(embeded_lgb_feature)

10 selected features
['serial#', 'model', 'iso_press_avg', 'poly_press_avg', 'iso_lbs_run_usage', 'poly_lbs_run_usage', 'spray_time', 'Avg Temp', 'Avg Dew Point', 'Avg Humidity']


# Narrowing Selection

Based on the above feature selection methods, below are the features that we will be using to narrow down our search.

1. Model
2. iso_press_avg
3. poly_press_avg
4. iso_lbs_run_usage
5. poly_lbs_run_usage
6. shell_color
7. Avg Temp
8. Avg Humidity
9. Time Description
10. Booth


In [14]:
X = df[['blister','model','shell_color','time_description','iso_press_avg','poly_press_avg','booth','iso_lbs_run_usage','poly_lbs_run_usage','Avg Temp','Avg Humidity']]

## Evaluating Collinearity

Based on correlation matrix and VIF scores below, here are the features we will keep:
1. Model
2. Shell_Color
3. poly_press_avg
4. avg_temp
5. poly_lbs_run_usage
6. time_description
7. booth

In [15]:
#plot color scaled correlation matrix
corr=X.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,blister,model,shell_color,time_description,iso_press_avg,poly_press_avg,booth,iso_lbs_run_usage,poly_lbs_run_usage,Avg Temp,Avg Humidity
blister,1.0,0.032208,0.040647,-0.010672,-0.014878,-0.025377,0.006648,0.043235,0.044318,-0.003374,-0.001523
model,0.032208,1.0,0.104691,0.029223,0.003097,0.002619,0.100354,-0.000276,0.001013,0.006804,0.002818
shell_color,0.040647,0.104691,1.0,0.028465,-0.001396,-0.00227,-0.013045,0.008478,0.007973,-0.012263,0.009557
time_description,-0.010672,0.029223,0.028465,1.0,0.023488,0.082817,-0.056728,-0.081536,-0.081937,0.057693,-0.060142
iso_press_avg,-0.014878,0.003097,-0.001396,0.023488,1.0,0.811973,-0.02046,-0.106959,-0.113681,0.126229,-0.164347
poly_press_avg,-0.025377,0.002619,-0.00227,0.082817,0.811973,1.0,0.062735,-0.217379,-0.22438,0.144861,-0.183096
booth,0.006648,0.100354,-0.013045,-0.056728,-0.02046,0.062735,1.0,0.071519,0.074414,-0.031494,0.022577
iso_lbs_run_usage,0.043235,-0.000276,0.008478,-0.081536,-0.106959,-0.217379,0.071519,1.0,0.997673,-0.262125,0.120636
poly_lbs_run_usage,0.044318,0.001013,0.007973,-0.081937,-0.113681,-0.22438,0.074414,0.997673,1.0,-0.261385,0.118481
Avg Temp,-0.003374,0.006804,-0.012263,0.057693,0.126229,0.144861,-0.031494,-0.262125,-0.261385,1.0,-0.83727


In [16]:
#Compute VIF data for each independent variable
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["features"] =X.columns
vif["vif_Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif

Unnamed: 0,features,vif_Factor
0,blister,1.013462
1,model,3.86233
2,shell_color,2.625129
3,time_description,2.870531
4,iso_press_avg,131.770818
5,poly_press_avg,117.720968
6,booth,6.410716
7,iso_lbs_run_usage,293.706623
8,poly_lbs_run_usage,295.336301
9,Avg Temp,14.745138


In [None]:
X = df[['model','shell_color','time_description','poly_press_avg','booth','Avg Temp']]