In [None]:
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier

### Load the dataset and features to be removed

In [None]:
# Raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../Master.csv')

# Column-name collections to exclude from feature selection. They are later
# concatenated with `+`, so each pickle presumably holds a list of column
# names -- TODO confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
# Context managers make sure every file handle is closed after loading.
with open("./ground_truth.pkl", 'rb') as pickle_file:
    ground_truth = pkl.load(pickle_file)
with open("./cate.pkl", 'rb') as pickle_file:
    cate = pkl.load(pickle_file)
with open("./others.pkl", 'rb') as pickle_file:
    others = pkl.load(pickle_file)

# Columns containing at least one missing value; these are dropped as well.
columns_with_nan = [col for col in data.columns if data[col].isna().any()]

### Remove ground truth and irrelevant features

In [None]:
def remove_columns(dataframe, columns_to_remove = None):
    """Return `dataframe` without the requested columns.

    Names in `columns_to_remove` that are not columns of `dataframe` are
    ignored, so the call is safe to repeat. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to drop columns from.
    columns_to_remove : iterable of column names, str, or None
        Columns to drop. A single string is treated as one column name
        (not as an iterable of characters). None or empty removes nothing.

    Returns
    -------
    pandas.DataFrame
        A new frame without the matched columns, or `dataframe` itself when
        nothing matched.
    """
    if columns_to_remove is None:
        # None instead of a mutable `[]` default, which is shared between calls.
        columns_to_remove = []
    elif isinstance(columns_to_remove, str):
        # Without this, set("abc") would look for columns "a", "b" and "c".
        columns_to_remove = [columns_to_remove]

    # Keep only the names that actually exist, so drop() cannot raise KeyError.
    columns_to_remove = set(dataframe.columns).intersection(columns_to_remove)
    print ("REMOVED : {}".format(columns_to_remove))
    if columns_to_remove:
        dataframe = dataframe.drop(columns=list(columns_to_remove))
    return dataframe
# Everything that must not be used as a candidate feature: ground-truth
# columns, the `cate` and `others` lists, and columns with missing values.
rem_feature = ground_truth + cate
rem_feature = rem_feature + columns_with_nan + others

# Names that are absent from `data` are ignored by remove_columns.
data = remove_columns(dataframe=data, columns_to_remove=rem_feature)


In [None]:
# Group the remaining column names by their dtype (maps dtype -> column labels),
# e.g. to look up which columns are non-numeric.
column_names = data.columns.to_series()
group = column_names.groupby(data.dtypes).groups

### Initialize X, y and normalize X

In [None]:
# Feature matrix: every remaining column except the target.
X = remove_columns(data, columns_to_remove=['actiontype'])
# Target vector.
y = data['actiontype']

# Rescale each feature with a default MinMaxScaler; the result is a NumPy
# array, so column names have to be recovered from the frame later on.
scaler = MinMaxScaler()
X_norm = scaler.fit_transform(X)

### Chi-Squared feature selection

In [None]:
# Number of features each selection technique is asked to keep; the RFE and
# LightGBM cells below reuse this value.
num_feats = 400

# Rank features by their chi-squared statistic against the target and keep
# the `num_feats` best ones.
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
# X_norm has no column names, so map the boolean mask back onto the frame.
chi_feature = data.drop('actiontype', axis = 1).loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

# save the selected features from chi-squared feature selection technique
# The frame is deliberately not named `chi2`: rebinding that name would shadow
# the imported sklearn scoring function and break this cell when it is re-run.
chi_feature_df = pd.DataFrame(data={'Feature': chi_feature})
chi_feature_df.to_csv("./Chi2_action_features.csv")

### RFE feature selection

In [None]:
# Recursive feature elimination driven by a default logistic regression:
# `step=40` is passed to RFE as the elimination step size, and elimination
# stops once `num_feats` features remain.
rfe_estimator = LogisticRegression()
rfe_selector = RFE(
    estimator=rfe_estimator,
    n_features_to_select=num_feats,
    step=40,
    verbose=5,
)
rfe_selector.fit(X_norm, y)

# Boolean mask over the feature columns -> list of selected column names.
rfe_support = rfe_selector.get_support()
feature_columns = data.drop('actiontype', axis=1).columns
rfe_feature = feature_columns[rfe_support].tolist()
print(str(len(rfe_feature)), 'selected features')

# Persist the RFE selection so it can be reloaded for the voting step.
rfe = pd.DataFrame(data={'Feature': rfe_feature})
rfe.to_csv("./RFE_action_features.csv")

### LightGBM feature selection

In [None]:
# Gradient-boosted tree model whose feature importances drive the selection.
lgbm_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'num_leaves': 32,
    'colsample_bytree': 0.2,
    'reg_alpha': 3,
    'reg_lambda': 1,
    'min_split_gain': 0.01,
    'min_child_weight': 40,
}
lgbc = LGBMClassifier(**lgbm_params)

# NOTE(review): only `max_features` is set, so SelectFromModel's default
# importance threshold still applies and fewer than `num_feats` features may
# be kept -- confirm this is intended (see the sklearn docs on `threshold`).
embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
embeded_lgb_selector.fit(X_norm, y)

# Boolean mask over the feature columns -> list of selected column names.
embeded_lgb_support = embeded_lgb_selector.get_support()
feature_columns = data.drop('actiontype', axis=1).columns
embeded_lgb_feature = feature_columns[embeded_lgb_support].tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

# Persist the LightGBM selection so it can be reloaded for the voting step.
lgb = pd.DataFrame(data={'Feature': embeded_lgb_feature})
lgb.to_csv("./LightGBM_action_features.csv")

### Load the feature set obtained from the individual feature selection techniques

In [None]:
# Reload the feature lists saved by each selection technique, so the voting
# step can run without repeating the (slow) selector fits.
# The chi-squared frame is deliberately not named `chi2`: that name is the
# sklearn scoring function imported at the top, and rebinding it would break
# the chi-squared cell on a later re-run.
chi_feature_df = pd.read_csv("./Chi2_action_features.csv")
rfe = pd.read_csv("./RFE_action_features.csv")
lgb = pd.read_csv("./LightGBM_action_features.csv")

chi_feature = chi_feature_df['Feature'].to_list()
rfe_feature = rfe['Feature'].to_list()
embeded_lgb_feature = lgb['Feature'].to_list()

# Concatenation (with duplicates): a feature appears once per technique that
# selected it, which is what the vote count below relies on.
combined_features =  chi_feature  + embeded_lgb_feature +  rfe_feature


### Calculating the feature count (Voting) 

In [None]:
# Vote count: how many selection techniques picked each feature.
# (The original referenced an undefined `total_features`; the votes live in
# `combined_features`.) A single pass replaces the repeated list.count scans.
features_found_count = {}
for feature in combined_features:
    features_found_count[feature] = features_found_count.get(feature, 0) + 1

# Order by descending vote count. sorted() is stable and dicts preserve
# insertion order, so ties stay in first-seen order and the result is
# deterministic across runs.
sorted_features_found_count = dict(
    sorted(features_found_count.items(), key=lambda item: item[1], reverse=True)
)


In [None]:
# save the final feature set
# save the final feature set
# The context manager closes (and therefore flushes) the file, so the pickle
# is fully written to disk even if a later cell fails.
with open('./final_feats_actiontype.pkl', 'wb') as f:
    pkl.dump(sorted_features_found_count, f)

### Summary of features 

In [None]:
# Distinct features picked by at least one technique (the union).
unique_features = set(combined_features)
# Features picked by all three techniques (the actual intersection).
common_features = set(chi_feature) & set(rfe_feature) & set(embeded_lgb_feature)

print ("Total number of features selected from all the above method : {}".format(len(combined_features)))
# This count was previously labelled "intersecting", but it is the union size.
print ("Total number of unique features : {}".format(len(unique_features)))
print ("Total number of intersecting features : {}".format(len(common_features)))