In [1]:
'''This code uses Recursive Feature Elimination to rank features to be used in classification'''

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV 
from sklearn import metrics
import numpy as np

In [2]:
# loading the data 
from load import parse_dataset

In [77]:
# Experiment settings
#
# Everything a reader may want to tune lives in the constants below; the
# RFECV selector is built exclusively from them so the configuration and the
# actual run cannot drift apart.

from sklearn.metrics import fbeta_score, make_scorer

DATASET_FP = "../datasets/train/SemEval2018-T3-train-taskA_emoji.txt"
FEATURES_FP = './train_feats_taskA.npy'  # feature matrix loaded into X below

K_FOLDS = 10 # 10-fold crossvalidation

# Base classifier whose coefficients drive the elimination. The hyper-parameters
# are spelled out explicitly (instead of relying on LogisticRegression()
# defaults) because they are defined based on the code output of the original
# repository.
CLF = LogisticRegression(
    C=1.0,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    max_iter=100,
    multi_class='ovr',
    n_jobs=1,
    penalty='l2',
    random_state=None,
    solver='liblinear',
    tol=0.0001,
    verbose=0,
    warm_start=False)

# Each candidate feature subset is scored with the F-beta measure (beta=0.9),
# i.e. precision is weighted slightly more than recall.
f_scorer = make_scorer(fbeta_score, beta=0.9)

# Recursive feature eliminator with cross-validation: drops one feature per
# iteration (step=1) and keeps the subset with the best cross-validated score.
rfecv = RFECV(estimator=CLF,
              step=1,
              cv=K_FOLDS,
              scoring=f_scorer,
              n_jobs=1,
              verbose=0)

# Loading dataset
# Only the labels `y` are used below; `corpus` is kept for inspection.
corpus, y = parse_dataset(DATASET_FP)

# Pass the path rather than an open() handle: np.load then opens and closes
# the file itself, so no file descriptor is leaked.
X = np.load(FEATURES_FP)

# NOTE(review): the recorded output of this cell contains the tail of a
# metrics warning mentioning 'precision' / 'predicted' -- presumably some fold
# produced no positive predictions; confirm before trusting the fold scores.
rfecv.fit(X, y)

  'precision', 'predicted', average, warn_for)


RFECV(cv=10,
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=1, scoring=make_scorer(fbeta_score, beta=0.9), step=1, verbose=0)

In [78]:
# Report how many features RFECV kept and the rank assigned to every feature
# (rank 1 marks a selected feature; see the selection cell that filters on it).
print('number of effective features:', rfecv.n_features_)
# .tolist() converts the ranking array to plain Python ints, so the printed
# list reads `[32, 1, ...]` regardless of how the installed NumPy version
# renders its scalar types.
print('ranking of features:', rfecv.ranking_.tolist())

number of effective features: 13
ranking of features: [32, 1, 16, 1, 41, 38, 7, 1, 25, 17, 15, 4, 13, 44, 6, 33, 36, 40, 46, 34, 1, 42, 1, 39, 45, 1, 10, 35, 29, 14, 19, 1, 1, 28, 18, 37, 43, 31, 1, 5, 22, 27, 3, 21, 1, 1, 11, 2, 23, 1, 1, 30, 24, 9, 26, 20, 12, 8]


In [79]:
# Map the RFECV ranking back to human-readable feature names.
# The order of `feats_name` must match the column order of the feature matrix
# that RFECV was fitted on, since both are addressed by the same index.

feats_name = [
    # sentiment-contrast features
    'leftIntensity', 'rightIntensity', 'polarityDiff', 'contrast',
    # annotation tags, suffix 1
    '<allcaps>1', '<annoyed>1', '<censored>1', '<date>1', '<elongated>1', '<emphasis>1',
    '<happy>1', '<hashtag>1', '<heart>1', '<kiss>1', '<laugh>1', '<money>1',
    '<number>1', '<percent>1', '<phone>1', '<repeated>1', '<sad>1', '<shocking>1',
    '<surprise>1', '<time>1', '<tong>1', '<url>1', '<user>1', '<wink>1',
    # annotation tags, suffix 2
    '<allcaps>2', '<annoyed>2', '<censored>2', '<date>2', '<elongated>2', '<emphasis>2',
    '<happy>2', '<hashtag>2', '<heart>2', '<kiss>2', '<laugh>2', '<money>2',
    '<number>2', '<percent>2', '<phone>2', '<repeated>2', '<sad>2', '<shocking>2',
    '<surprise>2', '<time>2', '<tong>2', '<url>2', '<user>2', '<wink>2',
    # polarity counts, suffixes 1 and 2
    'POS1', 'NEG1', 'NEUTRAL1', 'POS2', 'NEG2', 'NEUTRAL2',
]
print("size of feature names", len(feats_name))

# Positions whose RFECV rank is 1, i.e. the features that were kept.
good_Features_indices = [
    position
    for position, _ in enumerate(feats_name)
    if rfecv.ranking_[position] == 1
]
# Names of those same features, in the same order as the indices.
good_Features = [feats_name[position] for position in good_Features_indices]

print(good_Features)
print(good_Features_indices)

size of feature names 58
['rightIntensity', 'contrast', '<date>1', '<sad>1', '<surprise>1', '<url>1', '<date>2', '<elongated>2', '<laugh>2', '<sad>2', '<shocking>2', '<url>2', '<user>2']
[1, 3, 7, 20, 22, 25, 31, 32, 38, 44, 45, 49, 50]


In [80]:
# Persist the indices of the selected features: the list is pickled into a
# file named 'indices' in the current working directory.
import pickle

with open('indices', 'wb') as indices_file:
    pickle.dump(good_Features_indices, indices_file)