# Classify and Select Features
Try to reach as high classification accuracy as possible on these classification tasks.  
6-fold stratified cross validation (matching the `StratifiedKFold` setup used in the code below). Try many different classification models.  
Also select the most important features for each model.  

In [1]:
from collections import OrderedDict
import pickle 
import pandas as pd 
import json
import numpy as np
import sklearn
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, f1_score

from utils import timed_func

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def fprint(s, outfile="20210423_export.out", end="\n"):
    """Append ``s`` followed by ``end`` to the text file ``outfile``.

    The file is opened in append mode on every call, so it is created on
    first use and whatever it already contains is kept.
    """
    with open(outfile, "a+") as log_file:
        log_file.write(s + end)

In [4]:
# Load ../data/df_ai_labeled.csv, report its shape, and preview the first rows.
df_ai_labeled = pd.read_csv("../data/df_ai_labeled.csv")
print(df_ai_labeled.shape)
df_ai_labeled.head()

(6085, 9)


Unnamed: 0,venue,count,label,NLP,Speech,ML,AI,CV,Robo
0,INTERSPEECH,10952,1.0,False,True,False,False,False,False
1,IJCAI,7456,1.0,False,False,False,True,False,False
2,AAAI,5831,1.0,False,False,False,True,False,False
3,LREC,5245,1.0,True,False,False,False,False,False
4,NIPS,3991,1.0,False,False,True,False,False,False


In [5]:
# Load the per-paper feature table; every sweep below filters this frame by venue.
features = pd.read_csv("../data/features_v2_with_venue.csv")
print(features.shape)
features.head()

(945676, 91)


Unnamed: 0,paper_id,venue,venue_category,venue_is_top,pos_abstract_ADJ,pos_abstract_ADV,pos_abstract_ADP,pos_abstract_AUX,pos_abstract_CCONJ,pos_abstract_DET,...,rst_Manner-Means,rst_Topic-Comment,rst_Summary,rst_Temporal,rst_Topic-Change,rst_textual-organization,rst_same-unit,grammar_errors_abstract,grammar_errors_bodytext,outbound_citations_per_word
0,18981111,Journal of Special Education Technology,Other,0.0,0.17284,0.018519,0.154321,0.0,0.018519,0.154321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,5,88,0.002796
1,18982496,Publications,Other,0.0,0.081481,0.037037,0.162963,0.0,0.02963,0.118519,...,0.125,0.0,0.0,0.0,0.0,0.0,0.0,3,51,0.00708
2,18983391,Canadian Conference on Electrical and Computer...,Other,0.0,0.093567,0.011696,0.157895,0.0,0.023392,0.152047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,2,70,0.001625
3,199668887,RecSys '19,Other,0.0,0.07732,0.056701,0.123711,0.0,0.046392,0.108247,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,49,0.009973
4,199668943,ArXiv,Other,0.0,0.25,0.044872,0.108974,0.0,0.044872,0.089744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,2,87,0.007781


In [6]:
def get_venue_names(option='EMNLP', base_path="../data/venue_name_labels"):
    """Load the venue -> label mapping for one classification task.

    Reads ``<base_path>/<option>.csv``, which must contain ``venue`` and
    ``label`` columns, and returns a dict keyed by venue name. When a venue
    appears on several rows, the last row wins.

    Args:
        option: File stem of the label CSV (e.g. ``"EMNLP_v_arxiv"``).
        base_path: Directory that holds the label CSV files.

    Returns:
        dict mapping each value of the ``venue`` column to its ``label``.
    """
    df = pd.read_csv(Path(base_path, f"{option}.csv"))
    # Build the mapping column-wise rather than looping over rows with iterrows().
    return dict(zip(df["venue"], df["label"]))


# Keep only the rows whose venue appears in the mapping loaded from EMNLP_v_arxiv.csv.
venue_feat = features[features.venue.isin(get_venue_names("EMNLP_v_arxiv"))]

In [7]:
# Check how many rows survived the venue filter and list the available columns.
print(venue_feat.shape)
venue_feat.columns

(714, 91)


Index(['paper_id', 'venue', 'venue_category', 'venue_is_top',
       'pos_abstract_ADJ', 'pos_abstract_ADV', 'pos_abstract_ADP',
       'pos_abstract_AUX', 'pos_abstract_CCONJ', 'pos_abstract_DET',
       'pos_abstract_INTJ', 'pos_abstract_NOUN', 'pos_abstract_NUM',
       'pos_abstract_PART', 'pos_abstract_PRON', 'pos_abstract_PROPN',
       'pos_abstract_SPACE', 'pos_abstract_VERB', 'pos_bodytext_ADJ',
       'pos_bodytext_ADV', 'pos_bodytext_ADP', 'pos_bodytext_AUX',
       'pos_bodytext_CCONJ', 'pos_bodytext_DET', 'pos_bodytext_INTJ',
       'pos_bodytext_NOUN', 'pos_bodytext_NUM', 'pos_bodytext_PART',
       'pos_bodytext_PRON', 'pos_bodytext_PROPN', 'pos_bodytext_SPACE',
       'pos_bodytext_VERB', 'voice_abstract_active', 'voice_abstract_passive',
       'voice_abstract_other', 'voice_bodytext_active',
       'voice_bodytext_passive', 'voice_bodytext_other',
       'surprisal_abstract_mean', 'surprisal_abstract_std',
       'surprisal_bodytext_mean', 'surprisal_bodytext_std', 'n

In [8]:
def drop_if_exists(df, cols):
    """Return ``df`` without the columns named in ``cols``.

    Names in ``cols`` that are not columns of ``df`` are skipped silently.
    ``df`` itself is not modified; a new DataFrame is returned.

    Args:
        df: Source DataFrame.
        cols: Iterable of column names to remove when present.

    Returns:
        DataFrame holding the remaining columns.
    """
    # One drop call with errors="ignore" avoids re-copying the frame per column.
    return df.drop(columns=list(cols), errors="ignore")

def classify_select_features(venue_feat, models, verbose=False, n_splits=6, random_state=None):
    """Pick the best model per outer fold, score it on held-out rows, and rank features.

    For each of the ``n_splits`` stratified folds, the non-test rows are split
    80/20 (stratified) into train and dev parts. Every candidate in ``models``
    is cloned, fitted on the train part and scored by F1 on the dev part. The
    candidate with the highest dev F1 is evaluated on the fold's test rows, and
    its ``coef_`` (absolute values of the first row) or ``feature_importances_``
    is recorded when it exposes one of them.

    Args:
        venue_feat: DataFrame with a ``venue_is_top`` label column. After the
            metadata/label columns listed in the body are dropped, every
            remaining column is used as a feature.
        models: Mapping of model name -> unfitted scikit-learn style estimator.
        verbose: When True, print accuracy/F1 summaries, the top features, and
            any candidate that had to be skipped.
        n_splits: Number of outer stratified folds.
        random_state: Seed forwarded to the inner train/dev split. ``None``
            keeps the split unseeded (not reproducible between runs).

    Returns:
        Tuple ``(fold_f1_scores_str, top_features_str, bm_str)``: the per-fold
        test F1 scores joined by ", ", a "Top5 feats: ..." string (empty when
        no selected model exposed importances), and the per-fold winning model
        names joined by ", ".

    Raises:
        ValueError: If ``models`` is empty, or if no candidate could be fitted
            and scored on some fold.

    Notes:
        ``f1_score`` is called with its defaults, so labels are expected to be
        binary -- confirm against the data. Importances are averaged across
        folds even when the winning model family differs between folds
        (|coef_| vs. tree importances), so they are not on a common scale.
    """
    Y = venue_feat['venue_is_top'].values
    df = drop_if_exists(venue_feat, [
        "venue", "venue_category", "paper_id", "venue_is_top",
        "n_citations", "annual_citations"])
    X = df.values
    feature_names = np.array(list(df))
    model_names = list(models.keys())
    if not model_names:
        raise ValueError("`models` must contain at least one estimator")

    failed = float("-inf")  # Sentinel dev score for candidates that could not be scored
    skf = StratifiedKFold(n_splits=n_splits)
    importances = []
    fold_accs = []
    fold_f1_scores = []
    best_model_names = []
    for trval_idx, test_idx in skf.split(X, Y):
        # Sweep through the models on this fold and keep the best one on the dev part.
        X_train, X_dev, Y_train, Y_dev = train_test_split(
            X[trval_idx], Y[trval_idx], test_size=0.2, stratify=Y[trval_idx],
            random_state=random_state,
        )
        X_test, Y_test = X[test_idx], Y[test_idx]
        dev_f1_scores = []
        trained_models = []
        for model_name in model_names:
            model = sklearn.base.clone(models[model_name])
            try:
                model.fit(X_train, Y_train)
                dev_f1_scores.append(f1_score(Y_dev, model.predict(X_dev)))
            except ValueError as err:
                # Rank failed candidates below every scored one, so an unfitted
                # model is never selected while a usable one exists.
                dev_f1_scores.append(failed)
                if verbose:
                    print("Skipping {}: {}".format(model_name, err))
            trained_models.append(model)

        max_id = int(np.argmax(dev_f1_scores))
        if dev_f1_scores[max_id] == failed:
            raise ValueError("No candidate model could be fitted and scored on this fold")
        model = trained_models[max_id]
        Y_test_pred = model.predict(X_test)
        fold_f1_scores.append(f1_score(Y_test, Y_test_pred))
        fold_accs.append(accuracy_score(Y_test, Y_test_pred))
        best_model_names.append(model_names[max_id])

        # Record importances of the selected model (the one fitted on the train part).
        # No SelectFromModel refit is done here: with the default prefit=False it
        # fits an internal clone and leaves `model` untouched, so it would not
        # change the values read below.
        if hasattr(model, "coef_"):
            importances.append(np.absolute(model.coef_[0]))
        elif hasattr(model, "feature_importances_"):
            importances.append(model.feature_importances_)
        # Otherwise the model exposes no importances; this fold contributes none.

    acc_mean, acc_std = np.mean(fold_accs), np.std(fold_accs)
    f1_mean, f1_std = np.mean(fold_f1_scores), np.std(fold_f1_scores)
    if len(importances) > 0:
        mean_imp = np.mean(importances, axis=0)
        top_features = feature_names[np.argsort(-mean_imp)]  # Most important first
        top_features_str = "Top5 feats: " + " ".join(top_features[:5])
    else:
        top_features_str = ""

    if verbose:
        print("Acc: mean {:.4f}, std {:.4f}; F1: mean {:.4f}, std {:.4f}".format(
            acc_mean, acc_std, f1_mean, f1_std))
        print(top_features_str)

    fold_f1_scores_str = ", ".join(["{:.4f}".format(fs) for fs in fold_f1_scores])
    bm_str = ", ".join(best_model_names)
    return fold_f1_scores_str, top_features_str, bm_str
    
#classify_select_features(venue_feat, models={'gb':GradientBoostingClassifier(), 'mlp': MLPClassifier()}, verbose=True)

In [9]:
def select_features_sweep_params(venue_feat, verbose=True):
    """Run ``classify_select_features`` over a fixed grid of candidate models.

    Args:
        venue_feat: DataFrame of features plus the ``venue_is_top`` label,
            passed through unchanged.
        verbose: Forwarded to ``classify_select_features``.

    Returns:
        Whatever ``classify_select_features`` returns for this model grid.
    """
    models = OrderedDict({
        # Models that expose coef_ / feature_importances_ (usable for feature ranking):
        "linearsvc_l2_squared": LinearSVC(loss="squared_hinge", penalty="l2"),
        "linearsvc_l2_hinge": LinearSVC(loss="hinge", penalty="l2"),
        "logreg_default": LogisticRegression(max_iter=100, C=1.0),
        "logreg_maxiter200_C0.5": LogisticRegression(max_iter=200, C=0.5),
        "logreg_maxiter200_C1.0": LogisticRegression(max_iter=200, C=1.0),
        "logreg_maxiter200_C2.0": LogisticRegression(max_iter=200, C=2.0),
        "extratrees_16": ExtraTreesClassifier(n_estimators=16, random_state=0),
        "extratrees_32": ExtraTreesClassifier(n_estimators=32, random_state=0),
        "extratrees_64": ExtraTreesClassifier(n_estimators=64, random_state=0),
        "extratrees_128": ExtraTreesClassifier(n_estimators=128, random_state=0),
        "randomforest_50": RandomForestClassifier(n_estimators=50, random_state=0),
        "randomforest_100": RandomForestClassifier(n_estimators=100, random_state=0),
        "randomforest_200": RandomForestClassifier(n_estimators=200, random_state=0),
        "gb_depth2": GradientBoostingClassifier(max_depth=2),
        "gb_depth3": GradientBoostingClassifier(max_depth=3),
        "gb_depth4": GradientBoostingClassifier(max_depth=4),
        "gb_depth5": GradientBoostingClassifier(max_depth=5),

        # Models that do not support feature selection:
        "mlp_10": MLPClassifier([10]),
        "mlp_20": MLPClassifier([20]),
        "mlp_40": MLPClassifier([40]),
        "mlp_80": MLPClassifier([80]),
        "mlp_10_10": MLPClassifier([10,10]),
        "mlp_20_20": MLPClassifier([20,20]),
        # This entry was previously keyed "mlp_20_20" as well, which silently
        # replaced the [20,20] network above and mislabeled this one.
        "mlp_40_40": MLPClassifier([40,40])
    })

    return classify_select_features(venue_feat, models, verbose)
    
#select_features_sweep_params(venue_feat, verbose=True)

In [11]:
@timed_func
def sweep_params_different_venues(with_arxiv=True, drop_redundant_features=False, verbose=True, fprint_suffix="test"):
    """Run the model sweep once per venue and append one result line per venue via ``fprint``.

    Args:
        with_arxiv: When True, "_v_arxiv" is appended to each venue name before
            the label file is looked up.
        drop_redundant_features: When True, the article-length columns and the
            MATTR columns other than window size 10 are dropped first.
        verbose: Forwarded to ``select_features_sweep_params``.
        fprint_suffix: Tag written as the last field of every result line.
    """
    # Article length features
    length_features = ["num_sections", "bodytext_word_counts", "bodytext_sent_counts"]
    # Every MATTR window except 10 (only MATTR_10 is kept)
    extra_mattr_features = [
        "lex_mattr_5_abstract", "lex_mattr_20_abstract", "lex_mattr_30_abstract", "lex_mattr_40_abstract",
        "lex_mattr_5_bodytext", "lex_mattr_20_bodytext", "lex_mattr_30_bodytext", "lex_mattr_40_bodytext",
    ]
    redundant_features = length_features + extra_mattr_features

    task_suffix = "_v_arxiv" if with_arxiv else ""
    banner = "=" * 20
    base_venues = (
        "AAAI", "ACL", "COLING", "CVPR",
        "EMNLP", "ICML", "ICRA", "IJCAI",
        "NAACL", "NIPS",
    )
    for base_venue in base_venues:
        venue = base_venue + task_suffix
        print(banner + venue + banner)
        in_task = features.venue.isin(get_venue_names(venue))
        venue_feat = features[in_task]
        if drop_redundant_features:
            venue_feat = venue_feat.drop(columns=redundant_features)
        f1_scores_str, tp_str, bm_str = select_features_sweep_params(venue_feat, verbose)
        # One comma-separated record per venue; the suffix ends the line.
        for field in (venue, f1_scores_str, tp_str, bm_str):
            fprint(field, end=", ")
        fprint(fprint_suffix)
        
# Sweep with the plain venue label files (no "_v_arxiv" suffix), keeping every feature column.
sweep_params_different_venues(with_arxiv=False, drop_redundant_features=False, verbose=True, fprint_suffix="no_arxiv_all_features")

Acc: mean 0.8795, std 0.0078; F1: mean 0.9358, std 0.0044
Top5 feats: flesch_read_ease_bodytext surprisal_abstract_mean n_outbound_citations surprisal_bodytext_mean voice_abstract_active
Acc: mean 0.8195, std 0.0039; F1: mean 0.9000, std 0.0022
Top5 feats: voice_bodytext_passive n_outbound_citations outbound_citations_per_word flesch_kincaid_grade_level_bodytext bodytext_sent_counts
Acc: mean 0.8008, std 0.0048; F1: mean 0.8887, std 0.0034
Top5 feats: num_sections lex_mattr_5_bodytext n_outbound_citations title_word_length outbound_citations_per_word
Acc: mean 1.0000, std 0.0000; F1: mean 1.0000, std 0.0000
Top5 feats: pos_abstract_ADJ flesch_read_ease_abstract lex_mattr_40_bodytext lex_mattr_30_bodytext lex_mattr_20_bodytext
Acc: mean 0.8211, std 0.0151; F1: mean 0.9003, std 0.0089
Top5 feats: grammar_errors_bodytext num_sections n_outbound_citations title_word_length sent_lens_abs_var
Acc: mean 0.9778, std 0.0099; F1: mean 0.9887, std 0.0051
Top5 feats: sent_lens_abs_var flesch_read_

In [11]:
# Same sweep using the "<venue>_v_arxiv" label files, still with every feature column.
sweep_params_different_venues(with_arxiv=True, drop_redundant_features=False, fprint_suffix="with_arxiv_all_features")

Acc: mean 0.6699, std 0.0499; F1: mean 0.7650, std 0.0339
Top5 feats: n_author num_sections title_word_length n_outbound_citations sent_lens_abs_mean
Acc: mean 0.7712, std 0.0051; F1: mean 0.8688, std 0.0037
Top5 feats: bodytext_sent_counts bodytext_word_counts outbound_citations_per_word n_outbound_citations abstract_word_counts
Acc: mean 0.7220, std 0.0073; F1: mean 0.8356, std 0.0067
Top5 feats: num_sections title_word_length surprisal_abstract_std n_author bodytext_word_counts
Acc: mean 0.8461, std 0.0031; F1: mean 0.9111, std 0.0020
Top5 feats: bodytext_word_counts num_sections bodytext_sent_counts abstract_word_counts grammar_errors_bodytext
Acc: mean 0.6723, std 0.0295; F1: mean 0.7592, std 0.0215
Top5 feats: bodytext_word_counts outbound_citations_per_word grammar_errors_bodytext abstract_word_counts bodytext_sent_counts
Acc: mean 0.7045, std 0.0408; F1: mean 0.6651, std 0.0353
Top5 feats: n_outbound_citations bodytext_word_counts abstract_word_counts outbound_citations_per_wor

### Drop redundant features
Drop the article-length features and the extra MATTR window sizes (only MATTR_10 is kept).

In [12]:
#sweep_params_different_venues(with_arxiv=False, drop_redundant_features=True)

In [12]:
# "<venue>_v_arxiv" sweep after dropping the columns listed in `redundant_features`.
sweep_params_different_venues(with_arxiv=True, drop_redundant_features=True, fprint_suffix="drop_length_features")

Acc: mean 0.6779, std 0.0467; F1: mean 0.7783, std 0.0345
Top5 feats: outbound_citations_per_word n_author voice_bodytext_other surprisal_bodytext_mean grammar_errors_bodytext
Acc: mean 0.7645, std 0.0076; F1: mean 0.8651, std 0.0043
Top5 feats: flesch_kincaid_grade_level_bodytext title_word_length voice_bodytext_active grammar_errors_abstract voice_bodytext_passive
Acc: mean 0.7258, std 0.0049; F1: mean 0.8399, std 0.0046
Top5 feats: surprisal_abstract_std flesch_kincaid_grade_level_bodytext n_author title_word_length surprisal_abstract_mean
Acc: mean 0.8209, std 0.0060; F1: mean 0.8992, std 0.0032
Top5 feats: grammar_errors_bodytext abstract_word_counts surprisal_bodytext_mean n_outbound_citations title_word_length
Acc: mean 0.6485, std 0.0240; F1: mean 0.7456, std 0.0228
Top5 feats: outbound_citations_per_word n_author grammar_errors_abstract abstract_word_counts n_outbound_citations
Acc: mean 0.6524, std 0.0669; F1: mean 0.6519, std 0.0276
Top5 feats: n_outbound_citations abstract_

## Use a subset of writing features

In [14]:
# Lookup used below as: category name -> feature column names in that category.
with open("../data/features_by_category.json", "r") as f:
    features_by_category = json.load(f)

@timed_func
def sweep_params_partial_features(cat_choices=["surprisal"], with_arxiv=True, fprint_suffix="test"):
    """Run the per-venue model sweep using only the features of the chosen categories.

    Args:
        cat_choices: Keys of ``features_by_category``; their feature names are
            concatenated in the given order.
        with_arxiv: When True, "_v_arxiv" is appended to each venue name before
            the label file is looked up.
        fprint_suffix: Tag written as the last field of every result line.
    """
    feat_names = [
        name
        for category in cat_choices
        for name in features_by_category[category]
    ]
    print ("Feature choices: {}. N. features: {}".format(
        feat_names, len(feat_names)
    ))

    # The label column has to travel with the features for the classifier sweep.
    wanted_columns = [*feat_names, 'venue_is_top']
    task_suffix = "_v_arxiv" if with_arxiv else ""
    banner = "=" * 20
    base_venues = (
        "AAAI", "ACL", "COLING", "CVPR",
        "EMNLP", "ICML", "ICRA", "IJCAI",
        "NAACL", "NIPS",
    )
    for base_venue in base_venues:
        venue = base_venue + task_suffix
        print ("\n" + banner + venue + banner)
        in_task = features.venue.isin(get_venue_names(venue))
        selected_feat = features[in_task][wanted_columns]
        f1_scores_str, tp_str, bm_str = select_features_sweep_params(selected_feat)

        # One comma-separated record per venue; the suffix ends the line.
        for field in (venue, f1_scores_str, tp_str, bm_str):
            fprint(field, end=", ")
        fprint(fprint_suffix)
        
# Part-of-speech ratio features only.
sweep_params_partial_features(['pos'], fprint_suffix="partial_features_pos")

Feature choices: ['pos_abstract_ADJ', 'pos_abstract_ADV', 'pos_abstract_ADP', 'pos_abstract_AUX', 'pos_abstract_CCONJ', 'pos_abstract_DET', 'pos_abstract_INTJ', 'pos_abstract_NOUN', 'pos_abstract_NUM', 'pos_abstract_PART', 'pos_abstract_PRON', 'pos_abstract_PROPN', 'pos_abstract_SPACE', 'pos_abstract_VERB', 'pos_bodytext_ADJ', 'pos_bodytext_ADV', 'pos_bodytext_ADP', 'pos_bodytext_AUX', 'pos_bodytext_CCONJ', 'pos_bodytext_DET', 'pos_bodytext_INTJ', 'pos_bodytext_NOUN', 'pos_bodytext_NUM', 'pos_bodytext_PART', 'pos_bodytext_PRON', 'pos_bodytext_PROPN', 'pos_bodytext_SPACE', 'pos_bodytext_VERB']. N. features: 28

Acc: mean 0.6330, std 0.0036; F1: mean 0.7688, std 0.0118
Top5 feats: pos_bodytext_NOUN pos_abstract_NOUN pos_abstract_CCONJ pos_bodytext_CCONJ pos_bodytext_PROPN

Acc: mean 0.7659, std 0.0014; F1: mean 0.8672, std 0.0012
Top5 feats: pos_abstract_PROPN pos_bodytext_ADP pos_abstract_ADP pos_bodytext_ADJ pos_bodytext_CCONJ

Acc: mean 0.7269, std 0.0015; F1: mean 0.8418, std 0.0010


In [15]:
# RST relation features only.
sweep_params_partial_features(['rst'], fprint_suffix="partial_features_rst")

Feature choices: ['rst_Attribution', 'rst_Background', 'rst_Cause', 'rst_Comparison', 'rst_Condition', 'rst_Contrast', 'rst_Elaboration', 'rst_Enablement', 'rst_Evaluation', 'rst_Explanation', 'rst_Joint', 'rst_Manner-Means', 'rst_Topic-Comment', 'rst_Summary', 'rst_Temporal', 'rst_Topic-Change', 'rst_textual-organization', 'rst_same-unit']. N. features: 18

Acc: mean 0.6202, std 0.0247; F1: mean 0.7561, std 0.0314
Top5 feats: rst_Contrast rst_Temporal rst_Explanation rst_Manner-Means rst_Elaboration

Acc: mean 0.7669, std 0.0007; F1: mean 0.8680, std 0.0005
Top5 feats: rst_Manner-Means rst_Summary rst_textual-organization rst_Cause rst_Attribution

Acc: mean 0.7263, std 0.0022; F1: mean 0.8413, std 0.0019
Top5 feats: rst_Summary rst_Background rst_Evaluation rst_Attribution rst_Contrast

Acc: mean 0.8069, std 0.0027; F1: mean 0.8931, std 0.0016
Top5 feats: rst_Evaluation rst_same-unit rst_Comparison rst_Cause rst_Background

Acc: mean 0.5966, std 0.0243; F1: mean 0.7395, std 0.0303
To

In [16]:
# Surprisal features only.
sweep_params_partial_features(['surprisal'], fprint_suffix="partial_features_surprisal")

Feature choices: ['surprisal_abstract_mean', 'surprisal_abstract_std', 'surprisal_bodytext_mean', 'surprisal_bodytext_std']. N. features: 4

Acc: mean 0.6442, std 0.0192; F1: mean 0.7790, std 0.0079
Top5 feats: surprisal_bodytext_mean surprisal_bodytext_std surprisal_abstract_std surprisal_abstract_mean

Acc: mean 0.7666, std 0.0019; F1: mean 0.8675, std 0.0016
Top5 feats: surprisal_abstract_mean surprisal_bodytext_std surprisal_abstract_std surprisal_bodytext_mean

Acc: mean 0.7274, std 0.0048; F1: mean 0.8417, std 0.0035
Top5 feats: surprisal_bodytext_mean surprisal_abstract_std surprisal_bodytext_std surprisal_abstract_mean

Acc: mean 0.8080, std 0.0006; F1: mean 0.8938, std 0.0004
Top5 feats: surprisal_bodytext_std surprisal_abstract_std surprisal_bodytext_mean surprisal_abstract_mean

Acc: mean 0.6050, std 0.0069; F1: mean 0.7512, std 0.0086
Top5 feats: surprisal_bodytext_mean surprisal_bodytext_std surprisal_abstract_mean surprisal_abstract_std

Acc: mean 0.4479, std 0.0112; F1: 

In [17]:
# Grammar-error count features only.
sweep_params_partial_features(['grammar'], fprint_suffix="partial_features_grammar")

Feature choices: ['grammar_errors_abstract', 'grammar_errors_bodytext']. N. features: 2

Acc: mean 0.6346, std 0.0056; F1: mean 0.7747, std 0.0033
Top5 feats: grammar_errors_abstract grammar_errors_bodytext

Acc: mean 0.7680, std 0.0019; F1: mean 0.8684, std 0.0009
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

Acc: mean 0.7274, std 0.0016; F1: mean 0.8422, std 0.0011
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

Acc: mean 0.8109, std 0.0031; F1: mean 0.8947, std 0.0014
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

Acc: mean 0.6303, std 0.0188; F1: mean 0.7570, std 0.0167
Top5 feats: grammar_errors_abstract grammar_errors_bodytext

Acc: mean 0.5542, std 0.0728; F1: mean 0.6010, std 0.0385
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

Acc: mean 0.9417, std 0.0032; F1: mean 0.9700, std 0.0017
Top5 feats: grammar_errors_abstract grammar_errors_bodytext

Acc: mean 0.6835, std 0.0164; F1: mean 0.7911, std 0.0233
Top5 feats: grammar_

In [18]:
# Readability (Flesch) features only.
sweep_params_partial_features(['readability'], fprint_suffix="partial_features_readability")

Feature choices: ['flesch_read_ease_abstract', 'flesch_read_ease_bodytext', 'flesch_kincaid_grade_level_abstract', 'flesch_kincaid_grade_level_bodytext']. N. features: 4

Acc: mean 0.6330, std 0.0103; F1: mean 0.7679, std 0.0080
Top5 feats: flesch_kincaid_grade_level_bodytext flesch_kincaid_grade_level_abstract flesch_read_ease_bodytext flesch_read_ease_abstract

Acc: mean 0.7645, std 0.0053; F1: mean 0.8663, std 0.0038
Top5 feats: flesch_kincaid_grade_level_bodytext flesch_kincaid_grade_level_abstract flesch_read_ease_bodytext flesch_read_ease_abstract

Acc: mean 0.7237, std 0.0048; F1: mean 0.8395, std 0.0034
Top5 feats: flesch_kincaid_grade_level_abstract flesch_kincaid_grade_level_bodytext flesch_read_ease_abstract flesch_read_ease_bodytext

Acc: mean 0.8080, std 0.0006; F1: mean 0.8938, std 0.0004
Top5 feats: flesch_kincaid_grade_level_bodytext flesch_kincaid_grade_level_abstract flesch_read_ease_abstract flesch_read_ease_bodytext

Acc: mean 0.6120, std 0.0184; F1: mean 0.7560, st

In [19]:
# MATTR features only (all window sizes).
sweep_params_partial_features(['mattr'], fprint_suffix="partial_features_mattr")

Feature choices: ['lex_mattr_5_abstract', 'lex_mattr_10_abstract', 'lex_mattr_20_abstract', 'lex_mattr_30_abstract', 'lex_mattr_40_abstract', 'lex_mattr_5_bodytext', 'lex_mattr_10_bodytext', 'lex_mattr_20_bodytext', 'lex_mattr_30_bodytext', 'lex_mattr_40_bodytext']. N. features: 10

Acc: mean 0.6138, std 0.0510; F1: mean 0.7526, std 0.0555
Top5 feats: lex_mattr_40_abstract lex_mattr_30_abstract lex_mattr_5_bodytext lex_mattr_40_bodytext lex_mattr_10_bodytext

Acc: mean 0.7666, std 0.0009; F1: mean 0.8678, std 0.0006
Top5 feats: lex_mattr_30_abstract lex_mattr_20_abstract lex_mattr_40_bodytext lex_mattr_40_abstract lex_mattr_5_bodytext

Acc: mean 0.7242, std 0.0081; F1: mean 0.8399, std 0.0057
Top5 feats: lex_mattr_40_bodytext lex_mattr_30_bodytext lex_mattr_5_bodytext lex_mattr_5_abstract lex_mattr_20_bodytext

Acc: mean 0.8080, std 0.0006; F1: mean 0.8938, std 0.0004
Top5 feats: lex_mattr_5_bodytext lex_mattr_30_bodytext lex_mattr_40_bodytext lex_mattr_20_bodytext lex_mattr_5_abstract

In [20]:
# Voice (active/passive/other) features only.
sweep_params_partial_features(['voice'], fprint_suffix="partial_features_voice")

Feature choices: ['voice_abstract_active', 'voice_abstract_passive', 'voice_abstract_other', 'voice_bodytext_active', 'voice_bodytext_passive', 'voice_bodytext_other']. N. features: 6

Acc: mean 0.6346, std 0.0111; F1: mean 0.7627, std 0.0182
Top5 feats: voice_bodytext_other voice_bodytext_passive voice_bodytext_active voice_abstract_passive voice_abstract_active

Acc: mean 0.7669, std 0.0007; F1: mean 0.8680, std 0.0005
Top5 feats: voice_bodytext_passive voice_bodytext_active voice_bodytext_other voice_abstract_other voice_abstract_passive

Acc: mean 0.7285, std 0.0034; F1: mean 0.8422, std 0.0011
Top5 feats: voice_bodytext_other voice_bodytext_passive voice_bodytext_active voice_abstract_other voice_abstract_passive

Acc: mean 0.8080, std 0.0019; F1: mean 0.8933, std 0.0015
Top5 feats: voice_bodytext_passive voice_bodytext_active voice_bodytext_other voice_abstract_active voice_abstract_passive

Acc: mean 0.6162, std 0.0143; F1: mean 0.7546, std 0.0137
Top5 feats: voice_bodytext_othe

In [21]:
# Surprisal and readability features combined.
sweep_params_partial_features(['surprisal', 'readability'], fprint_suffix="partial_features_surprisal_readability")

Feature choices: ['surprisal_abstract_mean', 'surprisal_abstract_std', 'surprisal_bodytext_mean', 'surprisal_bodytext_std', 'flesch_read_ease_abstract', 'flesch_read_ease_bodytext', 'flesch_kincaid_grade_level_abstract', 'flesch_kincaid_grade_level_bodytext']. N. features: 8

Acc: mean 0.6330, std 0.0196; F1: mean 0.7693, std 0.0127
Top5 feats: flesch_kincaid_grade_level_bodytext surprisal_abstract_std surprisal_bodytext_mean surprisal_bodytext_std flesch_kincaid_grade_level_abstract

Acc: mean 0.7655, std 0.0041; F1: mean 0.8669, std 0.0026
Top5 feats: surprisal_bodytext_std surprisal_abstract_mean flesch_kincaid_grade_level_bodytext surprisal_bodytext_mean surprisal_abstract_std

Acc: mean 0.7215, std 0.0100; F1: mean 0.8375, std 0.0071
Top5 feats: flesch_read_ease_abstract surprisal_bodytext_mean surprisal_bodytext_std flesch_kincaid_grade_level_abstract surprisal_abstract_mean

Acc: mean 0.8060, std 0.0032; F1: mean 0.8925, std 0.0021
Top5 feats: surprisal_bodytext_std flesch_kinca