# Predict Citation Counts

In [21]:
from collections import OrderedDict
import pickle 
import pandas as pd 
import json
import numpy as np
import sklearn
from pathlib import Path

print("sklearn version:", sklearn.__version__)
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.metrics import mean_squared_error

from utils import timed_func

import warnings
warnings.filterwarnings('ignore')

sklearn version: 0.24.1


In [2]:
def fprint(s, outfile="20210502_export.out", end="\n"):
    with open(outfile, "a+") as f:
        f.write(s + end)

In [3]:
df_ai_labeled = pd.read_csv("../data/df_ai_labeled.csv")
print(df_ai_labeled.shape)
df_ai_labeled.head()

(6085, 9)


Unnamed: 0,venue,count,label,NLP,Speech,ML,AI,CV,Robo
0,INTERSPEECH,10952,1.0,False,True,False,False,False,False
1,IJCAI,7456,1.0,False,False,False,True,False,False
2,AAAI,5831,1.0,False,False,False,True,False,False
3,LREC,5245,1.0,True,False,False,False,False,False
4,NIPS,3991,1.0,False,False,True,False,False,False


In [4]:
features = pd.read_csv("../data/features_v2_with_venue.csv")
print(features.shape)
features.head()

(945676, 91)


Unnamed: 0,paper_id,venue,venue_category,venue_is_top,pos_abstract_ADJ,pos_abstract_ADV,pos_abstract_ADP,pos_abstract_AUX,pos_abstract_CCONJ,pos_abstract_DET,...,rst_Manner-Means,rst_Topic-Comment,rst_Summary,rst_Temporal,rst_Topic-Change,rst_textual-organization,rst_same-unit,grammar_errors_abstract,grammar_errors_bodytext,outbound_citations_per_word
0,18981111,Journal of Special Education Technology,Other,0.0,0.17284,0.018519,0.154321,0.0,0.018519,0.154321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,5,88,0.002796
1,18982496,Publications,Other,0.0,0.081481,0.037037,0.162963,0.0,0.02963,0.118519,...,0.125,0.0,0.0,0.0,0.0,0.0,0.0,3,51,0.00708
2,18983391,Canadian Conference on Electrical and Computer...,Other,0.0,0.093567,0.011696,0.157895,0.0,0.023392,0.152047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,2,70,0.001625
3,199668887,RecSys '19,Other,0.0,0.07732,0.056701,0.123711,0.0,0.046392,0.108247,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,49,0.009973
4,199668943,ArXiv,Other,0.0,0.25,0.044872,0.108974,0.0,0.044872,0.089744,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,2,87,0.007781


In [5]:
def get_venue_names(option='EMNLP'):
    base_path = "../data/venue_name_labels"
    df = pd.read_csv(Path(base_path, f"{option}.csv"))
    D = {}
    for i, row in df.iterrows():
        D[row.venue] = row.label
    return D


venue_feat = features[features.venue.isin(get_venue_names("EMNLP_v_arxiv"))]

In [23]:
def drop_if_exists(df, cols):
    for c in cols:
        if c in df.columns:
            df = df.drop(columns=c)
    return df 

def regression_select_features(venue_feat, models, verbose=False):
    Y = venue_feat['annual_citations'].values
    df = drop_if_exists(venue_feat, [
        "venue", "venue_category", "paper_id", "venue_is_top",
        "n_citations", "annual_citations"])
    X = df.values

    kf = KFold(n_splits=6)
    importances = []
    
    fold_mse_scores = []
    best_model_names = []
    for trval_idx, test_idx in kf.split(X, Y):
        # Sweep through models in these folds. Choose the best one. Classify
        X_train, X_dev, Y_train, Y_dev = train_test_split(
            X[trval_idx], Y[trval_idx], test_size=0.2
        )
        X_test, Y_test = X[test_idx], Y[test_idx]
        mse_scores = []
        trained_models = []
        for model_name in models:
            model = sklearn.base.clone(models[model_name])
            try:
                model.fit(X_train, Y_train)
                Y_dev_pred = model.predict(X_dev)
                mse_scores.append(mean_squared_error(Y_dev, Y_dev_pred))
            except ValueError:
                mse_scores.append(np.inf)
            trained_models.append(model)
            
        best_id = np.argmin(mse_scores)
        model = trained_models[best_id]
        Y_test_pred = model.predict(X_test)
        fold_mse_scores.append(mean_squared_error(Y_test, Y_test_pred))
        
        best_model_name = list(models.keys())[best_id]
        best_model_names.append(best_model_name)
    
        # Select features only for the best model
        selector = SelectFromModel(model)
        selector.fit(X[trval_idx], Y[trval_idx])
        if hasattr(model, "coef_"):
            importances.append(np.absolute(model.coef_[0]))
        elif hasattr(model, "feature_importances_"):
            importances.append(model.feature_importances_)
        else:
            pass  # Model doesn't support selecting features
        
    mse_mean, mse_std = np.mean(fold_mse_scores), np.std(fold_mse_scores)
    if len(importances) > 0:
        mean_imp = np.mean(importances, axis=0)
        top_features = np.array(list(df))[np.argsort(-mean_imp)]
        top_features_str = "Top5 feats: " + " ".join(top_features[:5])
    else:
        top_features = None
        top_features_str = ""
        
    if verbose:
        print("MSE: mean {:.4f}, std {:.4f}".format(mse_mean, mse_std))
        print(top_features_str)
        
    fold_mse_scores_str = ", ".join(["{:.4f}".format(fs) for fs in fold_mse_scores])
    bm_str = ", ".join(best_model_names)
    return fold_mse_scores_str, top_features_str, bm_str
    
regression_select_features(venue_feat, models={'test': LinearSVR()}, verbose=True)

MSE: mean 1082653.4803, std 2397760.6833
Top5 feats: pos_abstract_ADJ


('2587.0946, 6919.4243, 6444141.7606, 36287.7303, 93.7762, 5891.0955',
 'Top5 feats: pos_abstract_ADJ',
 'test, test, test, test, test, test')

In [25]:
def select_features_sweep_params(venue_feat, verbose=True):
    models = OrderedDict({
        # Models that can select features:
        "linearsvr_l1_C0.5": LinearSVR(loss="epsilon_insensitive", C=0.5),
        "linearsvr_l1_C1.0": LinearSVR(loss="epsilon_insensitive", C=1.0),
        "linearsvr_l1_C2.0": LinearSVR(loss="epsilon_insensitive", C=2.0),
        "linearsvr_l2_C0.5": LinearSVR(loss="squared_epsilon_insensitive", C=0.5),
        "linearsvr_l2_C1.0": LinearSVR(loss="squared_epsilon_insensitive", C=1.0),
        "linearsvr_l2_C2.0": LinearSVR(loss="squared_epsilon_insensitive", C=2.0),
        "linreg_default": LinearRegression(fit_intercept=True, normalize=False),
        "linreg_normalize": LinearRegression(fit_intercept=True, normalize=True),
        "linreg_nointercept": LinearRegression(fit_intercept=False),  # `normalize` is ignored
        "extratrees_16": ExtraTreesRegressor(n_estimators=16, random_state=0),
        "extratrees_32": ExtraTreesRegressor(n_estimators=32, random_state=0),
        "extratrees_64": ExtraTreesRegressor(n_estimators=64, random_state=0),
        "extratrees_128": ExtraTreesRegressor(n_estimators=128, random_state=0),
        "randomforest_50": RandomForestRegressor(n_estimators=50, random_state=0),
        "randomforest_100": RandomForestRegressor(n_estimators=100, random_state=0),
        "randomforest_200": RandomForestRegressor(n_estimators=200, random_state=0),
        "gb_depth2": GradientBoostingRegressor(max_depth=2), 
        "gb_depth3": GradientBoostingRegressor(max_depth=3), 
        "gb_depth4": GradientBoostingRegressor(max_depth=4),
        "gb_depth5": GradientBoostingRegressor(max_depth=5),
        
        # Models without support to selecting features:
        "mlp_10": MLPRegressor([10]),
        "mlp_20": MLPRegressor([20]),
        "mlp_40": MLPRegressor([40]),
        "mlp_80": MLPRegressor([80]),
        "mlp_10_10": MLPRegressor([10,10]),
        "mlp_20_20": MLPRegressor([20,20]),
        "mlp_20_20": MLPRegressor([40,40])
    })
    
    return regression_select_features(venue_feat, models, verbose)

select_features_sweep_params(venue_feat, verbose=True)

MSE: mean 22778.2270, std 42515.7419
Top5 feats: rst_Cause num_sections sent_lens_abs_var pos_abstract_DET pos_bodytext_DET


('105.0034, 6856.0107, 117619.0763, 7646.7788, 98.5752, 4343.9174',
 'Top5 feats: rst_Cause num_sections sent_lens_abs_var pos_abstract_DET pos_bodytext_DET',
 'extratrees_128, linearsvr_l2_C2.0, linearsvr_l2_C0.5, mlp_40, linearsvr_l1_C0.5, linearsvr_l1_C2.0')

In [26]:
@timed_func
def sweep_params_different_venues(with_arxiv=True, drop_redundant_features=False, verbose=True, fprint_suffix="test"):
    venues = [
        "AAAI", "ACL", "COLING", "CVPR", 
        "EMNLP", "ICML", "ICRA", "IJCAI",
        "NAACL", "NIPS"
    ]
    redundant_features = [
        "num_sections", "bodytext_word_counts", "bodytext_sent_counts",  # Remove article length features
        "lex_mattr_5_abstract", "lex_mattr_20_abstract", "lex_mattr_30_abstract", "lex_mattr_40_abstract",  # Only keep MATTR_10
        "lex_mattr_5_bodytext", "lex_mattr_20_bodytext", "lex_mattr_30_bodytext", "lex_mattr_40_bodytext"
    ]
    for venue in venues:
        if with_arxiv:
            venue += "_v_arxiv"
        print ("="*20 + venue + "="*20)
        venue_feat = features[features.venue.isin(get_venue_names(venue))]
        if drop_redundant_features:
            venue_feat = venue_feat.drop(columns=redundant_features)
        scores_str, tp_str, bm_str = select_features_sweep_params(venue_feat, verbose)
        fprint(venue, end=", ")
        fprint(scores_str, end=", ")
        fprint(tp_str, end=", ")
        fprint(bm_str, end=", ")
        fprint(fprint_suffix)
        
sweep_params_different_venues(with_arxiv=True, drop_redundant_features=False, verbose=True, fprint_suffix="with_arxiv_all_features")

MSE: mean 21.3425, std 23.9958
Top5 feats: pos_abstract_VERB bodytext_sent_counts pos_bodytext_NOUN rst_Enablement voice_abstract_passive
MSE: mean 403.8326, std 571.8074
Top5 feats: rst_Cause pos_abstract_DET sent_lens_abs_var num_sections pos_bodytext_DET
MSE: mean 439.8693, std 913.7700
Top5 feats: n_outbound_citations bodytext_word_counts lex_mattr_30_bodytext voice_bodytext_passive surprisal_abstract_std
MSE: mean 15356.5981, std 22378.6217
Top5 feats: rst_Temporal n_author surprisal_bodytext_mean pos_abstract_NUM rst_Attribution
MSE: mean 1331.9019, std 2455.1402
Top5 feats: rst_Cause sent_lens_abs_var pos_abstract_DET num_sections pos_bodytext_DET
MSE: mean 1277.2375, std 1061.0852
Top5 feats: surprisal_abstract_mean lex_mattr_40_bodytext voice_bodytext_active title_word_length voice_abstract_passive
MSE: mean 37.2467, std 15.3532
Top5 feats: voice_bodytext_other n_author surprisal_bodytext_mean lex_mattr_30_bodytext rst_Comparison
MSE: mean 33.1823, std 18.8831
Top5 feats: body

In [27]:
sweep_params_different_venues(with_arxiv=True, drop_redundant_features=True, verbose=True, fprint_suffix="drop_redundant_features")

MSE: mean 29.3817, std 37.2044
Top5 feats: flesch_read_ease_abstract rst_same-unit voice_abstract_other surprisal_bodytext_std sent_lens_bodytext_mean
MSE: mean 396.0324, std 576.7193
Top5 feats: pos_bodytext_DET pos_abstract_DET sent_lens_abs_var pos_bodytext_PROPN rst_Condition
MSE: mean 686.6369, std 973.0085
Top5 feats: abstract_word_counts rst_Cause rst_Comparison surprisal_abstract_std n_outbound_citations
MSE: mean 15267.3467, std 22457.1814
Top5 feats: pos_abstract_ADJ
MSE: mean 56406.0774, std 119414.4576
Top5 feats: sent_lens_bodytext_var rst_Cause rst_Topic-Comment rst_Elaboration sent_lens_abs_var
MSE: mean 1324.2803, std 1166.3895
Top5 feats: lex_mattr_10_bodytext surprisal_abstract_mean surprisal_bodytext_mean n_author rst_Attribution
MSE: mean 38.3930, std 17.0749
Top5 feats: pos_abstract_ADJ flesch_kincaid_grade_level_bodytext flesch_kincaid_grade_level_abstract flesch_read_ease_bodytext flesch_read_ease_abstract
MSE: mean 32.6076, std 20.6907
Top5 feats: sent_lens_body

## Partial Features

In [29]:
with open("../data/features_by_category.json", "r") as f:
    features_by_category = json.loads(f.read())

@timed_func
def sweep_params_partial_features(cat_choices=["surprisal"], with_arxiv=True, fprint_suffix="test"):
    venues = [
        "AAAI", "ACL", "COLING", "CVPR", 
        "EMNLP", "ICML", "ICRA", "IJCAI",
        "NAACL", "NIPS"
    ]
    feat_names = []
    for cc in cat_choices:
        feat_names += features_by_category[cc]
    print ("Feature choices: {}. N. features: {}".format(
        feat_names, len(feat_names)
    ))
    
    for venue in venues:
        if with_arxiv:
            venue += "_v_arxiv"
        print ("\n" + "="*20 + venue + "="*20)
        venue_feat = features[features.venue.isin(get_venue_names(venue))]
        selected_feat = venue_feat[feat_names + ['annual_citations']]
        scores_str, tp_str, bm_str = select_features_sweep_params(selected_feat)
        
        fprint(venue, end=", ")
        fprint(scores_str, end=", ")
        fprint(tp_str, end=", ")
        fprint(bm_str, end=", ")
        fprint(fprint_suffix)
        
sweep_params_partial_features(['pos'], fprint_suffix="partial_features_pos")

Feature choices: ['pos_abstract_ADJ', 'pos_abstract_ADV', 'pos_abstract_ADP', 'pos_abstract_AUX', 'pos_abstract_CCONJ', 'pos_abstract_DET', 'pos_abstract_INTJ', 'pos_abstract_NOUN', 'pos_abstract_NUM', 'pos_abstract_PART', 'pos_abstract_PRON', 'pos_abstract_PROPN', 'pos_abstract_SPACE', 'pos_abstract_VERB', 'pos_bodytext_ADJ', 'pos_bodytext_ADV', 'pos_bodytext_ADP', 'pos_bodytext_AUX', 'pos_bodytext_CCONJ', 'pos_bodytext_DET', 'pos_bodytext_INTJ', 'pos_bodytext_NOUN', 'pos_bodytext_NUM', 'pos_bodytext_PART', 'pos_bodytext_PRON', 'pos_bodytext_PROPN', 'pos_bodytext_SPACE', 'pos_bodytext_VERB']. N. features: 28

MSE: mean 21.0169, std 24.8617
Top5 feats: pos_abstract_ADJ

MSE: mean 391.8178, std 584.3096
Top5 feats: pos_abstract_ADJ

MSE: mean 439.0154, std 918.5922
Top5 feats: pos_abstract_DET pos_bodytext_PRON pos_bodytext_DET pos_abstract_PRON pos_bodytext_VERB

MSE: mean 15175.0688, std 22419.9224
Top5 feats: pos_abstract_ADJ

MSE: mean 1193.4425, std 2545.5859


MSE: mean 1295.0539,

In [30]:
sweep_params_partial_features(['rst'], fprint_suffix="partial_features_rst")

Feature choices: ['rst_Attribution', 'rst_Background', 'rst_Cause', 'rst_Comparison', 'rst_Condition', 'rst_Contrast', 'rst_Elaboration', 'rst_Enablement', 'rst_Evaluation', 'rst_Explanation', 'rst_Joint', 'rst_Manner-Means', 'rst_Topic-Comment', 'rst_Summary', 'rst_Temporal', 'rst_Topic-Change', 'rst_textual-organization', 'rst_same-unit']. N. features: 18

MSE: mean 20.8641, std 24.2725
Top5 feats: rst_Attribution

MSE: mean 390.2361, std 580.3999


MSE: mean 441.2745, std 916.4248
Top5 feats: rst_Elaboration rst_Joint rst_same-unit rst_Attribution rst_Enablement

MSE: mean 15513.2762, std 22476.9671
Top5 feats: rst_Attribution rst_Condition rst_Enablement rst_Elaboration rst_Joint

MSE: mean 1202.7144, std 2533.6052
Top5 feats: rst_Contrast rst_Attribution rst_Elaboration rst_same-unit rst_Joint

MSE: mean 1286.0802, std 1088.8360
Top5 feats: rst_Attribution

MSE: mean 39.4973, std 14.1439
Top5 feats: rst_Enablement rst_Elaboration rst_Attribution rst_Joint rst_same-unit

MSE: mean 

In [31]:
sweep_params_partial_features(['surprisal'], fprint_suffix="partial_features_surprisal")

Feature choices: ['surprisal_abstract_mean', 'surprisal_abstract_std', 'surprisal_bodytext_mean', 'surprisal_bodytext_std']. N. features: 4

MSE: mean 21.4659, std 24.9175
Top5 feats: surprisal_abstract_mean surprisal_bodytext_std surprisal_bodytext_mean surprisal_abstract_std

MSE: mean 389.8730, std 580.6669


MSE: mean 437.8171, std 918.8062
Top5 feats: surprisal_abstract_mean

MSE: mean 15283.0747, std 22600.0870
Top5 feats: surprisal_abstract_mean

MSE: mean 1199.2562, std 2546.5137
Top5 feats: surprisal_abstract_mean

MSE: mean 1316.7482, std 1052.0891
Top5 feats: surprisal_abstract_mean surprisal_bodytext_std surprisal_abstract_std surprisal_bodytext_mean

MSE: mean 37.1593, std 15.8889
Top5 feats: surprisal_abstract_mean

MSE: mean 24.5820, std 22.3632
Top5 feats: surprisal_bodytext_mean surprisal_abstract_std surprisal_abstract_mean surprisal_bodytext_std

MSE: mean 421.1677, std 781.3082
Top5 feats: surprisal_abstract_mean

MSE: mean 3312.3607, std 4762.3261
Top5 feats: surpr

In [32]:
sweep_params_partial_features(['grammar'], fprint_suffix="partial_features_grammar")

Feature choices: ['grammar_errors_abstract', 'grammar_errors_bodytext']. N. features: 2

MSE: mean 21.0450, std 24.1410
Top5 feats: grammar_errors_abstract

MSE: mean 389.8601, std 580.2091
Top5 feats: grammar_errors_abstract

MSE: mean 438.0853, std 918.7083
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

MSE: mean 15280.3519, std 22553.3193
Top5 feats: grammar_errors_abstract

MSE: mean 1236.6575, std 2541.5492
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

MSE: mean 1288.9254, std 1102.2049
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

MSE: mean 37.5738, std 16.7722
Top5 feats: grammar_errors_abstract

MSE: mean 24.9714, std 21.9206
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

MSE: mean 421.3301, std 780.3106
Top5 feats: grammar_errors_bodytext grammar_errors_abstract

MSE: mean 3345.8005, std 4781.7157
Top5 feats: grammar_errors_bodytext grammar_errors_abstract
sweep_params_partial_features done in 82.41 seconds (0.02 hours)

In [33]:
sweep_params_partial_features(['readability'], fprint_suffix="partial_features_readability")

Feature choices: ['flesch_read_ease_abstract', 'flesch_read_ease_bodytext', 'flesch_kincaid_grade_level_abstract', 'flesch_kincaid_grade_level_bodytext']. N. features: 4

MSE: mean 21.3832, std 24.7585
Top5 feats: flesch_read_ease_abstract flesch_read_ease_bodytext flesch_kincaid_grade_level_abstract flesch_kincaid_grade_level_bodytext

MSE: mean 392.6505, std 579.4162
Top5 feats: flesch_read_ease_bodytext flesch_kincaid_grade_level_bodytext flesch_read_ease_abstract flesch_kincaid_grade_level_abstract

MSE: mean 805.9401, std 1097.2997
Top5 feats: flesch_read_ease_bodytext flesch_kincaid_grade_level_abstract flesch_read_ease_abstract flesch_kincaid_grade_level_bodytext

MSE: mean 15285.8080, std 22554.6963
Top5 feats: flesch_read_ease_abstract

MSE: mean 1653.2471, std 2542.6305
Top5 feats: flesch_read_ease_bodytext flesch_kincaid_grade_level_bodytext flesch_read_ease_abstract flesch_kincaid_grade_level_abstract

MSE: mean 1366.1214, std 1046.4758
Top5 feats: flesch_read_ease_bodytext

In [34]:
sweep_params_partial_features(['mattr'], fprint_suffix="partial_features_mattr")

Feature choices: ['lex_mattr_5_abstract', 'lex_mattr_10_abstract', 'lex_mattr_20_abstract', 'lex_mattr_30_abstract', 'lex_mattr_40_abstract', 'lex_mattr_5_bodytext', 'lex_mattr_10_bodytext', 'lex_mattr_20_bodytext', 'lex_mattr_30_bodytext', 'lex_mattr_40_bodytext']. N. features: 10

MSE: mean 21.9378, std 24.1979
Top5 feats: lex_mattr_40_abstract lex_mattr_30_abstract lex_mattr_40_bodytext lex_mattr_20_abstract lex_mattr_10_abstract

MSE: mean 390.1041, std 580.9708
Top5 feats: lex_mattr_5_abstract

MSE: mean 438.3863, std 918.1916
Top5 feats: lex_mattr_30_bodytext lex_mattr_5_abstract lex_mattr_5_bodytext lex_mattr_20_bodytext lex_mattr_20_abstract

MSE: mean 15295.6964, std 22559.7241
Top5 feats: lex_mattr_30_bodytext lex_mattr_20_bodytext lex_mattr_5_abstract lex_mattr_40_bodytext lex_mattr_10_abstract

MSE: mean 1206.7082, std 2541.3458
Top5 feats: lex_mattr_10_abstract lex_mattr_40_bodytext lex_mattr_5_bodytext lex_mattr_20_bodytext lex_mattr_10_bodytext

MSE: mean 2267.4270, std 

In [35]:
sweep_params_partial_features(['voice'], fprint_suffix="partial_features_voice")

Feature choices: ['voice_abstract_active', 'voice_abstract_passive', 'voice_abstract_other', 'voice_bodytext_active', 'voice_bodytext_passive', 'voice_bodytext_other']. N. features: 6

MSE: mean 20.8931, std 24.3019
Top5 feats: voice_abstract_active

MSE: mean 389.2333, std 579.2111
Top5 feats: voice_abstract_active

MSE: mean 786.9370, std 862.7189
Top5 feats: voice_bodytext_other voice_bodytext_passive voice_bodytext_active voice_abstract_active voice_abstract_other

MSE: mean 15277.9055, std 22583.4493
Top5 feats: voice_bodytext_other voice_bodytext_passive voice_bodytext_active voice_abstract_active voice_abstract_passive

MSE: mean 1351.9420, std 2490.7462
Top5 feats: voice_bodytext_active voice_bodytext_other voice_bodytext_passive voice_abstract_active voice_abstract_passive

MSE: mean 1303.6088, std 1099.3045
Top5 feats: voice_bodytext_passive voice_bodytext_other voice_bodytext_active voice_abstract_active voice_abstract_passive

MSE: mean 38.8348, std 15.1706
Top5 feats: voic

In [36]:
sweep_params_partial_features(['surprisal', 'readability'], fprint_suffix="partial_features_surprisal_readability")

Feature choices: ['surprisal_abstract_mean', 'surprisal_abstract_std', 'surprisal_bodytext_mean', 'surprisal_bodytext_std', 'flesch_read_ease_abstract', 'flesch_read_ease_bodytext', 'flesch_kincaid_grade_level_abstract', 'flesch_kincaid_grade_level_bodytext']. N. features: 8

MSE: mean 30.0433, std 27.7415
Top5 feats: surprisal_abstract_mean surprisal_bodytext_std surprisal_abstract_std flesch_kincaid_grade_level_abstract flesch_kincaid_grade_level_bodytext

MSE: mean 390.0196, std 579.5319
Top5 feats: flesch_kincaid_grade_level_bodytext surprisal_abstract_std flesch_read_ease_bodytext surprisal_bodytext_mean surprisal_bodytext_std

MSE: mean 438.5706, std 916.8959
Top5 feats: surprisal_abstract_mean surprisal_abstract_std flesch_kincaid_grade_level_bodytext surprisal_bodytext_std flesch_read_ease_bodytext

MSE: mean 15278.1649, std 22528.1401
Top5 feats: surprisal_abstract_mean

MSE: mean 1204.6652, std 2542.3267
Top5 feats: surprisal_bodytext_mean surprisal_abstract_std flesch_read_e

# Baseline: Mean of Train Data Labels

In [43]:
def baseline_regression(venue_feat, verbose=True):
    Y = venue_feat['annual_citations'].values
    df = drop_if_exists(venue_feat, [
        "venue", "venue_category", "paper_id", "venue_is_top",
        "n_citations", "annual_citations"])
    X = df.values

    fold_mse_scores = []
    kf = KFold(n_splits=6)
    for train_idx, test_idx in kf.split(X, Y):
        Ytest = Y[test_idx]
        Ypred = [Y[train_idx].mean()] * len(Ytest)
        fold_mse_scores.append("{:.4f}".format(mean_squared_error(Ytest, Ypred)))
    scores_str = ", ".join(fold_mse_scores)
    top_features_str = ""
    best_models_str = " , , , , , "
    return scores_str, top_features_str, best_models_str
        
    
def run_baseline_on_venues(features, with_arxiv=True, verbose=True):
    venues = [
        "AAAI", "ACL", "COLING", "CVPR", 
        "EMNLP", "ICML", "ICRA", "IJCAI",
        "NAACL", "NIPS"
    ]
    
    for venue in venues:
        if with_arxiv:
            venue += "_v_arxiv"
        print ("="*20 + venue + "="*20)
        venue_feat = features[features.venue.isin(get_venue_names(venue))]
        
        scores_str, tp_str, bm_str = baseline_regression(venue_feat, verbose)
        fprint(venue, end=", ")
        fprint(scores_str, end=", ")
        fprint(tp_str, end=", ")
        fprint(bm_str, end=", ")
        fprint("Baseline")
        
run_baseline_on_venues(features)



# Tf-Idf Features

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
with open("../data/text_classify_articles_with_arxiv.pkl", "rb") as f:
    data = pickle.load(f)

In [45]:
for conf in data.keys():
    conf_papers = len(data[conf])
    num_main = len([x for x in data[conf] if x['label'] == 1])
    num_workshop = len([x for x in data[conf] if x['label'] == 0])
    print(conf, conf_papers, num_main, num_workshop)

AAAI 466 430 36
ACL 2543 1920 623
COLING 1761 1412 349
CVPR 3683 2609 1074
EMNLP 552 442 110
ICML 466 459 7
ICRA 734 691 43
IJCAI 674 468 206
NAACL 2245 1091 1154
NIPS 960 415 545


In [55]:
def process_vectorize(data, key="EMNLP", abstract=True, bodytext=True, max_features=1000):
    vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=max_features)
    texts = []
    labels = []
    for article in data[key]:
        article_text = []
        if abstract:
            article_text.append(article['abstract'])
        if bodytext:
            article_text.extend(article['bodytext'])
        texts.append(" ".join(article_text))
        
        labels.append(article['label'])
    X = vectorizer.fit_transform(texts).toarray()
    
    venue_tfidf_feats = {'annual_citations': labels}
    for j, name in enumerate(vectorizer.get_feature_names()):
        venue_tfidf_feats[name] = X[:, j]
    return pd.DataFrame(venue_tfidf_feats)

venue_tfidf_feats = process_vectorize(data, 'EMNLP')
print(venue_tfidf_feats.shape)
venue_tfidf_feats.head()

(552, 1001)


Unnamed: 0,annual_citations,000,10,100,11,1993,1994,1995,1996,1997,...,word in,wordnet,words,words and,words are,words in,work,would,would be,zero
0,1,0.0,0.006397,0.0,0.0,0.004563,0.0,0.0,0.0,0.0,...,0.004832,0.0,0.015844,0.0,0.009784,0.007658,0.024601,0.024373,0.016478,0.0
1,1,0.0,0.014198,0.00407,0.004408,0.0,0.008952,0.012042,0.007955,0.007556,...,0.0,0.0,0.007033,0.0,0.008685,0.0,0.004853,0.002705,0.0,0.022266
2,1,0.0,0.002943,0.0,0.0,0.004199,0.0,0.0,0.008245,0.003916,...,0.0,0.016544,0.00243,0.0,0.0,0.0,0.015091,0.002803,0.00379,0.0
3,0,0.052022,0.009876,0.009437,0.0,0.004697,0.005189,0.0,0.004611,0.008759,...,0.0,0.0,0.03533,0.0,0.010069,0.003941,0.002813,0.006271,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.010179,0.003749,0.010085,0.0,0.003164,...,0.0,0.0,0.001963,0.0,0.0,0.0,0.002032,0.0,0.0,0.0


In [None]:
def sweep_conditions():
    conditions = OrderedDict({
        "tfidf_abstract_bodytext_100": {"abstract": True, "bodytext": True, "max_features": 100},
        #"tfidf_abstract_bodytext_300": {"abstract": True, "bodytext": True, "max_features": 300},
        #"tfidf_abstract_bodytext_1k": {"abstract": True, "bodytext": True, "max_features": 1000},
        #"tfidf_abstract_bodytext_3k": {"abstract": True, "bodytext": True, "max_features": 3000},
        "tfidf_abstract_100": {"abstract": True, "bodytext": False, "max_features": 100},
        #"tfidf_abstract_300": {"abstract": True, "bodytext": False, "max_features": 300},
        #"tfidf_abstract_1k": {"abstract": True, "bodytext": False, "max_features": 1000},
        "tfidf_bodytext_100": {"abstract": False, "bodytext": True, "max_features": 100},
        #"tfidf_bodytext_300": {"abstract": False, "bodytext": True, "max_features": 300},
        #"tfidf_bodytext_1k": {"abstract": False, "bodytext": True, "max_features": 1000},
    })
    for cond_name in conditions:
        print("="*20 + cond_name + "="*20)
        for key in data:
            print(key)
            venue_tfidf_feats = process_vectorize(data, key, **conditions[cond_name])
            mse_scores_str, top_feat_str, bm_str = select_features_sweep_params(venue_tfidf_feats)
            fprint(key+"_v_arxiv", end=", ")
            fprint(mse_scores_str, end=", ")
            fprint(top_feat_str, end=", ")
            fprint(bm_str, end=", ")
            fprint(cond_name)
            
sweep_conditions()

AAAI
MSE: mean 0.0673, std 0.0173
Top5 feats: agent
ACL
MSE: mean 0.1542, std 0.0101
Top5 feats: was where we et corpus
COLING
MSE: mean 0.1455, std 0.0147
Top5 feats: for data have language it
CVPR
MSE: mean 0.1675, std 0.0065
Top5 feats: we et our used where
EMNLP
MSE: mean 0.1496, std 0.0187
Top5 feats: than al where or text
ICML
MSE: mean 0.0175, std 0.0117
Top5 feats: of between space distribution than
ICRA
MSE: mean 0.0560, std 0.0208
Top5 feats: algorithm results fig that the each
IJCAI
MSE: mean 0.1592, std 0.0114
Top5 feats: al et al et where function
NAACL
MSE: mean 0.2207, std 0.0072
Top5 feats: task that than set different
NIPS
MSE: mean 0.2009, std 0.0132
Top5 feats: et al et al performance section
AAAI
MSE: mean 0.0747, std 0.0090
Top5 feats: how agent based are agents
ACL
MSE: mean 0.1654, std 0.0084
Top5 feats: the show system propose of
COLING
MSE: mean 0.1572, std 0.0098
Top5 feats: languages data by language be
CVPR
MSE: mean 0.1890, std 0.0073
Top5 feats: we our bas