# Classify with Content Features

In [1]:
from collections import OrderedDict
import sklearn
import pickle
import numpy as np
from pathlib import Path
import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, f1_score

from utils import timed_func

In [2]:
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides any warnings emitted while the models below are being fitted.
warnings.filterwarnings('ignore')

In [3]:
# Load the pre-built corpus. Later cells index `data` by venue name and read
# the 'label', 'abstract' and 'bodytext' entries of each article.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this must only ever be pointed at a trusted file.
with open("../data/text_classify_articles_with_arxiv.pkl", "rb") as f:
    data = pickle.load(f)

In [4]:
def fprint(s, outfile="20210425_export.out", end="\n"):
    """Append ``s`` followed by ``end`` to the text file ``outfile``.

    Args:
        s: Text to write.
        outfile: Path of the file to append to; it is created if missing.
        end: Terminator written right after ``s`` (newline by default).
    """
    # Append-only mode is enough (nothing is read back), and the encoding is
    # pinned so that non-ASCII text is written the same way on every platform
    # instead of depending on the locale's default codec.
    with open(outfile, "a", encoding="utf-8") as f:
        f.write(s + end)

In [5]:
# Per-venue overview: total number of articles, then how many carry label 1
# and how many carry label 0 (presumably main-track vs. workshop papers --
# confirm against the script that built the pickle).
for conf in data:
    papers = data[conf]
    n_label1 = sum(1 for paper in papers if paper['label'] == 1)
    n_label0 = sum(1 for paper in papers if paper['label'] == 0)
    print(conf, len(papers), n_label1, n_label0)

AAAI 466 430 36
ACL 2543 1920 623
COLING 1761 1412 349
CVPR 3683 2609 1074
EMNLP 552 442 110
ICML 466 459 7
ICRA 734 691 43
IJCAI 674 468 206
NAACL 2245 1091 1154
NIPS 960 415 545


In [6]:
def process_vectorize(data, key="EMNLP", abstract=True, bodytext=True, max_features=1000):
    """Turn one venue's articles into a TF-IDF matrix plus a label vector.

    Args:
        data: Mapping from venue name to a list of article dicts. Every
            article is read for 'label' and, depending on the flags below,
            for 'abstract' (a string) and 'bodytext' (an iterable of strings).
        key: Venue whose articles are vectorised.
        abstract: Include each article's abstract in its text.
        bodytext: Include each article's body-text pieces in its text.
        max_features: Vocabulary size cap handed to ``TfidfVectorizer``.

    Returns:
        A tuple ``(X, Y, feature_names)``: the matrix produced by
        ``fit_transform`` (one row per article), a NumPy array of the
        labels, and the list of vocabulary terms as reported by the
        vectoriser.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=max_features)
    texts = []
    labels = []
    for article in data[key]:
        article_text = []
        if abstract:
            article_text.append(article['abstract'])
        if bodytext:
            article_text.extend(article['bodytext'])
        # The selected pieces are merged into a single document per article.
        texts.append(" ".join(article_text))
        labels.append(article['label'])

    X = vectorizer.fit_transform(texts)
    Y = np.array(labels)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and only fall back on old installations.
    # Either way a plain list is returned, as before.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return X, Y, feature_names

# Smoke-test the vectoriser on a single venue; the trailing expression makes
# the notebook display both shapes and the first ten feature names.
X, Y, feature_names = process_vectorize(data, 'EMNLP')
X.shape, Y.shape, feature_names[:10]

((552, 1000),
 (552,),
 ['000', '10', '100', '11', '1993', '1994', '1995', '1996', '1997', '1998'])

In [9]:
def classify_select_models(XYF, models, verbose=True):
    """
    Cross-validate a pool of models, choosing the best one per fold on a dev split.

    Same training scheme as 20210423_classify_select_features.ipynb

    For each of 6 stratified folds the non-test rows are split 80/20
    (stratified) into train and dev. Every estimator in ``models`` is cloned
    and fitted on train; the clone with the highest dev F1 is evaluated on the
    test fold, and its per-feature importances (``|coef_[0]|`` or
    ``feature_importances_``) are collected when it exposes either.

    Args:
        XYF: Tuple ``(X, Y, feature_names)``; X and Y must support indexing
            with the integer index arrays produced by ``StratifiedKFold``.
        models: Mapping from model name to an unfitted estimator; iteration
            order decides ties (the first best model wins).
        verbose: Print the accuracy/F1 summary and the top features.

    Returns:
        Tuple of three strings: the per-fold test F1 scores joined by ", ",
        the "Top5 feats: ..." summary (empty if no chosen model exposed
        importances), and the per-fold best model names joined by ", ".

    Raises:
        ValueError: If, in some fold, every candidate model failed to fit or
            to be scored on the dev split.
    """
    X, Y, feature_names = XYF
    model_names = list(models)

    skf = StratifiedKFold(n_splits=6)
    importances = []
    fold_accs = []
    fold_f1_scores = []
    best_model_names = []
    for trval_idx, test_idx in skf.split(X, Y):
        # Sweep through models in these folds. Choose the best one. Classify
        X_train, X_dev, Y_train, Y_dev = train_test_split(
            X[trval_idx], Y[trval_idx], test_size=0.2, stratify=Y[trval_idx]
        )
        X_test, Y_test = X[test_idx], Y[test_idx]
        f1_scores = []
        trained_models = []
        for model_name in model_names:
            model = sklearn.base.clone(models[model_name])
            try:
                model.fit(X_train, Y_train)
                Y_dev_pred = model.predict(X_dev)
                f1_scores.append(f1_score(Y_dev, Y_dev_pred))
            except ValueError:
                # A real F1 lies in [0, 1]; scoring a failure below that range
                # guarantees a model that failed is never preferred over one
                # that ran, even when the working model's dev F1 is exactly 0.
                f1_scores.append(-1.0)
            trained_models.append(model)

        max_id = int(np.argmax(f1_scores))
        if f1_scores[max_id] < 0:
            raise ValueError(
                "Every candidate model failed on this fold; nothing to evaluate."
            )
        model = trained_models[max_id]
        Y_test_pred = model.predict(X_test)
        fold_f1_scores.append(f1_score(Y_test, Y_test_pred))
        fold_accs.append(accuracy_score(Y_test, Y_test_pred))
        best_model_names.append(model_names[max_id])

        # Collect the most important features straight from the chosen model
        # (the one fitted on X_train). No extra refit is needed for this.
        # NOTE: folds may choose different model families, so the mean taken
        # below can mix |coef_| values with tree-based importances.
        if hasattr(model, "coef_"):
            importances.append(np.absolute(model.coef_[0]))
        elif hasattr(model, "feature_importances_"):
            importances.append(model.feature_importances_)
        # Otherwise the model doesn't support selecting features.

    acc_mean, acc_std = np.mean(fold_accs), np.std(fold_accs)
    f1_mean, f1_std = np.mean(fold_f1_scores), np.std(fold_f1_scores)
    if len(importances) > 0:
        mean_imp = np.mean(importances, axis=0)
        # Rank features from most to least important on average.
        top_features = np.array(feature_names)[np.argsort(-mean_imp)]
        top_features_str = "Top5 feats: " + "; ".join(top_features[:5])
    else:
        top_features_str = ""

    if verbose:
        print("Acc: mean {:.4f}, std {:.4f}; F1: mean {:.4f}, std {:.4f}".format(
            acc_mean, acc_std, f1_mean, f1_std))
        print(top_features_str)

    fold_f1_scores_str = ", ".join(["{:.4f}".format(fs) for fs in fold_f1_scores])
    bm_str = ", ".join(best_model_names)
    return fold_f1_scores_str, top_features_str, bm_str
   
@timed_func
def select_features_sweep_params(XYF, verbose=True):
    """Run ``classify_select_models`` over the fixed pool of candidate models.

    Args:
        XYF: Tuple ``(X, Y, feature_names)`` forwarded unchanged.
        verbose: Forwarded to ``classify_select_models``.

    Returns:
        Whatever ``classify_select_models`` returns for this model pool.
    """
    models = OrderedDict({
        # Models that can select features:
        "linearsvc_l2_squared": LinearSVC(loss="squared_hinge", penalty="l2"),
        "linearsvc_l2_hinge": LinearSVC(loss="hinge", penalty="l2"),
        "logreg_default": LogisticRegression(max_iter=100, C=1.0),
        "logreg_maxiter200_C0.5": LogisticRegression(max_iter=200, C=0.5),
        "logreg_maxiter200_C1.0": LogisticRegression(max_iter=200, C=1.0),
        "logreg_maxiter200_C2.0": LogisticRegression(max_iter=200, C=2.0),
        "extratrees_16": ExtraTreesClassifier(n_estimators=16, random_state=0),
        "extratrees_32": ExtraTreesClassifier(n_estimators=32, random_state=0),
        "extratrees_64": ExtraTreesClassifier(n_estimators=64, random_state=0),
        "extratrees_128": ExtraTreesClassifier(n_estimators=128, random_state=0),
        "randomforest_50": RandomForestClassifier(n_estimators=50, random_state=0),
        "randomforest_100": RandomForestClassifier(n_estimators=100, random_state=0),
        "randomforest_200": RandomForestClassifier(n_estimators=200, random_state=0),
        "gb_depth2": GradientBoostingClassifier(max_depth=2),
        "gb_depth3": GradientBoostingClassifier(max_depth=3),
        "gb_depth4": GradientBoostingClassifier(max_depth=4),
        "gb_depth5": GradientBoostingClassifier(max_depth=5),

        # Models without support to selecting features:
        "mlp_10": MLPClassifier([10]),
        "mlp_20": MLPClassifier([20]),
        "mlp_40": MLPClassifier([40]),
        "mlp_80": MLPClassifier([80]),
        "mlp_10_10": MLPClassifier([10, 10]),
        "mlp_20_20": MLPClassifier([20, 20]),
        # Each entry needs its own key: a repeated key would silently replace
        # the earlier estimator and report this one under the wrong name.
        "mlp_40_40": MLPClassifier([40, 40]),
    })

    return classify_select_models(XYF, models, verbose)

#select_features_sweep_params((X, Y, feature_names))

In [11]:
def sweep_conditions():
    """Run the model sweep for every TF-IDF setting and every venue in ``data``.

    Each (setting, venue) pair is vectorised with ``process_vectorize``,
    evaluated with ``select_features_sweep_params``, and logged through
    ``fprint`` as one comma-separated line: venue tag, per-fold F1 scores,
    top-feature summary, best model names, and the setting's name.
    """
    def _spec(use_abstract, use_bodytext, vocab_size):
        # Keyword arguments forwarded to process_vectorize.
        return {
            "abstract": use_abstract,
            "bodytext": use_bodytext,
            "max_features": vocab_size,
        }

    conditions = OrderedDict([
        ("tfidf_abstract_bodytext_100", _spec(True, True, 100)),
        ("tfidf_abstract_bodytext_300", _spec(True, True, 300)),
        ("tfidf_abstract_bodytext_1k", _spec(True, True, 1000)),
        ("tfidf_abstract_bodytext_3k", _spec(True, True, 3000)),
        ("tfidf_abstract_100", _spec(True, False, 100)),
        ("tfidf_abstract_300", _spec(True, False, 300)),
        ("tfidf_abstract_1k", _spec(True, False, 1000)),
        ("tfidf_bodytext_100", _spec(False, True, 100)),
        ("tfidf_bodytext_300", _spec(False, True, 300)),
        ("tfidf_bodytext_1k", _spec(False, True, 1000)),
    ])

    banner = "=" * 20
    for cond_name, vectorize_kwargs in conditions.items():
        print(banner + cond_name + banner)
        for key in data:
            print(key)
            features = process_vectorize(data, key, **vectorize_kwargs)
            f1_scores_str, top_feat_str, bm_str = select_features_sweep_params(features)
            # All fields but the last are terminated with ", "; the setting
            # name closes the record with fprint's default newline.
            for field in (key + "_v_arxiv", f1_scores_str, top_feat_str, bm_str):
                fprint(field, end=", ")
            fprint(cond_name)
            
# Kick off the full sweep over every TF-IDF setting and every venue in `data`.
sweep_conditions()

AAAI
Acc: mean 0.9249, std 0.0049; F1: mean 0.9609, std 0.0025
Top5 feats: is; are; algorithm; this; for
select_features_sweep_params done in 26.55 seconds (0.01 hours)
ACL
Acc: mean 0.7731, std 0.0178; F1: mean 0.8629, std 0.0079
Top5 feats: et; we; was; where; task
select_features_sweep_params done in 163.96 seconds (0.05 hours)
COLING
Acc: mean 0.7973, std 0.0047; F1: mean 0.8860, std 0.0031
Top5 feats: data; or; method; with; have
select_features_sweep_params done in 103.46 seconds (0.03 hours)
CVPR
Acc: mean 0.7619, std 0.0091; F1: mean 0.8477, std 0.0042
Top5 feats: we; et; our; al; where
select_features_sweep_params done in 251.73 seconds (0.07 hours)
EMNLP
Acc: mean 0.8188, std 0.0136; F1: mean 0.8963, std 0.0078
Top5 feats: where; al; than; section; as
select_features_sweep_params done in 30.35 seconds (0.01 hours)
ICML
Acc: mean 0.9850, std 0.0048; F1: mean 0.9924, std 0.0024
Top5 feats: of; the; each; training; we
select_features_sweep_params done in 17.74 seconds (0.00 hour