From b2829cdef1404ed7b5fc9a51bcf1ce4da41595c3 Mon Sep 17 00:00:00 2001
From: ferran
Date: Tue, 9 Feb 2021 16:37:06 +0000
Subject: [PATCH] Fix int32 pmid dtype and add --overwrite flag

---
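
This patch does two things: it normalises the `pmid` column to int64 when
reading features and labels in `read_in_bow`, and it adds an `--overwrite`
flag to both bootstrap scripts (plus a filter that keeps held-out test
files out of the dev runs).

The dtype cast matters because `pandas.Series.equals` is dtype-sensitive:
if the parquet round-trip stores `pmid` as int32 while `pd.read_csv`
infers int64, the sanity assert fails even though the values match. A
minimal sketch of that failure mode (the pmid values are illustrative):

````
import pandas as pd

features_pmid = pd.Series([31511238, 31511239], dtype='int32')  # e.g. from parquet
labs_pmid = pd.Series([31511238, 31511239], dtype='int64')      # e.g. from read_csv

# Same values, different dtypes: equals() compares dtypes too.
print(features_pmid.equals(labs_pmid))                  # False
print(features_pmid.astype('int64').equals(labs_pmid))  # True after the cast
````
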
 README.md                  | 11 +++---
 pk_classifier/bootstrap.py | 32 +++++++++++++++++-
 scripts/bootstrap_bow.py   | 69 ++++++++++++++++----------------
 scripts/bootstrap_dist.py  | 50 +++++++++++----------------
 4 files changed, 84 insertions(+), 78 deletions(-)

diff --git a/README.md b/README.md
index 898d6f6..70ffeb9 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,6 @@ This repository contains custom pipes and models to classify scientific publicat
 
 The labels assigned to each publication in the training and test sets are available in CSV format at the [labels folder](https://github.com/fgh95/PKDocClassifier/tree/master/data/labels). We also provide the textual fields from each publication after being parsed at the [subsets folder](https://github.com/fgh95/PKDocClassifier/tree/master/data/subsets).
 
-
-
 ## Reproduce our results
 
 ### 1. Installing dependencies
@@ -73,7 +71,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
     --input-dir data/encoded/fields \
     --output-dir data/results/fields \
     --output-dir-bootstrap data/results/fields/bootstrap \
-    --path-labels data/labels/dev_data.csv
+    --path-labels data/labels/dev_data.csv \
+    --overwrite True
 ````
 
 3. Bootstrap n-grams (~3h on 12 threads, requires at least 16GB of RAM)
@@ -83,7 +82,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
     --input-dir data/encoded/ngrams \
     --output-dir data/results/ngrams \
     --output-dir-bootstrap data/results/ngrams/bootstrap \
-    --path-labels data/labels/dev_data.csv
+    --path-labels data/labels/dev_data.csv \
+    --overwrite True
 ````
 
 4. Display results
@@ -152,7 +152,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
     --output-dir data/results/distributional \
     --output-dir-bootstrap data/results/distributional/bootstrap \
     --path-labels data/labels/dev_data.csv \
-    --path-optimal-bow data/encoded/ngrams/dev_unigrams.parquet
+    --path-optimal-bow data/encoded/ngrams/dev_unigrams.parquet \
+    --overwrite True
 ````
 
 ````
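
Note on the `--overwrite True` arguments added above: argparse cannot parse
booleans with a bare `type=bool`, because `bool("False")` is `True` (any
non-empty string is truthy), which would make the flag impossible to turn
off from the command line. The scripts therefore parse it with the
`str2bool` helper from `pk_classifier.bootstrap` (its tail is visible in
the next diff). A self-contained sketch of the pitfall; the exact token
lists accepted by `str2bool` are assumptions here:

````
import argparse

def str2bool(v):
    # Assumed to mirror pk_classifier.bootstrap.str2bool: map common
    # true/false spellings to a bool and reject anything else.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

parser = argparse.ArgumentParser()
parser.add_argument("--overwrite", type=str2bool, default=False)

print(parser.parse_args(["--overwrite", "False"]).overwrite)  # False, as intended
print(bool("False"))  # True: what a bare type=bool would have produced
````
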
diff --git a/pk_classifier/bootstrap.py b/pk_classifier/bootstrap.py
index a39a6c0..9d3406f 100644
--- a/pk_classifier/bootstrap.py
+++ b/pk_classifier/bootstrap.py
@@ -1,5 +1,5 @@
 import argparse
-
+from sklearn.model_selection import train_test_split
 from sklearn.base import BaseEstimator, TransformerMixin
 from scipy.sparse import csr_matrix
 import matplotlib.pyplot as plt
@@ -92,6 +92,8 @@ def read_in_bow(path_preproc, path_labels):
     only 2 labels.
     Returns data as 2 pandas dataframes"""
     features = pd.read_parquet(path_preproc).sort_values(by=['pmid']).reset_index(drop=True)
     labs = pd.read_csv(path_labels).sort_values(by=['pmid']).reset_index(drop=True)
+    features.pmid = features.pmid.astype('int64')
+    labs.pmid = labs.pmid.astype('int64')
     assert features['pmid'].equals(labs['pmid'])
     assert len(labs['label'].unique()) == 2
     return features, labs
@@ -109,3 +111,31 @@ def str2bool(v):
         return False
     else:
         raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+def make_ids_per_test(inp_df: pd.DataFrame):
+    ids_per_test = pd.DataFrame(inp_df['pmid'], columns=['pmid'])
+    ids_per_test['Dataset'] = inp_df['Dataset']
+    ids_per_test['Real label'] = inp_df.label
+    ids_per_test['times_correct'] = 0
+    ids_per_test['times_test'] = 0
+    return ids_per_test
+
+
+def split_train_val_test(features, labels, test_size, seed):
+    x_train, x_val, y_train, y_val, pmids_train, pmids_val = train_test_split(features,
+                                                                              labels['label'],
+                                                                              labels['pmid'],
+                                                                              test_size=test_size,
+                                                                              shuffle=True,
+                                                                              random_state=seed,
+                                                                              stratify=labels['label'])
+    new_per = len(y_val) / len(y_train)
+    x_train, x_test, y_train, y_test, pmids_train, pmids_test = train_test_split(x_train,
+                                                                                 y_train,
+                                                                                 pmids_train,
+                                                                                 test_size=new_per,
+                                                                                 shuffle=True,
+                                                                                 random_state=seed,
+                                                                                 stratify=y_train)
+    return x_train, x_val, x_test, y_train, y_val, y_test, pmids_train, pmids_val, pmids_test
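
The new `split_train_val_test` helper reproduces the 60/20/20 partition the
scripts previously built inline: the first `train_test_split` carves the
validation set off the full data, and `new_per = len(y_val) / len(y_train)`
then removes an equally sized test set from the remainder. A quick sanity
check of the proportions, with illustrative sizes:

````
n = 1000
test_size = 0.2                       # validation fraction of the full set

n_val = int(n * test_size)            # 200
n_rest = n - n_val                    # 800 rows left after the first split
new_per = n_val / n_rest              # 0.25, i.e. len(y_val) / len(y_train)
n_test = int(n_rest * new_per)        # 200, same size as the validation set
n_train = n_rest - n_test             # 600

print(n_train, n_val, n_test)         # 600 200 200 -> 60/20/20
````
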
diff --git a/scripts/bootstrap_bow.py b/scripts/bootstrap_bow.py
index 0be150d..14a7a50 100644
--- a/scripts/bootstrap_bow.py
+++ b/scripts/bootstrap_bow.py
@@ -4,23 +4,18 @@
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics import precision_recall_fscore_support
-from sklearn.model_selection import train_test_split
 import xgboost as xgb
 from tqdm import tqdm
 import argparse
-from pk_classifier.bootstrap import Tokenizer, TextSelector, f1_eval, plot_it, update, read_in_bow
+from pk_classifier.bootstrap import Tokenizer, TextSelector, f1_eval, plot_it, update, read_in_bow, str2bool, \
+    make_ids_per_test, split_train_val_test
 
 
 def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figure, out_path_bootstrap):
     all_features, all_labs = read_in_bow(input_tuple[0], input_tuple[1])
+    ids_per_test = make_ids_per_test(inp_df=all_labs)
     all_metrics_test = []
-    ids_per_test = pd.DataFrame(all_labs['pmid'], columns=['pmid'])
-    ids_per_test['Dataset'] = all_labs['Dataset']
-    ids_per_test['Real label'] = all_labs.label
-    ids_per_test['times_correct'] = 0
-    ids_per_test['times_test'] = 0
-
     optimal_epochs = []
     median_optimal_epochs = []
     median_f1s = []
@@ -33,21 +28,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         # Make splits: 60% train, 20% validation, 20% temp test
         # ======================================================================================================
-        x_train, x_val, y_train, y_val, pmids_train, pmids_val = train_test_split(all_features,
-                                                                                  all_labs['label'],
-                                                                                  all_labs['pmid'],
-                                                                                  test_size=per,
-                                                                                  shuffle=True,
-                                                                                  random_state=rd_seed,
-                                                                                  stratify=all_labs['label'])
-        new_per = len(y_val) / len(y_train)
-        x_train, x_test, y_train, y_test, pmids_train, pmids_test = train_test_split(x_train,
-                                                                                     y_train,
-                                                                                     pmids_train,
-                                                                                     test_size=new_per,
-                                                                                     shuffle=True,
-                                                                                     random_state=rd_seed,
-                                                                                     stratify=y_train)
+        x_train, x_val, x_test, y_train, y_val, y_test, pmids_train, pmids_val, pmids_test = \
+            split_train_val_test(features=all_features, labels=all_labs, test_size=per, seed=rd_seed)
 
         # =====================================================================================================
         # Decide max number of iterations using early stopping criteria on the validation set
@@ -64,7 +46,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
                                     scale_pos_weight=balancing_factor, nthread=-1)
 
         # Define encoding pipeline
-        EncPip = Pipeline([
+        enc_pip = Pipeline([
             ('encoder', FeatureUnion(transformer_list=[
                 ('bow', Pipeline([
                     ('selector', TextSelector('BoW_Ready', emb=False)),
@@ -75,8 +57,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
                 ]))
             ])
 
-        x_train_features = EncPip.fit_transform(x_train)
-        x_val_features = EncPip.transform(x_val)
+        x_train_features = enc_pip.fit_transform(x_train)
+        x_val_features = enc_pip.transform(x_val)
         if a == 0:
             print("Using: ", x_train_features.shape[1], "features")
             a = 1
@@ -96,7 +78,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         # Apply predictions to the temp test set
         # ======================================================================================================
-        x_test_encoded = EncPip.transform(x_test)
+        x_test_encoded = enc_pip.transform(x_test)
         pred_test = decoder.predict(x_test_encoded)
         test_results = pd.DataFrame(pred_test == y_test.values, columns=['Result'])
         test_results['Result'] = test_results['Result'].astype(int)
@@ -130,18 +112,20 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
     ids_per_val.to_csv(out_path_bootstrap)
 
 
-def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels: str):
+def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels: str, overwrite: bool):
     if not os.path.isdir(output_dir):
         os.makedirs(output_dir, exist_ok=True)
     if not os.path.isdir(output_dir_bootstrap):
         os.makedirs(output_dir_bootstrap, exist_ok=True)
 
-    for inp_file in os.listdir(input_dir):
+    inp_dev_files = [inp_file for inp_file in os.listdir(input_dir) if 'test' not in inp_file]
+
+    for inp_file in inp_dev_files:
         inp_path = os.path.join(input_dir, inp_file)
         experiment_name = inp_file.replace("dev_", "").replace(".parquet", "")
         print("================== ", experiment_name, "=============================")
         # Define output
-        if "res_" + experiment_name + ".csv" not in os.listdir(output_dir):
+        if "res_" + experiment_name + ".csv" not in os.listdir(output_dir) or overwrite:
             out_res = os.path.join(output_dir, "res_" + experiment_name + ".csv")
             out_fig = os.path.join(output_dir_bootstrap, "res_" + experiment_name + ".png")
             out_dev = os.path.join(output_dir_bootstrap, "bootstrap_" + experiment_name + ".csv")
@@ -154,23 +138,26 @@ def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels:
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("-i", "--input-dir", type=str, help="The directory with files containing the encoded "
-                                                            "documents in parquet format. It will iterate over all "
-                                                            "files in this directory")
+    parser.add_argument("--input-dir", type=str, help="The directory with files containing the encoded "
+                                                      "documents in parquet format. It will iterate over all "
+                                                      "files in this directory")
+
+    parser.add_argument("--output-dir", type=str, help="Output directory to save the results of each bootstrap "
+                                                       "iteration in a csv file.")
 
-    parser.add_argument("-o", "--output-dir", type=str, help="Output directory to save the results of each bootstrap "
-                                                             "iteration in a csv file.")
+    parser.add_argument("--output-dir-bootstrap", type=str, help="Output directory to save the bootstrap "
+                                                                 "results as the misclassification "
+                                                                 "rates per document during bootstrap.")
 
-    parser.add_argument("-ob", "--output-dir-bootstrap", type=str, help="Output directory to save the boostrap "
-                                                                        "results as the misclassification "
-                                                                        "rates per document during bootstrap.")
+    parser.add_argument("--path-labels", type=str, help="Path to the csv containing the labels of the training "
+                                                        "(dev) set")
 
-    parser.add_argument("-l", "--path-labels", type=str, help="Path to the csv containing the labels of the training "
-                                                              "(dev) set")
+    parser.add_argument("--overwrite", type=str2bool, help="Whether to overwrite files if results are already "
+                                                           "present in the output directory.", default=False)
 
     args = parser.parse_args()
     run(input_dir=args.input_dir, output_dir=args.output_dir, output_dir_bootstrap=args.output_dir_bootstrap,
-        path_labels=args.path_labels)
+        path_labels=args.path_labels, overwrite=args.overwrite)
 
 
 if __name__ == '__main__':
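
One caveat on the `'test' not in inp_file` filter used above (and again in
bootstrap_dist.py below): substring matching fits this repository's
`dev_*`/`test_*` naming, but it silently drops any file whose name merely
contains the letters "test". The file names here are made up to show the
edge case:

````
files = ["dev_unigrams.parquet", "test_unigrams.parquet", "dev_latest.parquet"]
kept = [f for f in files if 'test' not in f]
print(kept)  # ['dev_unigrams.parquet']: 'dev_latest.parquet' is dropped too
````
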
diff --git a/scripts/bootstrap_dist.py b/scripts/bootstrap_dist.py
index d2a8f14..33be28b 100644
--- a/scripts/bootstrap_dist.py
+++ b/scripts/bootstrap_dist.py
@@ -4,25 +4,21 @@
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics import precision_recall_fscore_support
-from sklearn.model_selection import train_test_split
 import xgboost as xgb
 from tqdm import tqdm
 import argparse
 import warnings
-from pk_classifier.bootstrap import Tokenizer, TextSelector, plot_it, f1_eval, update, read_in_distributional, str2bool
+from pk_classifier.bootstrap import Tokenizer, TextSelector, plot_it, f1_eval, update, read_in_distributional, str2bool, \
+    make_ids_per_test, split_train_val_test
 
 
 def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figure, out_path_bootstrap, is_specter,
                  use_bow, path_optimal_bow):
     all_features, all_labs = read_in_distributional(input_tuple[0], input_tuple[1], is_specter, path_optimal_bow)
-    all_metrics_test = []
-    ids_per_test = pd.DataFrame(all_labs['pmid'], columns=['pmid'])
-    ids_per_test['Dataset'] = all_labs['Dataset']
-    ids_per_test['Real label'] = all_labs.label
-    ids_per_test['times_correct'] = 0
-    ids_per_test['times_test'] = 0
+    ids_per_test = make_ids_per_test(inp_df=all_labs)
+    all_metrics_test = []
     optimal_epochs = []
     median_optimal_epochs = []
     median_f1s = []
@@ -35,21 +31,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         # Make splits: 60% train, 20% validation, 20% temp test
        # ======================================================================================================
-        x_train, x_val, y_train, y_val, pmids_train, pmids_val = train_test_split(all_features,
-                                                                                  all_labs['label'],
-                                                                                  all_labs['pmid'],
-                                                                                  test_size=per,
-                                                                                  shuffle=True,
-                                                                                  random_state=rd_seed,
-                                                                                  stratify=all_labs['label'])
-        new_per = len(y_val) / len(y_train)
-        x_train, x_test, y_train, y_test, pmids_train, pmids_test = train_test_split(x_train,
-                                                                                     y_train,
-                                                                                     pmids_train,
-                                                                                     test_size=new_per,
-                                                                                     shuffle=True,
-                                                                                     random_state=rd_seed,
-                                                                                     stratify=y_train)
+        x_train, x_val, x_test, y_train, y_val, y_test, pmids_train, pmids_val, pmids_test = \
+            split_train_val_test(features=all_features, labels=all_labs, test_size=per, seed=rd_seed)
 
         # =====================================================================================================
         # Decide max number of iterations using early stopping criteria on the validation set
@@ -67,7 +50,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         if use_bow:
             # Define encoding pipeline
-            EncPip = Pipeline([
+            enc_pip = Pipeline([
                 ('encoder', FeatureUnion(transformer_list=[
                     ('bow', Pipeline([
                         ('selector', TextSelector('BoW_Ready', emb=False)),
@@ -82,7 +65,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
             ])
 
         else:
-            EncPip = Pipeline([
+            enc_pip = Pipeline([
                 ('encoder', FeatureUnion(transformer_list=[
                     ('abs', Pipeline([
                         ('selector', TextSelector('embedding', emb=True))
@@ -90,8 +73,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
                     ]))
                 ]))
             ])
 
-        x_train_features = EncPip.fit_transform(x_train)
-        x_val_features = EncPip.transform(x_val)
+        x_train_features = enc_pip.fit_transform(x_train)
+        x_val_features = enc_pip.transform(x_val)
 
         if a == 0:
             print("Using: ", x_train_features.shape[1], "features")
@@ -111,7 +94,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         # Apply predictions to the temp test set
         # ======================================================================================================
-        x_test_encoded = EncPip.transform(x_test)
+        x_test_encoded = enc_pip.transform(x_test)
         pred_test = decoder.predict(x_test_encoded)
         test_results = pd.DataFrame(pred_test == y_test.values, columns=['Result'])
         test_results['Result'] = test_results['Result'].astype(int)
@@ -146,7 +129,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
 
 def run(is_specter: bool, use_bow: bool, input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels: str,
-        path_optimal_bow: str):
+        path_optimal_bow: str, overwrite: bool):
     if not os.path.isdir(output_dir):
         os.makedirs(output_dir, exist_ok=True)
     if not os.path.isdir(output_dir_bootstrap):
@@ -158,6 +141,8 @@ def run(is_specter: bool, use_bow: bool, input_dir: str, output_dir: str, output
 
     inp_files = os.listdir(input_dir)
     repl = ".parquet"
+    inp_files = [inp_f for inp_f in inp_files if 'test' not in inp_f]
+
     for inp_file in inp_files:
         inp_path = os.path.join(input_dir, inp_file)
         experiment_name = inp_file.replace("dev_", "").replace(repl, "")
@@ -165,7 +150,7 @@ def run(is_specter: bool, use_bow: bool, input_dir: str, output_dir: str, output
             experiment_name = experiment_name + "_bow"
         print("================== ", experiment_name, "=============================")
         # Define output
-        if "res_" + experiment_name + ".csv" not in os.listdir(output_dir):
+        if "res_" + experiment_name + ".csv" not in os.listdir(output_dir) or overwrite:
             out_res = os.path.join(output_dir, "res_" + experiment_name + ".csv")
             out_fig = os.path.join(output_dir_bootstrap, "res_" + experiment_name + ".png")
             out_dev = os.path.join(output_dir_bootstrap, "bootstrap_" + experiment_name + ".csv")
@@ -210,10 +195,13 @@ def main():
     parser.add_argument("--path-optimal-bow", type=str, help="Path to the parquet file with the optimal BoW features",
                         default="../data/encoded/ngrams/dev_unigrams.parquet")
 
+    parser.add_argument("--overwrite", type=str2bool, help="Whether to overwrite files if results are already "
+                                                           "present in the output directory.", default=False)
+
     args = parser.parse_args()
     run(is_specter=args.is_specter, use_bow=args.use_bow, input_dir=args.input_dir, output_dir=args.output_dir,
         output_dir_bootstrap=args.output_dir_bootstrap, path_labels=args.path_labels,
-        path_optimal_bow=args.path_optimal_bow)
+        path_optimal_bow=args.path_optimal_bow, overwrite=args.overwrite)
 
 
 if __name__ == '__main__':