From b2829cdef1404ed7b5fc9a51bcf1ce4da41595c3 Mon Sep 17 00:00:00 2001
From: ferran
Date: Tue, 9 Feb 2021 16:37:06 +0000
Subject: [PATCH] Fix int32 pmid dtype and add --overwrite flag

---
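
This patch does two things: it normalises the `pmid` column to int64 when
reading features and labels in `read_in_bow`, and it adds an `--overwrite`
flag to both bootstrap scripts (plus a filter that keeps held-out test
files out of the dev runs).

The dtype cast matters because `pandas.Series.equals` is dtype-sensitive:
if the parquet round-trip stores `pmid` as int32 while `pd.read_csv`
infers int64, the sanity assert fails even though the values match. A
minimal sketch of that failure mode (the pmid values are illustrative):

````
import pandas as pd

features_pmid = pd.Series([31511238, 31511239], dtype='int32')  # e.g. from parquet
labs_pmid = pd.Series([31511238, 31511239], dtype='int64')      # e.g. from read_csv

# Same values, different dtypes: equals() compares dtypes too.
print(features_pmid.equals(labs_pmid))                  # False
print(features_pmid.astype('int64').equals(labs_pmid))  # True after the cast
````
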
 README.md                  | 11 +++---
 pk_classifier/bootstrap.py | 32 +++++++++++++++++-
 scripts/bootstrap_bow.py   | 69 ++++++++++++++++----------------
 scripts/bootstrap_dist.py  | 50 +++++++++++----------------
 4 files changed, 84 insertions(+), 78 deletions(-)

diff --git a/README.md b/README.md
index 898d6f6..70ffeb9 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,6 @@ This repository contains custom pipes and models to classify scientific publicat
 
 The labels assigned to each publication in the training and test sets are available in CSV format at the [labels folder](https://github.com/fgh95/PKDocClassifier/tree/master/data/labels). We also provide the textual fields from each publication after being parsed at the [subsets folder](https://github.com/fgh95/PKDocClassifier/tree/master/data/subsets).
 
-
-
 ## Reproduce our results
 
 ### 1. Installing dependencies
@@ -73,7 +71,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
     --input-dir data/encoded/fields \
     --output-dir data/results/fields \
     --output-dir-bootstrap data/results/fields/bootstrap \
-    --path-labels data/labels/dev_data.csv
+    --path-labels data/labels/dev_data.csv \
+    --overwrite True
 ````
 
 3. Bootstrap n-grams (~3h on 12 threads, requires at least 16GB of RAM)
@@ -83,7 +82,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
     --input-dir data/encoded/ngrams \
     --output-dir data/results/ngrams \
     --output-dir-bootstrap data/results/ngrams/bootstrap \
-    --path-labels data/labels/dev_data.csv
+    --path-labels data/labels/dev_data.csv \
+    --overwrite True
 ````
 
 4. Display results
@@ -152,7 +152,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
     --output-dir data/results/distributional \
     --output-dir-bootstrap data/results/distributional/bootstrap \
     --path-labels data/labels/dev_data.csv \
-    --path-optimal-bow data/encoded/ngrams/dev_unigrams.parquet
+    --path-optimal-bow data/encoded/ngrams/dev_unigrams.parquet \
+    --overwrite True
 ````
 
 ````
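
Note on the `--overwrite True` arguments added above: argparse cannot parse
booleans with a bare `type=bool`, because `bool("False")` is `True` (any
non-empty string is truthy), which would make the flag impossible to turn
off from the command line. The scripts therefore parse it with the
`str2bool` helper from `pk_classifier.bootstrap` (its tail is visible in
the next diff). A self-contained sketch of the pitfall; the exact token
lists accepted by `str2bool` are assumptions here:

````
import argparse

def str2bool(v):
    # Assumed to mirror pk_classifier.bootstrap.str2bool: map common
    # true/false spellings to a bool and reject anything else.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

parser = argparse.ArgumentParser()
parser.add_argument("--overwrite", type=str2bool, default=False)

print(parser.parse_args(["--overwrite", "False"]).overwrite)  # False, as intended
print(bool("False"))  # True: what a bare type=bool would have produced
````
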
diff --git a/pk_classifier/bootstrap.py b/pk_classifier/bootstrap.py
index a39a6c0..9d3406f 100644
--- a/pk_classifier/bootstrap.py
+++ b/pk_classifier/bootstrap.py
@@ -1,5 +1,5 @@
 import argparse
-
+from sklearn.model_selection import train_test_split
 from sklearn.base import BaseEstimator, TransformerMixin
 from scipy.sparse import csr_matrix
 import matplotlib.pyplot as plt
@@ -92,6 +92,8 @@ def read_in_bow(path_preproc, path_labels):
     only 2 labels.
     Returns data as 2 pandas dataframes"""
     features = pd.read_parquet(path_preproc).sort_values(by=['pmid']).reset_index(drop=True)
     labs = pd.read_csv(path_labels).sort_values(by=['pmid']).reset_index(drop=True)
+    features.pmid = features.pmid.astype('int64')
+    labs.pmid = labs.pmid.astype('int64')
     assert features['pmid'].equals(labs['pmid'])
     assert len(labs['label'].unique()) == 2
     return features, labs
@@ -109,3 +111,31 @@ def str2bool(v):
         return False
     else:
         raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+def make_ids_per_test(inp_df: pd.DataFrame):
+    ids_per_test = pd.DataFrame(inp_df['pmid'], columns=['pmid'])
+    ids_per_test['Dataset'] = inp_df['Dataset']
+    ids_per_test['Real label'] = inp_df.label
+    ids_per_test['times_correct'] = 0
+    ids_per_test['times_test'] = 0
+    return ids_per_test
+
+
+def split_train_val_test(features, labels, test_size, seed):
+    x_train, x_val, y_train, y_val, pmids_train, pmids_val = train_test_split(features,
+                                                                              labels['label'],
+                                                                              labels['pmid'],
+                                                                              test_size=test_size,
+                                                                              shuffle=True,
+                                                                              random_state=seed,
+                                                                              stratify=labels['label'])
+    new_per = len(y_val) / len(y_train)
+    x_train, x_test, y_train, y_test, pmids_train, pmids_test = train_test_split(x_train,
+                                                                                 y_train,
+                                                                                 pmids_train,
+                                                                                 test_size=new_per,
+                                                                                 shuffle=True,
+                                                                                 random_state=seed,
+                                                                                 stratify=y_train)
+    return x_train, x_val, x_test, y_train, y_val, y_test, pmids_train, pmids_val, pmids_test
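
The new `split_train_val_test` helper reproduces the 60/20/20 partition the
scripts previously built inline: the first `train_test_split` carves the
validation set off the full data, and `new_per = len(y_val) / len(y_train)`
then removes an equally sized test set from the remainder. A quick sanity
check of the proportions, with illustrative sizes:

````
n = 1000
test_size = 0.2                       # validation fraction of the full set

n_val = int(n * test_size)            # 200
n_rest = n - n_val                    # 800 rows left after the first split
new_per = n_val / n_rest              # 0.25, i.e. len(y_val) / len(y_train)
n_test = int(n_rest * new_per)        # 200, same size as the validation set
n_train = n_rest - n_test             # 600

print(n_train, n_val, n_test)         # 600 200 200 -> 60/20/20
````
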
diff --git a/scripts/bootstrap_bow.py b/scripts/bootstrap_bow.py
index 0be150d..14a7a50 100644
--- a/scripts/bootstrap_bow.py
+++ b/scripts/bootstrap_bow.py
@@ -4,23 +4,18 @@
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics import precision_recall_fscore_support
-from sklearn.model_selection import train_test_split
 import xgboost as xgb
 from tqdm import tqdm
 import argparse
-from pk_classifier.bootstrap import Tokenizer, TextSelector, f1_eval, plot_it, update, read_in_bow
+from pk_classifier.bootstrap import Tokenizer, TextSelector, f1_eval, plot_it, update, read_in_bow, str2bool, \
+    make_ids_per_test, split_train_val_test
 
 
 def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figure, out_path_bootstrap):
     all_features, all_labs = read_in_bow(input_tuple[0], input_tuple[1])
+    ids_per_test = make_ids_per_test(inp_df=all_labs)
     all_metrics_test = []
-    ids_per_test = pd.DataFrame(all_labs['pmid'], columns=['pmid'])
-    ids_per_test['Dataset'] = all_labs['Dataset']
-    ids_per_test['Real label'] = all_labs.label
-    ids_per_test['times_correct'] = 0
-    ids_per_test['times_test'] = 0
-
     optimal_epochs = []
     median_optimal_epochs = []
     median_f1s = []
@@ -33,21 +28,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         # Make splits: 60% train, 20% validation, 20% temp test
         # ======================================================================================================
-        x_train, x_val, y_train, y_val, pmids_train, pmids_val = train_test_split(all_features,
-                                                                                  all_labs['label'],
-                                                                                  all_labs['pmid'],
-                                                                                  test_size=per,
-                                                                                  shuffle=True,
-                                                                                  random_state=rd_seed,
-                                                                                  stratify=all_labs['label'])
-        new_per = len(y_val) / len(y_train)
-        x_train, x_test, y_train, y_test, pmids_train, pmids_test = train_test_split(x_train,
-                                                                                     y_train,
-                                                                                     pmids_train,
-                                                                                     test_size=new_per,
-                                                                                     shuffle=True,
-                                                                                     random_state=rd_seed,
-                                                                                     stratify=y_train)
+        x_train, x_val, x_test, y_train, y_val, y_test, pmids_train, pmids_val, pmids_test = \
+            split_train_val_test(features=all_features, labels=all_labs, test_size=per, seed=rd_seed)
 
         # =====================================================================================================
         # Decide max number of iterations using early stopping criteria on the validation set
@@ -64,7 +46,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
                                     scale_pos_weight=balancing_factor, nthread=-1)
 
         # Define encoding pipeline
-        EncPip = Pipeline([
+        enc_pip = Pipeline([
             ('encoder', FeatureUnion(transformer_list=[
                 ('bow', Pipeline([
                     ('selector', TextSelector('BoW_Ready', emb=False)),
@@ -75,8 +57,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
                 ]))
             ])
 
-        x_train_features = EncPip.fit_transform(x_train)
-        x_val_features = EncPip.transform(x_val)
+        x_train_features = enc_pip.fit_transform(x_train)
+        x_val_features = enc_pip.transform(x_val)
         if a == 0:
             print("Using: ", x_train_features.shape[1], "features")
             a = 1
@@ -96,7 +78,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         # Apply predictions to the temp test set
         # ======================================================================================================
-        x_test_encoded = EncPip.transform(x_test)
+        x_test_encoded = enc_pip.transform(x_test)
         pred_test = decoder.predict(x_test_encoded)
         test_results = pd.DataFrame(pred_test == y_test.values, columns=['Result'])
         test_results['Result'] = test_results['Result'].astype(int)
@@ -130,18 +112,20 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
     ids_per_val.to_csv(out_path_bootstrap)
 
 
-def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels: str):
+def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels: str, overwrite: bool):
     if not os.path.isdir(output_dir):
         os.makedirs(output_dir, exist_ok=True)
     if not os.path.isdir(output_dir_bootstrap):
         os.makedirs(output_dir_bootstrap, exist_ok=True)
 
-    for inp_file in os.listdir(input_dir):
+    inp_dev_files = [inp_file for inp_file in os.listdir(input_dir) if 'test' not in inp_file]
+
+    for inp_file in inp_dev_files:
         inp_path = os.path.join(input_dir, inp_file)
         experiment_name = inp_file.replace("dev_", "").replace(".parquet", "")
         print("================== ", experiment_name, "=============================")
         # Define output
-        if "res_" + experiment_name + ".csv" not in os.listdir(output_dir):
+        if "res_" + experiment_name + ".csv" not in os.listdir(output_dir) or overwrite:
             out_res = os.path.join(output_dir, "res_" + experiment_name + ".csv")
             out_fig = os.path.join(output_dir_bootstrap, "res_" + experiment_name + ".png")
             out_dev = os.path.join(output_dir_bootstrap, "bootstrap_" + experiment_name + ".csv")
@@ -154,23 +138,26 @@ def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels:
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("-i", "--input-dir", type=str, help="The directory with files containing the encoded "
-                                                            "documents in parquet format. It will iterate over all "
-                                                            "files in this directory")
+    parser.add_argument("--input-dir", type=str, help="The directory with files containing the encoded "
+                                                      "documents in parquet format. It will iterate over all "
+                                                      "files in this directory")
+
+    parser.add_argument("--output-dir", type=str, help="Output directory to save the results of each bootstrap "
+                                                       "iteration in a csv file.")
 
-    parser.add_argument("-o", "--output-dir", type=str, help="Output directory to save the results of each bootstrap "
-                                                             "iteration in a csv file.")
+    parser.add_argument("--output-dir-bootstrap", type=str, help="Output directory to save the bootstrap "
+                                                                 "results as the misclassification "
+                                                                 "rates per document during bootstrap.")
 
-    parser.add_argument("-ob", "--output-dir-bootstrap", type=str, help="Output directory to save the boostrap "
-                                                                        "results as the misclassification "
-                                                                        "rates per document during bootstrap.")
+    parser.add_argument("--path-labels", type=str, help="Path to the csv containing the labels of the training "
+                                                        "(dev) set")
 
-    parser.add_argument("-l", "--path-labels", type=str, help="Path to the csv containing the labels of the training "
-                                                              "(dev) set")
+    parser.add_argument("--overwrite", type=str2bool, help="Whether to overwrite files if results are already "
+                                                           "present in the output directory.", default=False)
 
     args = parser.parse_args()
     run(input_dir=args.input_dir, output_dir=args.output_dir, output_dir_bootstrap=args.output_dir_bootstrap,
-        path_labels=args.path_labels)
+        path_labels=args.path_labels, overwrite=args.overwrite)
 
 
 if __name__ == '__main__':
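
One caveat on the `'test' not in inp_file` filter used above (and again in
bootstrap_dist.py below): substring matching fits this repository's
`dev_*`/`test_*` naming, but it silently drops any file whose name merely
contains the letters "test". The file names here are made up to show the
edge case:

````
files = ["dev_unigrams.parquet", "test_unigrams.parquet", "dev_latest.parquet"]
kept = [f for f in files if 'test' not in f]
print(kept)  # ['dev_unigrams.parquet']: 'dev_latest.parquet' is dropped too
````
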
diff --git a/scripts/bootstrap_dist.py b/scripts/bootstrap_dist.py
index d2a8f14..33be28b 100644
--- a/scripts/bootstrap_dist.py
+++ b/scripts/bootstrap_dist.py
@@ -4,25 +4,21 @@
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics import precision_recall_fscore_support
-from sklearn.model_selection import train_test_split
 import xgboost as xgb
 from tqdm import tqdm
 import argparse
 import warnings
-from pk_classifier.bootstrap import Tokenizer, TextSelector, plot_it, f1_eval, update, read_in_distributional, str2bool
+from pk_classifier.bootstrap import Tokenizer, TextSelector, plot_it, f1_eval, update, read_in_distributional, str2bool, \
+    make_ids_per_test, split_train_val_test
 
 
 def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figure, out_path_bootstrap, is_specter,
                  use_bow, path_optimal_bow):
     all_features, all_labs = read_in_distributional(input_tuple[0], input_tuple[1], is_specter, path_optimal_bow)
-    all_metrics_test = []
-    ids_per_test = pd.DataFrame(all_labs['pmid'], columns=['pmid'])
-    ids_per_test['Dataset'] = all_labs['Dataset']
-    ids_per_test['Real label'] = all_labs.label
-    ids_per_test['times_correct'] = 0
-    ids_per_test['times_test'] = 0
+    ids_per_test = make_ids_per_test(inp_df=all_labs)
+    all_metrics_test = []
     optimal_epochs = []
     median_optimal_epochs = []
     median_f1s = []
@@ -35,21 +31,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         # Make splits: 60% train, 20% validation, 20% temp test
        # ======================================================================================================
-        x_train, x_val, y_train, y_val, pmids_train, pmids_val = train_test_split(all_features,
-                                                                                  all_labs['label'],
-                                                                                  all_labs['pmid'],
-                                                                                  test_size=per,
-                                                                                  shuffle=True,
-                                                                                  random_state=rd_seed,
-                                                                                  stratify=all_labs['label'])
-        new_per = len(y_val) / len(y_train)
-        x_train, x_test, y_train, y_test, pmids_train, pmids_test = train_test_split(x_train,
-                                                                                     y_train,
-                                                                                     pmids_train,
-                                                                                     test_size=new_per,
-                                                                                     shuffle=True,
-                                                                                     random_state=rd_seed,
-                                                                                     stratify=y_train)
+        x_train, x_val, x_test, y_train, y_val, y_test, pmids_train, pmids_val, pmids_test = \
+            split_train_val_test(features=all_features, labels=all_labs, test_size=per, seed=rd_seed)
 
         # =====================================================================================================
         # Decide max number of iterations using early stopping criteria on the validation set
@@ -67,7 +50,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         if use_bow:
             # Define encoding pipeline
-            EncPip = Pipeline([
+            enc_pip = Pipeline([
                 ('encoder', FeatureUnion(transformer_list=[
                     ('bow', Pipeline([
                         ('selector', TextSelector('BoW_Ready', emb=False)),
@@ -82,7 +65,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
             ])
 
         else:
-            EncPip = Pipeline([
+            enc_pip = Pipeline([
                 ('encoder', FeatureUnion(transformer_list=[
                     ('abs', Pipeline([
                         ('selector', TextSelector('embedding', emb=True))
@@ -90,8 +73,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
                     ]))
                 ]))
             ])
 
-        x_train_features = EncPip.fit_transform(x_train)
-        x_val_features = EncPip.transform(x_val)
+        x_train_features = enc_pip.fit_transform(x_train)
+        x_val_features = enc_pip.transform(x_val)
 
         if a == 0:
             print("Using: ", x_train_features.shape[1], "features")
@@ -111,7 +94,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
         # Apply predictions to the temp test set
         # ======================================================================================================
-        x_test_encoded = EncPip.transform(x_test)
+        x_test_encoded = enc_pip.transform(x_test)
         pred_test = decoder.predict(x_test_encoded)
         test_results = pd.DataFrame(pred_test == y_test.values, columns=['Result'])
         test_results['Result'] = test_results['Result'].astype(int)
@@ -146,7 +129,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
 
 
 def run(is_specter: bool, use_bow: bool, input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels: str,
-        path_optimal_bow: str):
+        path_optimal_bow: str, overwrite: bool):
     if not os.path.isdir(output_dir):
         os.makedirs(output_dir, exist_ok=True)
     if not os.path.isdir(output_dir_bootstrap):
@@ -158,6 +141,8 @@ def run(is_specter: bool, use_bow: bool, input_dir: str, output_dir: str, output
 
     inp_files = os.listdir(input_dir)
     repl = ".parquet"
+    inp_files = [inp_f for inp_f in inp_files if 'test' not in inp_f]
+
     for inp_file in inp_files:
         inp_path = os.path.join(input_dir, inp_file)
         experiment_name = inp_file.replace("dev_", "").replace(repl, "")
@@ -165,7 +150,7 @@ def run(is_specter: bool, use_bow: bool, input_dir: str, output_dir: str, output
             experiment_name = experiment_name + "_bow"
         print("================== ", experiment_name, "=============================")
         # Define output
-        if "res_" + experiment_name + ".csv" not in os.listdir(output_dir):
+        if "res_" + experiment_name + ".csv" not in os.listdir(output_dir) or overwrite:
             out_res = os.path.join(output_dir, "res_" + experiment_name + ".csv")
             out_fig = os.path.join(output_dir_bootstrap, "res_" + experiment_name + ".png")
             out_dev = os.path.join(output_dir_bootstrap, "bootstrap_" + experiment_name + ".csv")
@@ -210,10 +195,13 @@ def main():
     parser.add_argument("--path-optimal-bow", type=str, help="Path to the parquet file with the optimal BoW features",
                         default="../data/encoded/ngrams/dev_unigrams.parquet")
 
+    parser.add_argument("--overwrite", type=str2bool, help="Whether to overwrite files if results are already "
+                                                           "present in the output directory.", default=False)
+
     args = parser.parse_args()
     run(is_specter=args.is_specter, use_bow=args.use_bow, input_dir=args.input_dir, output_dir=args.output_dir,
         output_dir_bootstrap=args.output_dir_bootstrap, path_labels=args.path_labels,
-        path_optimal_bow=args.path_optimal_bow)
+        path_optimal_bow=args.path_optimal_bow, overwrite=args.overwrite)
 
 
 if __name__ == '__main__':