Skip to content

Commit

Permalink
fix int32 and included overwrite flag
Browse files Browse the repository at this point in the history
  • Loading branch information
ferran committed Feb 9, 2021
1 parent 9de1d70 commit b2829cd
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 78 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ This repository contains custom pipes and models to classify scientific publicat

The labels assigned to each publication in the training and test sets are available in CSV format at the [labels folder](https://github.com/fgh95/PKDocClassifier/tree/master/data/labels). We also provide the textual fields from each publication after being parsed at the [subsets folder](https://github.com/fgh95/PKDocClassifier/tree/master/data/subsets).



## Reproduce our results

### 1. Installing dependencies
Expand Down Expand Up @@ -73,7 +71,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
--input-dir data/encoded/fields \
--output-dir data/results/fields \
--output-dir-bootstrap data/results/fields/bootstrap \
--path-labels data/labels/dev_data.csv
--path-labels data/labels/dev_data.csv \
--overwrite True
````

3. Bootstrap n-grams (~3h on 12 threads, requires at least 16GB of RAM)
Expand All @@ -83,7 +82,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
--input-dir data/encoded/ngrams \
--output-dir data/results/ngrams \
--output-dir-bootstrap data/results/ngrams/bootstrap \
--path-labels data/labels/dev_data.csv
--path-labels data/labels/dev_data.csv \
--overwrite True
````

4. Display results
Expand Down Expand Up @@ -152,7 +152,8 @@ This should generate the files at [data/subsets/](https://github.com/fgh95/PKDoc
--output-dir data/results/distributional \
--output-dir-bootstrap data/results/distributional/bootstrap \
--path-labels data/labels/dev_data.csv \
--path-optimal-bow data/encoded/ngrams/dev_unigrams.parquet
--path-optimal-bow data/encoded/ngrams/dev_unigrams.parquet \
--overwrite True
````

````
Expand Down
32 changes: 31 additions & 1 deletion pk_classifier/bootstrap.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import argparse

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
Expand Down Expand Up @@ -92,6 +92,8 @@ def read_in_bow(path_preproc, path_labels):
only 2 labels. Returns data as 2 pandas dataframes"""
features = pd.read_parquet(path_preproc).sort_values(by=['pmid']).reset_index(drop=True)
labs = pd.read_csv(path_labels).sort_values(by=['pmid']).reset_index(drop=True)
features.pmid = features.pmid.astype('int64')
labs.pmid = labs.pmid.astype('int64')
assert features['pmid'].equals(labs['pmid'])
assert len(labs['label'].unique()) == 2
return features, labs
Expand All @@ -109,3 +111,31 @@ def str2bool(v):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')


def make_ids_per_test(inp_df: pd.DataFrame):
    """Build the per-document table used to tally test outcomes.

    The returned frame holds the 'pmid' and 'Dataset' columns of *inp_df*,
    its 'label' column under the name 'Real label', and two counters
    ('times_correct' and 'times_test') initialised to zero. *inp_df* itself
    is left untouched.
    """
    # Start from a one-column copy so the new columns never land on inp_df.
    tracker = inp_df[['pmid']].copy()
    tracker['Dataset'] = inp_df['Dataset']
    tracker['Real label'] = inp_df['label']
    for counter in ('times_correct', 'times_test'):
        tracker[counter] = 0
    return tracker


def split_train_val_test(features, labels, test_size, seed):
    """Split the data into train, validation and test partitions.

    Two consecutive stratified, shuffled splits are made with the same
    random seed. The first one holds out ``test_size`` of the data as the
    validation set. The second one carves the test set out of the remainder,
    using the ratio ``len(validation) / len(remainder)`` as its size so that
    the test set ends up about as large as the validation set.

    :param features: feature table, row-aligned with ``labels``
    :param labels: frame providing the 'label' and 'pmid' columns
    :param test_size: size of the validation hold-out, forwarded to
        ``train_test_split``
    :param seed: random state used for both splits
    :return: x_train, x_val, x_test, y_train, y_val, y_test, pmids_train,
        pmids_val, pmids_test
    """
    shuffle_opts = dict(shuffle=True, random_state=seed)

    # First cut: hold out the validation partition.
    x_rest, x_val, y_rest, y_val, pmids_rest, pmids_val = train_test_split(
        features,
        labels['label'],
        labels['pmid'],
        test_size=test_size,
        stratify=labels['label'],
        **shuffle_opts,
    )

    # Second cut: size the test partition relative to what is left.
    test_fraction = len(y_val) / len(y_rest)
    x_train, x_test, y_train, y_test, pmids_train, pmids_test = train_test_split(
        x_rest,
        y_rest,
        pmids_rest,
        test_size=test_fraction,
        stratify=y_rest,
        **shuffle_opts,
    )
    return x_train, x_val, x_test, y_train, y_val, y_test, pmids_train, pmids_val, pmids_test
69 changes: 28 additions & 41 deletions scripts/bootstrap_bow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,18 @@
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tqdm import tqdm
import argparse
from pk_classifier.bootstrap import Tokenizer, TextSelector, f1_eval, plot_it, update, read_in_bow
from pk_classifier.bootstrap import Tokenizer, TextSelector, f1_eval, plot_it, update, read_in_bow, make_ids_per_test, \
split_train_val_test


def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figure, out_path_bootstrap):
all_features, all_labs = read_in_bow(input_tuple[0], input_tuple[1])

ids_per_test = make_ids_per_test(inp_df=all_labs)
all_metrics_test = []
ids_per_test = pd.DataFrame(all_labs['pmid'], columns=['pmid'])
ids_per_test['Dataset'] = all_labs['Dataset']
ids_per_test['Real label'] = all_labs.label
ids_per_test['times_correct'] = 0
ids_per_test['times_test'] = 0

optimal_epochs = []
median_optimal_epochs = []
median_f1s = []
Expand All @@ -33,21 +28,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
# Make splits: 60% train, 20% validation, 20% temp test
# ======================================================================================================

x_train, x_val, y_train, y_val, pmids_train, pmids_val = train_test_split(all_features,
all_labs['label'],
all_labs['pmid'],
test_size=per,
shuffle=True,
random_state=rd_seed,
stratify=all_labs['label'])
new_per = len(y_val) / len(y_train)
x_train, x_test, y_train, y_test, pmids_train, pmids_test = train_test_split(x_train,
y_train,
pmids_train,
test_size=new_per,
shuffle=True,
random_state=rd_seed,
stratify=y_train)
x_train, x_val, x_test, y_train, y_val, y_test, pmids_train, pmids_val, pmids_test = \
split_train_val_test(features=all_features, labels=all_labs, test_size=per, seed=rd_seed)

# =====================================================================================================
# Decide max number of iterations using early stopping criteria on the validation set
Expand All @@ -64,7 +46,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
scale_pos_weight=balancing_factor, nthread=-1)

# Define encoding pipeline
EncPip = Pipeline([
enc_pip = Pipeline([
('encoder', FeatureUnion(transformer_list=[
('bow', Pipeline([
('selector', TextSelector('BoW_Ready', emb=False)),
Expand All @@ -75,8 +57,8 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
]))
])

x_train_features = EncPip.fit_transform(x_train)
x_val_features = EncPip.transform(x_val)
x_train_features = enc_pip.fit_transform(x_train)
x_val_features = enc_pip.transform(x_val)
if a == 0:
print("Using: ", x_train_features.shape[1], "features")
a = 1
Expand All @@ -96,7 +78,7 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
# Apply predictions to the temp test set
# ======================================================================================================

x_test_encoded = EncPip.transform(x_test)
x_test_encoded = enc_pip.transform(x_test)
pred_test = decoder.predict(x_test_encoded)
test_results = pd.DataFrame(pred_test == y_test.values, columns=['Result'])
test_results['Result'] = test_results['Result'].astype(int)
Expand Down Expand Up @@ -130,18 +112,20 @@ def process_them(input_tuple, rounds, test_prop, out_path_results, out_path_figu
ids_per_val.to_csv(out_path_bootstrap)


def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels: str):
def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels: str, overwrite: bool):
if not os.path.isdir(output_dir):
os.makedirs(output_dir, exist_ok=True)
if not os.path.isdir(output_dir_bootstrap):
os.makedirs(output_dir_bootstrap, exist_ok=True)

for inp_file in os.listdir(input_dir):
inp_dev_files = [inp_file for inp_file in os.listdir(input_dir) if 'test' not in inp_file]

for inp_file in inp_dev_files:
inp_path = os.path.join(input_dir, inp_file)
experiment_name = inp_file.replace("dev_", "").replace(".parquet", "")
print("================== ", experiment_name, "=============================")
# Define output
if "res_" + experiment_name + ".csv" not in os.listdir(output_dir):
if "res_" + experiment_name + ".csv" not in os.listdir(output_dir) or overwrite:
out_res = os.path.join(output_dir, "res_" + experiment_name + ".csv")
out_fig = os.path.join(output_dir_bootstrap, "res_" + experiment_name + ".png")
out_dev = os.path.join(output_dir_bootstrap, "bootstrap_" + experiment_name + ".csv")
Expand All @@ -154,23 +138,26 @@ def run(input_dir: str, output_dir: str, output_dir_bootstrap: str, path_labels:

def main():
    """Parse the command-line arguments and launch the bootstrap run.

    All options are forwarded as keyword arguments to ``run``. The
    ``--overwrite`` flag accepts an explicit textual boolean
    (``--overwrite True`` / ``--overwrite False``) or can be given bare
    (``--overwrite``), which means True.
    """

    def _parse_bool(value):
        # argparse's ``type=bool`` applies bool() to the raw string, so every
        # non-empty value -- including "False" -- would be read as True.
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('yes', 'true', 't', 'y', '1'):
            return True
        if lowered in ('no', 'false', 'f', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError('Boolean value expected.')

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-dir", type=str, help="The directory with files containing the encoded "
                                                      "documents in parquet format. It will iterate over all "
                                                      "files in this directory")

    parser.add_argument("--output-dir", type=str, help="Output directory to save the results of each bootstrap "
                                                       "iteration in a csv file.")

    parser.add_argument("--output-dir-bootstrap", type=str, help="Output directory to save the boostrap "
                                                                 "results as the misclassification "
                                                                 "rates per document during bootstrap.")

    parser.add_argument("--path-labels", type=str, help="Path to the csv containing the labels of the training "
                                                        "(dev) set")

    # nargs='?' with const=True keeps "--overwrite True" working and also
    # allows a bare "--overwrite".
    parser.add_argument("--overwrite", type=_parse_bool, nargs='?', const=True, default=False,
                        help="Whether to overwrite files if results already present in the "
                             "output directory.")

    args = parser.parse_args()
    run(input_dir=args.input_dir, output_dir=args.output_dir, output_dir_bootstrap=args.output_dir_bootstrap,
        path_labels=args.path_labels, overwrite=args.overwrite)


if __name__ == '__main__':
Expand Down

0 comments on commit b2829cd

Please sign in to comment.