# Imports

In [1]:
import pandas as pd
import numpy as np
import catboost as cb
import xgboost as xgb
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error, ConfusionMatrixDisplay
from sklearn.model_selection import ParameterSampler, RandomizedSearchCV
from scipy.stats.distributions import expon
from scipy.stats import uniform
from sklearn.metrics import ConfusionMatrixDisplay
import os
import glob
import re
import h2o
from h2o.automl import H2OAutoML

from tqdm import tqdm
import pickle
from autoxgb import AutoXGB
from autoxgb.cli.predict import PredictAutoXGBCommand

import os
import sys
# Put the parent directory of the notebook's working directory on sys.path
# (once) before the gps_lib imports below run.
# NOTE(review): presumably gps_lib lives in that parent directory — confirm repo layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
%reload_ext autoreload
%autoreload 2
import gps_lib.parse_raw_utils as p_utils
import gps_lib.exp_utils as e_utils
import gps_lib.data_sets as ds
import gps_lib.experiment as exp


%matplotlib inline
# Legacy NumPy RandomState with a fixed seed (42) so random draws repeat across runs.
rng = np.random.RandomState(42)

# Load Data Classes

In [2]:
# Location of the species dictionary JSON.
species_dict_path = "../resources/species_dict.json"

# Input locations for every data collection, keyed by collection name.
# Every entry carries the same four keys; 'filter_list' is the empty string
# for collections that have no filter file.
# NOTE(review): key meanings are inferred from their names — 'geno'/'pheno'
# presumably point at genotype results and phenotype (AST) tables, 'run2bio'
# presumably at a run-to-biosample mapping; confirm against gps_lib.data_sets.
resources = {
    'PATAKI': {
        'geno': '../resources/28.12.21/Pataki_paper/PATAKI_final_for_Amit.2021.12.28/Pataki.results.for.Amit',
        # NOTE(review): this path sits under 26.12.21 while the other PATAKI
        # paths sit under 28.12.21 — confirm this is intentional.
        'pheno': '../resources/26.12.21/Pataki_paper/AST_2548_all',
        'run2bio': '../resources/28.12.21/Pataki_paper/PATAKI_final_for_Amit.2021.12.28/PATAKI_full_SAM_and_SRR_list.xlsx',
        'filter_list': '../resources/28.12.21/Pataki_paper/PATAKI_final_for_Amit.2021.12.28/PATAKI_filtered_SRR_list_for_Amit.xlsx',
    },
    'VAMP': {
        'geno': '../resources/28.12.21/VAMPr_3400samples/VAMP_final_for_Amit.2021.12.28/VAMPr.results.for.Amit',
        'pheno': '../resources/28.12.21/VAMPr_3400samples/VAMP_final_for_Amit.2021.12.28/VAMP_full_AST_data',
        # The two paths below previously contained an accidental double slash
        # ("28.12.21//VAMPr_..."); normalised to match the sibling entries.
        'run2bio': '../resources/28.12.21/VAMPr_3400samples/VAMP_final_for_Amit.2021.12.28/VAMP_full_SAM_and_SRR_list.csv',
        'filter_list': '../resources/28.12.21/VAMPr_3400samples/VAMP_final_for_Amit.2021.12.28/VAMP_filtered_SRR_list.20211228.xlsx',
    },
    'PA': {
        'geno': "../resources/data/PA.dataset.400.for.Amit/",
        'pheno': '../resources/data/Pseudomonas_paper_AST.xlsx',
        'run2bio': '../resources/data/PA.dataset.400.RunInfo.xlsx',
        'filter_list': '',
    },
    'PATRIC': {
        # NOTE(review): absolute, machine-specific path — the notebook only
        # runs where this directory is mounted.
        'geno': '/sise/liorrk-group/AmitdanwMaranoMotroy/all.QC.passed.spades.20220313/',
        'pheno': '../resources/data/PATRIC_AMR_ESKAPE_etal_with_numericalAST_only.xlsx',
        'run2bio': '../resources/data/PATRIC_genome_final_db.20220223.xlsx',
        'filter_list': '',
    },
}

In [3]:
# Build the combined dataset object from the per-collection path mapping `resources`.
# NOTE(review): the warning fragment in this cell's output points at
# self._load_pheno(), so construction apparently reads the phenotype tables
# right away — confirm in gps_lib.data_sets.
data = ds.CollectionDataSet(all_path_dict=resources)

  self._load_pheno()


# Run models

## AutoXGB (AXGB)

In [19]:
# Model settings handed to exp.run_exp for the AutoXGB run.
# NOTE(review): the units of 'train_time' and the exact meaning of
# 'max_models' are not visible in this notebook — confirm in gps_lib.experiment.
model_param = dict(
    model='autoxgb',
    train_time=1,
    max_models=100,
)

In [11]:
# Scratch check: preview the species selector list used by the experiment cell
# further down (one species name followed by the NumPy integers 0-4).
['Pseudomonas aeruginosa']+list(np.arange(5))

['Pseudomonas aeruginosa', 0, 1, 2, 3, 4]

In [None]:
# Launch the experiment sweep over the selected antibiotics and species.
# No dataset-parameter overrides are passed.
ds_param = None

# Selectors are kept as NumPy integers (not plain ints), exactly as before.
# NOTE(review): integer selectors presumably pick antibiotics/species by
# index or rank inside exp.run_exp — confirm in gps_lib.experiment.
anti_list = [*np.arange(20)]
species_list = ['Pseudomonas aeruginosa', *np.arange(5)]

exp.run_exp(
    data,
    model_param,
    ds_param,
    antibiotic=anti_list,
    species=species_list,
)

2022-06-10 21:43:03.324 | INFO     | autoxgb.autoxgb:__post_init__:42 - Output directory: ../experiments/Pseudomonas aeruginosa_ceftazidime_species_sep_True antibiotic_sep_True/model_autoxgb_train_time_1_max_models_100/model
2022-06-10 21:43:03.326 | INFO     | autoxgb.autoxgb:_process_data:149 - Reading training data
2022-06-10 21:43:04.362 | INFO     | autoxgb.utils:reduce_memory_usage:50 - Mem. usage decreased to 0.95 Mb (74.9% reduction)
2022-06-10 21:43:04.363 | INFO     | autoxgb.autoxgb:_determine_problem_type:140 - Problem type: single_column_regression
  df[self.idx] = np.arange(len(df))
2022-06-10 21:43:05.286 | INFO     | autoxgb.utils:reduce_memory_usage:50 - Mem. usage decreased to 0.25 Mb (74.9% reduction)
2022-06-10 21:43:05.287 | INFO     | autoxgb.autoxgb:_create_folds:58 - Creating folds
  train_df["kfold"] = -1
2022-06-10 21:43:05.343 | INFO     | autoxgb.autoxgb:_process_data:195 - Found 0 categorical features.
2022-06-10 21:43:07.945 | INFO     | autoxgb.autoxgb:_p