In [2]:
import allel
import numpy as np
import pandas as pd
from functools import reduce
import glob

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt 
import seaborn as sns
from numpy import nan
from sklearn.preprocessing import OrdinalEncoder

# Parse the datafiles and save into dataframes

In [2]:
folders = ['real1', 'real2_part1', 'real2_part2'] + [f'syn{i}' for i in range(1,6)]

data_dicts = {}
final_target_features = {'freebayes': ['SOMATIC', 'QUAL', 'ID', 'QR', 'DPB', 'CHROM', 'POS', 'is_snp'],
 'mutect2': ['FILTER_PASS','ID','ECNT','MQ','FILTER_t_lod_fstar','HCNT','DP','CHROM','POS','is_snp',],
 'vardict': ['FILTER_PASS','STATUS','ID','SSF','VD','SOMATIC','CHROM','POS','is_snp'],
 'varscan': ['SOMATIC', 'ID', 'SPV', 'FILTER_PASS', 'CHROM', 'POS', 'is_snp']}

In [43]:
for f in folders:
    print(f'generating df of {f}')
    #read files
    if f != 'real2_part1' and f!= 'real2_part2':
        varscan_sub = allel.vcf_to_dataframe(f"Data/{f}/{f}-varscan.vcf.gz", fields = final_target_features['varscan'])
        freebayes_sub = allel.vcf_to_dataframe(f"Data/{f}/{f}-freebayes.vcf.gz", fields = final_target_features['freebayes'])
        mutect2_sub = allel.vcf_to_dataframe(f"Data/{f}/{f}-mutect2.vcf.gz", fields = final_target_features['mutect2'])
        vardict_sub = allel.vcf_to_dataframe(f"Data/{f}/{f}-vardict.vcf.gz", fields = final_target_features['vardict'])
    elif f == 'real2_part1':
        varscan_sub = allel.vcf_to_dataframe(f"Data/{f}/real2_varscan_chr1to5.vcf.gz", fields = final_target_features['varscan'])
        freebayes_sub = allel.vcf_to_dataframe(f"Data/{f}/real2_freebayes_chr1to5.vcf.gz", fields = final_target_features['freebayes'])
        mutect2_sub = allel.vcf_to_dataframe(f"Data/{f}/real2_mutect_chr1to5.vcf.gz", fields = final_target_features['mutect2'])
        vardict_sub = allel.vcf_to_dataframe(f"Data/{f}/real2_vardict_chr1to5.vcf.gz", fields = final_target_features['vardict'])
    else:
        varscan_sub = allel.vcf_to_dataframe(f"Data/{f}/real2_varscan_rest.vcf.gz", fields = final_target_features['varscan'])
        freebayes_sub = allel.vcf_to_dataframe(f"Data/{f}/real2_freebayes_rest.vcf.gz", fields = final_target_features['freebayes'])
        mutect2_sub = allel.vcf_to_dataframe(f"Data/{f}/real2_mutect_rest.vcf.gz", fields = final_target_features['mutect2'])
        vardict_sub = allel.vcf_to_dataframe(f"Data/{f}/real2_vardict_rest.vcf.gz", fields = final_target_features['vardict'])

    #isolate snp only
    varscan = varscan_sub[varscan_sub.is_snp == True]
    freebayes = freebayes_sub[freebayes_sub.is_snp == True]
    mutect2 = mutect2_sub[mutect2_sub.is_snp == True]
    vardict = vardict_sub[vardict_sub.is_snp == True]

    #edit column lables of each df
    lst_dfs = [varscan_sub,freebayes_sub,mutect2_sub,vardict_sub]
    suffix = ['vs','fb','m2','vd']
    keep_same = {'CHROM', 'POS'}
    
    for i in range(len(lst_dfs)):
        df = lst_dfs[i]
        df.columns = ['{}{}'.format(c, '' if c in keep_same else '_'+suffix[i]) for c in df.columns]
    
    #binary ID values
    for s in suffix:
        id_var = f'ID_{s}'
        new_id = f'ID_binary_{s}'
        df[new_id] = df[id_var] != '.'

    df = df.drop(columns=[f'ID_{alg}' for s in suffix], inplace=True)
    
    #merge the dfs together
    merged_df = reduce(lambda left, right: pd.merge(left, right,on =['CHROM', 'POS'],
                                            how = 'outer', suffixes = ('', '')),lst_dfs)
    merged_df = merged_df.drop(['is_snp_vd','is_snp_fb','is_snp_m2','is_snp_vs'], axis=1)

    #add in the trute values in *truth.bed files
    if f == 'real2_part2':
        print('real2_part2, no truth values')
        data_dicts[f] = merged_df
        merged_df.to_csv(f"parse_files_new_features/{f}_merged_df.csv")
        continue
    if f != 'real2_part1':
        truth_labels = pd.read_csv(f'Data/{f}/{f}_truth.bed', sep = '\t', names = ['Chromo', 'start', 'end'])
    else:
        truth_labels = pd.read_csv(f'Data/{f}/real2_truth_chr1to5.bed', sep = '\t', names = ['Chromo', 'start', 'end'])
        truth_labels = truth_labels.astype({'Chromo': 'object', 'start': 'int32', 'end': 'int32'})
    if sum(set(truth_labels.start == truth_labels.end)): #check the start and end pos are the same
        truth_labels = truth_labels[['Chromo', 'start']]
        truth_labels['truth'] = 1
        sub_truth = truth_labels.rename(columns = {'Chromo': 'CHROM', 'start': 'POS'})
        
        #what if all 4 callers did not call the pos in truth......
        final_df = merged_df.merge(sub_truth, on=['CHROM','POS'], how = 'outer')
        final_df['truth'].fillna(0, inplace = True)
    else:
        print('Not an SNP in truth.bed file')
        continue

    #save the files
    print(f'generated {f} df')
    data_dicts[f] = final_df
    final_df.to_csv(f"parse_files_new_features/{f}_merged_df.csv")
    break

generating df of real1


KeyboardInterrupt: 

In [60]:
data_dicts['real1'].dtypes.to_dict()

{'SOMATIC_vs': CategoricalDtype(categories=['False', 'True'], ordered=False),
 'SPV_vs': dtype('float32'),
 'FILTER_PASS_vs': CategoricalDtype(categories=['False', 'True'], ordered=False),
 'CHROM': CategoricalDtype(categories=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
                   '12', '13', '14', '15', '16', '17', '18', '19', '20', '21',
                   '22', 'X', 'Y', 'GL000191.1', 'GL000196.1', 'GL000197.1',
                   'GL000198.1', 'GL000201.1', 'GL000202.1', 'GL000203.1',
                   'GL000204.1', 'GL000206.1', 'GL000207.1', 'GL000208.1',
                   'GL000210.1', 'GL000214.1', 'GL000226.1', 'GL000227.1',
                   'GL000228.1', 'GL000229.1', 'GL000230.1', 'GL000231.1',
                   'GL000232.1', 'GL000233.1', 'GL000234.1', 'GL000235.1',
                   'GL000236.1', 'GL000237.1', 'GL000238.1', 'GL000239.1',
                   'GL000240.1', 'GL000241.1', 'GL000242.1', 'GL000243.1',
                   'GL000244.1', '

# Generate training data

In [7]:
# types = {'SOMATIC_vs': 'O', 'ID_vs': 'O', 'SPV_vs': 'float32', 'FILTER_PASS_vs': 'O', 'CHROM': 'O', 'POS': 'int64', 
#          'SOMATIC_fb': 'O', 'QUAL_fb': 'float32', 'ID_fb': 'O', 'QR_fb': 'float64', 'DPB_fb': 'float32', 
#          'FILTER_PASS_m2': 'O', 'ID_m2': 'O', 'ECNT_m2': 'O', 'MQ_m2': 'float32', 'FILTER_t_lod_fstar_m2': 'O', 
#          'HCNT_m2': 'O', 'DP_m2': 'float64', 'FILTER_PASS_vd': 'O', 'STATUS_vd': 'O', 'ID_vd': 'O', 'SSF_vd': 'float32', 
#          'VD_vd': 'float64', 'SOMATIC_vd': 'O', 'truth': 'float64'}

# for col in X.columns:
#     if X[col].dtype == 'O':
#         X[col] = X[col].astype('category')

types = {'SOMATIC_vs': 'category', 'ID_binary_vs': 'category', 'SPV_vs': 'float32', 'FILTER_PASS_vs': 'category', 'CHROM': 'category', 'POS': 'int64', 
         'SOMATIC_fb': 'category', 'QUAL_fb': 'float32', 'ID_binary_fb': 'category', 'QR_fb': 'float64', 'DPB_fb': 'float32', 
         'FILTER_PASS_m2': 'category', 'ID_binary_m2': 'category', 'ECNT_m2': 'category', 'MQ_m2': 'float32', 'FILTER_t_lod_fstar_m2': 'category', 
         'HCNT_m2': 'category', 'DP_m2': 'float64', 'FILTER_PASS_vd': 'category', 'STATUS_vd': 'category', 'ID_binary_vd': 'category', 'SSF_vd': 'float32', 
         'VD_vd': 'float64', 'SOMATIC_vd': 'category', 'truth': 'float64'}

In [8]:
#if haven't read the files yet

folders = ['real1', 'real2_part1', 'real2_part2'] + [f'syn{i}' for i in range(1,6)]

data_dicts = {}

for i in folders:
    data_dicts[i] = pd.read_csv(f'parse_files_new_features/{i}_merged_df.csv', index_col = 0, dtype = types)

data_dicts.keys()

dict_keys(['real1', 'real2_part1', 'real2_part2', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'])

# Testing what data to be used

In [6]:
synthetic = [f'syn{i}' for i in range(1,6)]

conditions = [['real1'], ['real1', 'real2_part1']] + [['real1', i] for i in synthetic] + [['real1', 'real2_part1', i] for i in synthetic]

random_states = 1

print(conditions)

[['real1'], ['real1', 'real2_part1'], ['real1', 'syn1'], ['real1', 'syn2'], ['real1', 'syn3'], ['real1', 'syn4'], ['real1', 'syn5'], ['real1', 'real2_part1', 'syn1'], ['real1', 'real2_part1', 'syn2'], ['real1', 'real2_part1', 'syn3'], ['real1', 'real2_part1', 'syn4'], ['real1', 'real2_part1', 'syn5']]


In [7]:
#key = condition
#value = results dataframe: rows = datasets that were tested on, cols = precision, recall, F1, 
results = {}

In [29]:
for cond in conditions[1:]:
    #concatenate the datasets if c > 1
    if len(cond) == 1:
        df = data_dicts[cond[0]]
    else:
        df = pd.concat([data_dicts[data] for data in cond], ignore_index = True)
        #concatenation migh overwrite some category dtypes
        for col in df.columns:
            if df[col].dtype == 'O':
                df[col] = df[col].astype('category')

    X = df[df.columns[~df.columns.isin(['truth','POS','CHROM'])]]

    y = df['truth'] 

    #split into test, train, and validation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = random_states)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = random_states)
    eval_set = [(X_train, y_train), (X_val, y_val)]

    #do the fitting
    model = XGBClassifier(eval_metric='logloss',enable_categorical=True,tree_method='approx')
    model.fit(X_train, y_train, early_stopping_rounds = 50, eval_set = eval_set, verbose = True)

    stats_dict = {}

    y_pred = model.predict(X_test)
    stats = [metrics.precision_score, metrics.recall_score, metrics.f1_score]

    stats_dict['test'] = [g(y_test, y_pred) for g in stats]

    for name, data_set in data_dicts.items():
        if name == 'real2_part2':
            continue
        X_sub = data_set[data_set.columns[~data_set.columns.isin(['truth','POS','CHROM'])]]
        y_sub = data_set['truth']
        y_sub_pred = model.predict(X_sub)
        stats_dict[name] = [g(y_sub, y_sub_pred) for g in stats]

    stats_df = pd.DataFrame.from_dict(stats_dict)
    stats_df.index = ['precision', 'recall', 'f1']

    cond = '-'.join(cond)

    results[cond] = stats_df
    stats_df.to_csv('data_set_test/all_models_result.csv', mode = 'a', header = False)
    model.save_model(f'data_set_test/trained_models/{cond}.json')



[0]	validation_0-logloss:0.43760	validation_1-logloss:0.43760
[1]	validation_0-logloss:0.29645	validation_1-logloss:0.29646
[2]	validation_0-logloss:0.20752	validation_1-logloss:0.20753
[3]	validation_0-logloss:0.14803	validation_1-logloss:0.14805
[4]	validation_0-logloss:0.10687	validation_1-logloss:0.10689
[5]	validation_0-logloss:0.07779	validation_1-logloss:0.07781
[6]	validation_0-logloss:0.05678	validation_1-logloss:0.05681
[7]	validation_0-logloss:0.04170	validation_1-logloss:0.04174
[8]	validation_0-logloss:0.03072	validation_1-logloss:0.03076
[9]	validation_0-logloss:0.02269	validation_1-logloss:0.02273
[10]	validation_0-logloss:0.01678	validation_1-logloss:0.01682
[11]	validation_0-logloss:0.01244	validation_1-logloss:0.01249
[12]	validation_0-logloss:0.00923	validation_1-logloss:0.00928
[13]	validation_0-logloss:0.00687	validation_1-logloss:0.00693
[14]	validation_0-logloss:0.00513	validation_1-logloss:0.00519
[15]	validation_0-logloss:0.00383	validation_1-logloss:0.00389
[1



[0]	validation_0-logloss:0.43754	validation_1-logloss:0.43754
[1]	validation_0-logloss:0.29636	validation_1-logloss:0.29637
[2]	validation_0-logloss:0.20740	validation_1-logloss:0.20742
[3]	validation_0-logloss:0.14788	validation_1-logloss:0.14791
[4]	validation_0-logloss:0.10670	validation_1-logloss:0.10673
[5]	validation_0-logloss:0.07759	validation_1-logloss:0.07762
[6]	validation_0-logloss:0.05674	validation_1-logloss:0.05677
[7]	validation_0-logloss:0.04166	validation_1-logloss:0.04169
[8]	validation_0-logloss:0.03067	validation_1-logloss:0.03071
[9]	validation_0-logloss:0.02264	validation_1-logloss:0.02268
[10]	validation_0-logloss:0.01674	validation_1-logloss:0.01678
[11]	validation_0-logloss:0.01240	validation_1-logloss:0.01244
[12]	validation_0-logloss:0.00920	validation_1-logloss:0.00924
[13]	validation_0-logloss:0.00684	validation_1-logloss:0.00688
[14]	validation_0-logloss:0.00510	validation_1-logloss:0.00514
[15]	validation_0-logloss:0.00380	validation_1-logloss:0.00385
[1



[0]	validation_0-logloss:0.43754	validation_1-logloss:0.43754
[1]	validation_0-logloss:0.29636	validation_1-logloss:0.29637
[2]	validation_0-logloss:0.20741	validation_1-logloss:0.20742
[3]	validation_0-logloss:0.14789	validation_1-logloss:0.14791
[4]	validation_0-logloss:0.10671	validation_1-logloss:0.10672
[5]	validation_0-logloss:0.07761	validation_1-logloss:0.07762
[6]	validation_0-logloss:0.05676	validation_1-logloss:0.05678
[7]	validation_0-logloss:0.04167	validation_1-logloss:0.04170
[8]	validation_0-logloss:0.03069	validation_1-logloss:0.03071
[9]	validation_0-logloss:0.02265	validation_1-logloss:0.02268
[10]	validation_0-logloss:0.01675	validation_1-logloss:0.01678
[11]	validation_0-logloss:0.01241	validation_1-logloss:0.01244
[12]	validation_0-logloss:0.00921	validation_1-logloss:0.00924
[13]	validation_0-logloss:0.00685	validation_1-logloss:0.00688
[14]	validation_0-logloss:0.00511	validation_1-logloss:0.00514
[15]	validation_0-logloss:0.00381	validation_1-logloss:0.00385
[1



[0]	validation_0-logloss:0.43759	validation_1-logloss:0.43760
[1]	validation_0-logloss:0.29645	validation_1-logloss:0.29646
[2]	validation_0-logloss:0.20752	validation_1-logloss:0.20752
[3]	validation_0-logloss:0.14802	validation_1-logloss:0.14803
[4]	validation_0-logloss:0.10685	validation_1-logloss:0.10686
[5]	validation_0-logloss:0.07776	validation_1-logloss:0.07777
[6]	validation_0-logloss:0.05692	validation_1-logloss:0.05693
[7]	validation_0-logloss:0.04184	validation_1-logloss:0.04186
[8]	validation_0-logloss:0.03087	validation_1-logloss:0.03088
[9]	validation_0-logloss:0.02284	validation_1-logloss:0.02286
[10]	validation_0-logloss:0.01695	validation_1-logloss:0.01697
[11]	validation_0-logloss:0.01261	validation_1-logloss:0.01263
[12]	validation_0-logloss:0.00942	validation_1-logloss:0.00944
[13]	validation_0-logloss:0.00706	validation_1-logloss:0.00708
[14]	validation_0-logloss:0.00532	validation_1-logloss:0.00534
[15]	validation_0-logloss:0.00403	validation_1-logloss:0.00405
[1



[0]	validation_0-logloss:0.43803	validation_1-logloss:0.43805
[1]	validation_0-logloss:0.29713	validation_1-logloss:0.29717
[2]	validation_0-logloss:0.20838	validation_1-logloss:0.20842
[3]	validation_0-logloss:0.14902	validation_1-logloss:0.14907
[4]	validation_0-logloss:0.10795	validation_1-logloss:0.10802
[5]	validation_0-logloss:0.07894	validation_1-logloss:0.07902
[6]	validation_0-logloss:0.05817	validation_1-logloss:0.05826
[7]	validation_0-logloss:0.04315	validation_1-logloss:0.04325
[8]	validation_0-logloss:0.03222	validation_1-logloss:0.03233
[9]	validation_0-logloss:0.02423	validation_1-logloss:0.02434
[10]	validation_0-logloss:0.01837	validation_1-logloss:0.01848
[11]	validation_0-logloss:0.01406	validation_1-logloss:0.01418
[12]	validation_0-logloss:0.01087	validation_1-logloss:0.01099
[13]	validation_0-logloss:0.00852	validation_1-logloss:0.00865
[14]	validation_0-logloss:0.00678	validation_1-logloss:0.00691
[15]	validation_0-logloss:0.00546	validation_1-logloss:0.00559
[1



[0]	validation_0-logloss:0.43774	validation_1-logloss:0.43775
[1]	validation_0-logloss:0.29667	validation_1-logloss:0.29669
[2]	validation_0-logloss:0.20780	validation_1-logloss:0.20782
[3]	validation_0-logloss:0.14835	validation_1-logloss:0.14838
[4]	validation_0-logloss:0.10721	validation_1-logloss:0.10725
[5]	validation_0-logloss:0.07815	validation_1-logloss:0.07819
[6]	validation_0-logloss:0.05733	validation_1-logloss:0.05737
[7]	validation_0-logloss:0.04226	validation_1-logloss:0.04232
[8]	validation_0-logloss:0.03130	validation_1-logloss:0.03136
[9]	validation_0-logloss:0.02329	validation_1-logloss:0.02335
[10]	validation_0-logloss:0.01740	validation_1-logloss:0.01747
[11]	validation_0-logloss:0.01306	validation_1-logloss:0.01314
[12]	validation_0-logloss:0.00987	validation_1-logloss:0.00996
[13]	validation_0-logloss:0.00750	validation_1-logloss:0.00759
[14]	validation_0-logloss:0.00576	validation_1-logloss:0.00586
[15]	validation_0-logloss:0.00447	validation_1-logloss:0.00457
[1



[0]	validation_0-logloss:0.43757	validation_1-logloss:0.43759
[1]	validation_0-logloss:0.29641	validation_1-logloss:0.29644
[2]	validation_0-logloss:0.20747	validation_1-logloss:0.20751
[3]	validation_0-logloss:0.14797	validation_1-logloss:0.14802
[4]	validation_0-logloss:0.10680	validation_1-logloss:0.10686
[5]	validation_0-logloss:0.07771	validation_1-logloss:0.07778
[6]	validation_0-logloss:0.05687	validation_1-logloss:0.05694
[7]	validation_0-logloss:0.04179	validation_1-logloss:0.04187
[8]	validation_0-logloss:0.03082	validation_1-logloss:0.03090
[9]	validation_0-logloss:0.02279	validation_1-logloss:0.02288
[10]	validation_0-logloss:0.01689	validation_1-logloss:0.01698
[11]	validation_0-logloss:0.01255	validation_1-logloss:0.01265
[12]	validation_0-logloss:0.00936	validation_1-logloss:0.00946
[13]	validation_0-logloss:0.00700	validation_1-logloss:0.00710
[14]	validation_0-logloss:0.00524	validation_1-logloss:0.00533
[15]	validation_0-logloss:0.00382	validation_1-logloss:0.00385
[1



[0]	validation_0-logloss:0.43759	validation_1-logloss:0.43759
[1]	validation_0-logloss:0.29643	validation_1-logloss:0.29643
[2]	validation_0-logloss:0.20750	validation_1-logloss:0.20750
[3]	validation_0-logloss:0.14800	validation_1-logloss:0.14800
[4]	validation_0-logloss:0.10683	validation_1-logloss:0.10683
[5]	validation_0-logloss:0.07774	validation_1-logloss:0.07774
[6]	validation_0-logloss:0.05690	validation_1-logloss:0.05690
[7]	validation_0-logloss:0.04182	validation_1-logloss:0.04182
[8]	validation_0-logloss:0.03084	validation_1-logloss:0.03085
[9]	validation_0-logloss:0.02281	validation_1-logloss:0.02282
[10]	validation_0-logloss:0.01692	validation_1-logloss:0.01693
[11]	validation_0-logloss:0.01259	validation_1-logloss:0.01259
[12]	validation_0-logloss:0.00939	validation_1-logloss:0.00940
[13]	validation_0-logloss:0.00704	validation_1-logloss:0.00705
[14]	validation_0-logloss:0.00529	validation_1-logloss:0.00530
[15]	validation_0-logloss:0.00400	validation_1-logloss:0.00401
[1



[0]	validation_0-logloss:0.43763	validation_1-logloss:0.43763
[1]	validation_0-logloss:0.29651	validation_1-logloss:0.29650
[2]	validation_0-logloss:0.20760	validation_1-logloss:0.20759
[3]	validation_0-logloss:0.14812	validation_1-logloss:0.14810
[4]	validation_0-logloss:0.10696	validation_1-logloss:0.10695
[5]	validation_0-logloss:0.07788	validation_1-logloss:0.07787
[6]	validation_0-logloss:0.05705	validation_1-logloss:0.05703
[7]	validation_0-logloss:0.04199	validation_1-logloss:0.04197
[8]	validation_0-logloss:0.03102	validation_1-logloss:0.03101
[9]	validation_0-logloss:0.02298	validation_1-logloss:0.02297
[10]	validation_0-logloss:0.01701	validation_1-logloss:0.01702
[11]	validation_0-logloss:0.01261	validation_1-logloss:0.01262
[12]	validation_0-logloss:0.00938	validation_1-logloss:0.00940
[13]	validation_0-logloss:0.00701	validation_1-logloss:0.00702
[14]	validation_0-logloss:0.00526	validation_1-logloss:0.00528
[15]	validation_0-logloss:0.00398	validation_1-logloss:0.00399
[1



[0]	validation_0-logloss:0.43800	validation_1-logloss:0.43801
[1]	validation_0-logloss:0.29709	validation_1-logloss:0.29710
[2]	validation_0-logloss:0.20832	validation_1-logloss:0.20834
[3]	validation_0-logloss:0.14895	validation_1-logloss:0.14898
[4]	validation_0-logloss:0.10789	validation_1-logloss:0.10791
[5]	validation_0-logloss:0.07889	validation_1-logloss:0.07891
[6]	validation_0-logloss:0.05811	validation_1-logloss:0.05814
[7]	validation_0-logloss:0.04310	validation_1-logloss:0.04313
[8]	validation_0-logloss:0.03217	validation_1-logloss:0.03220
[9]	validation_0-logloss:0.02419	validation_1-logloss:0.02422
[10]	validation_0-logloss:0.01828	validation_1-logloss:0.01832
[11]	validation_0-logloss:0.01381	validation_1-logloss:0.01387
[12]	validation_0-logloss:0.01047	validation_1-logloss:0.01055
[13]	validation_0-logloss:0.00811	validation_1-logloss:0.00819
[14]	validation_0-logloss:0.00638	validation_1-logloss:0.00645
[15]	validation_0-logloss:0.00509	validation_1-logloss:0.00517
[1



[0]	validation_0-logloss:0.43776	validation_1-logloss:0.43776
[1]	validation_0-logloss:0.29671	validation_1-logloss:0.29671
[2]	validation_0-logloss:0.20784	validation_1-logloss:0.20785
[3]	validation_0-logloss:0.14840	validation_1-logloss:0.14841
[4]	validation_0-logloss:0.10727	validation_1-logloss:0.10729
[5]	validation_0-logloss:0.07822	validation_1-logloss:0.07824
[6]	validation_0-logloss:0.05741	validation_1-logloss:0.05744
[7]	validation_0-logloss:0.04236	validation_1-logloss:0.04239
[8]	validation_0-logloss:0.03141	validation_1-logloss:0.03144
[9]	validation_0-logloss:0.02340	validation_1-logloss:0.02343
[10]	validation_0-logloss:0.01752	validation_1-logloss:0.01756
[11]	validation_0-logloss:0.01320	validation_1-logloss:0.01324
[12]	validation_0-logloss:0.01000	validation_1-logloss:0.01006
[13]	validation_0-logloss:0.00766	validation_1-logloss:0.00771
[14]	validation_0-logloss:0.00588	validation_1-logloss:0.00594
[15]	validation_0-logloss:0.00454	validation_1-logloss:0.00460
[1

In [86]:
# stats = [metrics.precision_score, metrics.recall_score, metrics.f1_score]

# for cond in conditions:
#     cond = '-'.join(cond)
#     model = XGBClassifier(eval_metric='logloss',enable_categorical=True,tree_method='approx')
#     model.load_model(f'data_set_test/trained_models/{cond}.json')
#     print(cond)

#     for name, data_set in data_dicts.items():
#         if name == 'real2_part2':
#             continue
#         print(name)
#         X_sub = data_set[data_set.columns[~data_set.columns.isin(['truth','POS','CHROM'])]]
#         y_sub = data_set['truth']
#         y_sub_pred = model.predict(X_sub)
#         results[cond][1][name] = [g(y_sub, y_sub_pred) for g in stats]

real1
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-real2_part1
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-syn1
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-syn2
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-syn3
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-syn4
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-syn5
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-real2_part1-syn1
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-real2_part1-syn2
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-real2_part1-syn3
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-real2_part1-syn4
real1
real2_part1
syn1
syn2
syn3
syn4
syn5
real1-real2_part1-syn5
real1
real2_part1
syn1
syn2
syn3
syn4
syn5


In [96]:
real2_part1_stats = pd.concat([results[i][1]['real2_part1'] for i in results], axis = 1)
real2_part1_stats.columns = [i for i in results]

#real2_part1_stats.to_csv('data_set_test/models_on_real2_part1.csv')

In [97]:
all_results = pd.concat([results[i][1] for i in results], axis = 0)

all_results.index = pd.MultiIndex.from_arrays([[name for name in results for i in range(3)], all_results.index])

#all_results.to_csv('data_set_test/all_models_result.csv')

## real+syn mixing

In [4]:
synthetic = [f'syn{i}' for i in range(1,6)]

conditions = [['real1'] + synthetic, ['real1', 'real2_part1'] + synthetic,   #all
              ['real1'] + synthetic[:2], ['real1', 'real2_part1'] + synthetic[:2],  #syn1, 2
              ['real1'] + synthetic[2:], ['real1', 'real2_part1'] + synthetic[2:]]  #syn 3, 4, 5

final_conditions = [[c, i] for c in conditions for i in [0.2, 0.33, 0.5, 1]]

random_states = 1

for i in final_conditions:
    print(i)

[['real1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'], 0.2]
[['real1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'], 0.33]
[['real1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'], 0.5]
[['real1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'], 1]
[['real1', 'real2_part1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'], 0.2]
[['real1', 'real2_part1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'], 0.33]
[['real1', 'real2_part1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'], 0.5]
[['real1', 'real2_part1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5'], 1]
[['real1', 'syn1', 'syn2'], 0.2]
[['real1', 'syn1', 'syn2'], 0.33]
[['real1', 'syn1', 'syn2'], 0.5]
[['real1', 'syn1', 'syn2'], 1]
[['real1', 'real2_part1', 'syn1', 'syn2'], 0.2]
[['real1', 'real2_part1', 'syn1', 'syn2'], 0.33]
[['real1', 'real2_part1', 'syn1', 'syn2'], 0.5]
[['real1', 'real2_part1', 'syn1', 'syn2'], 1]
[['real1', 'syn3', 'syn4', 'syn5'], 0.2]
[['real1', 'syn3', 'syn4', 'syn5'], 0.33]
[['real1', 'syn3', 'syn4', 'syn5'], 0.5]
[['real1', 'syn3', 'syn4', 'syn5'], 1]
[

In [5]:
#key = condition
#value = results dataframe: rows = datasets that were tested on, cols = precision, recall, F1, 
results2 = {}

In [9]:
for cond in final_conditions[4:]:
    #getting all dfs in the same list with stratified sampling or not
    name_lists = cond[0]
    ratio = cond[1]

    concat_lst = []

    for i in name_lists:
        df = data_dicts[i]
        if 'real' in i:
            concat_lst.append(df)
        elif ratio == 1:
            concat_lst.append(df)
        else:
            stratified = df.groupby('truth', group_keys=False).apply(lambda x: x.sample(frac = ratio, random_state = random_states))
            concat_lst.append(stratified)

    df = pd.concat(concat_lst, ignore_index = True)
        #concatenation migh overwrite some category dtypes
    for col in df.columns:
        if df[col].dtype == 'O':
            df[col] = df[col].astype('category')

    X = df[df.columns[~df.columns.isin(['truth','POS','CHROM'])]]

    y = df['truth'] 

    #split into test, train, and validation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = random_states)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = random_states)
    eval_set = [(X_train, y_train), (X_val, y_val)]

    #do the fitting
    model = XGBClassifier(eval_metric='logloss',enable_categorical=True,tree_method='approx')
    model.fit(X_train, y_train, early_stopping_rounds = 50, eval_set = eval_set, verbose = True)

    stats_dict = {}

    y_pred = model.predict(X_test)
    stats = [metrics.precision_score, metrics.recall_score, metrics.f1_score]

    stats_dict['test'] = [g(y_test, y_pred) for g in stats]

    for name, data_set in data_dicts.items():
        if name == 'real2_part2':
            continue
        X_sub = data_set[data_set.columns[~data_set.columns.isin(['truth','POS','CHROM'])]]
        y_sub = data_set['truth']
        y_sub_pred = model.predict(X_sub)
        stats_dict[name] = [g(y_sub, y_sub_pred) for g in stats]

    name = '-'.join(name_lists) + '_' + str(ratio)

    stats_df = pd.DataFrame.from_dict(stats_dict)
    stats_df.index = pd.MultiIndex.from_arrays([[name for i in range(3)], ['precision', 'recall', 'f1']])

    results2[name] = stats_df
    stats_df.to_csv('data_set_test/all_models_result2.csv', mode =  'a', header = False)
    model.save_model(f'data_set_test/trained_models/{name}.json')



[0]	validation_0-logloss:0.43773	validation_1-logloss:0.43772
[1]	validation_0-logloss:0.29666	validation_1-logloss:0.29665
[2]	validation_0-logloss:0.20779	validation_1-logloss:0.20777
[3]	validation_0-logloss:0.14833	validation_1-logloss:0.14832
[4]	validation_0-logloss:0.10720	validation_1-logloss:0.10718
[5]	validation_0-logloss:0.07814	validation_1-logloss:0.07812
[6]	validation_0-logloss:0.05733	validation_1-logloss:0.05731
[7]	validation_0-logloss:0.04228	validation_1-logloss:0.04225
[8]	validation_0-logloss:0.03132	validation_1-logloss:0.03130
[9]	validation_0-logloss:0.02331	validation_1-logloss:0.02329
[10]	validation_0-logloss:0.01744	validation_1-logloss:0.01742
[11]	validation_0-logloss:0.01312	validation_1-logloss:0.01310
[12]	validation_0-logloss:0.00993	validation_1-logloss:0.00992
[13]	validation_0-logloss:0.00754	validation_1-logloss:0.00754
[14]	validation_0-logloss:0.00570	validation_1-logloss:0.00571
[15]	validation_0-logloss:0.00434	validation_1-logloss:0.00435
[1



[0]	validation_0-logloss:0.43776	validation_1-logloss:0.43775
[1]	validation_0-logloss:0.29671	validation_1-logloss:0.29670
[2]	validation_0-logloss:0.20784	validation_1-logloss:0.20783
[3]	validation_0-logloss:0.14840	validation_1-logloss:0.14839
[4]	validation_0-logloss:0.10728	validation_1-logloss:0.10726
[5]	validation_0-logloss:0.07822	validation_1-logloss:0.07820
[6]	validation_0-logloss:0.05741	validation_1-logloss:0.05739
[7]	validation_0-logloss:0.04236	validation_1-logloss:0.04234
[8]	validation_0-logloss:0.03141	validation_1-logloss:0.03139
[9]	validation_0-logloss:0.02340	validation_1-logloss:0.02338
[10]	validation_0-logloss:0.01753	validation_1-logloss:0.01751
[11]	validation_0-logloss:0.01321	validation_1-logloss:0.01320
[12]	validation_0-logloss:0.01002	validation_1-logloss:0.01001
[13]	validation_0-logloss:0.00766	validation_1-logloss:0.00765
[14]	validation_0-logloss:0.00590	validation_1-logloss:0.00589
[15]	validation_0-logloss:0.00452	validation_1-logloss:0.00450
[1



[0]	validation_0-logloss:0.43777	validation_1-logloss:0.43778
[1]	validation_0-logloss:0.29674	validation_1-logloss:0.29675
[2]	validation_0-logloss:0.20788	validation_1-logloss:0.20789
[3]	validation_0-logloss:0.14844	validation_1-logloss:0.14846
[4]	validation_0-logloss:0.10731	validation_1-logloss:0.10734
[5]	validation_0-logloss:0.07826	validation_1-logloss:0.07829
[6]	validation_0-logloss:0.05746	validation_1-logloss:0.05749
[7]	validation_0-logloss:0.04240	validation_1-logloss:0.04244
[8]	validation_0-logloss:0.03145	validation_1-logloss:0.03149
[9]	validation_0-logloss:0.02345	validation_1-logloss:0.02349
[10]	validation_0-logloss:0.01757	validation_1-logloss:0.01762
[11]	validation_0-logloss:0.01325	validation_1-logloss:0.01330
[12]	validation_0-logloss:0.01006	validation_1-logloss:0.01011
[13]	validation_0-logloss:0.00768	validation_1-logloss:0.00774
[14]	validation_0-logloss:0.00585	validation_1-logloss:0.00592
[15]	validation_0-logloss:0.00449	validation_1-logloss:0.00456
[1



[0]	validation_0-logloss:0.43780	validation_1-logloss:0.43782
[1]	validation_0-logloss:0.29678	validation_1-logloss:0.29681
[2]	validation_0-logloss:0.20793	validation_1-logloss:0.20797
[3]	validation_0-logloss:0.14851	validation_1-logloss:0.14855
[4]	validation_0-logloss:0.10739	validation_1-logloss:0.10744
[5]	validation_0-logloss:0.07834	validation_1-logloss:0.07840
[6]	validation_0-logloss:0.05754	validation_1-logloss:0.05760
[7]	validation_0-logloss:0.04249	validation_1-logloss:0.04256
[8]	validation_0-logloss:0.03154	validation_1-logloss:0.03161
[9]	validation_0-logloss:0.02354	validation_1-logloss:0.02361
[10]	validation_0-logloss:0.01767	validation_1-logloss:0.01774
[11]	validation_0-logloss:0.01334	validation_1-logloss:0.01342
[12]	validation_0-logloss:0.01015	validation_1-logloss:0.01024
[13]	validation_0-logloss:0.00780	validation_1-logloss:0.00789
[14]	validation_0-logloss:0.00605	validation_1-logloss:0.00614
[15]	validation_0-logloss:0.00476	validation_1-logloss:0.00485
[1



[0]	validation_0-logloss:0.43754	validation_1-logloss:0.43755
[1]	validation_0-logloss:0.29636	validation_1-logloss:0.29637
[2]	validation_0-logloss:0.20741	validation_1-logloss:0.20742
[3]	validation_0-logloss:0.14789	validation_1-logloss:0.14791
[4]	validation_0-logloss:0.10671	validation_1-logloss:0.10674
[5]	validation_0-logloss:0.07761	validation_1-logloss:0.07764
[6]	validation_0-logloss:0.05676	validation_1-logloss:0.05679
[7]	validation_0-logloss:0.04168	validation_1-logloss:0.04171
[8]	validation_0-logloss:0.03069	validation_1-logloss:0.03073
[9]	validation_0-logloss:0.02266	validation_1-logloss:0.02270
[10]	validation_0-logloss:0.01676	validation_1-logloss:0.01680
[11]	validation_0-logloss:0.01242	validation_1-logloss:0.01247
[12]	validation_0-logloss:0.00922	validation_1-logloss:0.00928
[13]	validation_0-logloss:0.00686	validation_1-logloss:0.00692
[14]	validation_0-logloss:0.00512	validation_1-logloss:0.00518
[15]	validation_0-logloss:0.00383	validation_1-logloss:0.00389
[1



[0]	validation_0-logloss:0.43754	validation_1-logloss:0.43754
[1]	validation_0-logloss:0.29636	validation_1-logloss:0.29637
[2]	validation_0-logloss:0.20741	validation_1-logloss:0.20741
[3]	validation_0-logloss:0.14789	validation_1-logloss:0.14790
[4]	validation_0-logloss:0.10671	validation_1-logloss:0.10672
[5]	validation_0-logloss:0.07761	validation_1-logloss:0.07762
[6]	validation_0-logloss:0.05675	validation_1-logloss:0.05677
[7]	validation_0-logloss:0.04167	validation_1-logloss:0.04169
[8]	validation_0-logloss:0.03068	validation_1-logloss:0.03071
[9]	validation_0-logloss:0.02265	validation_1-logloss:0.02268
[10]	validation_0-logloss:0.01675	validation_1-logloss:0.01678
[11]	validation_0-logloss:0.01241	validation_1-logloss:0.01244
[12]	validation_0-logloss:0.00921	validation_1-logloss:0.00925
[13]	validation_0-logloss:0.00685	validation_1-logloss:0.00689
[14]	validation_0-logloss:0.00511	validation_1-logloss:0.00515
[15]	validation_0-logloss:0.00381	validation_1-logloss:0.00386
[1



[0]	validation_0-logloss:0.43754	validation_1-logloss:0.43754
[1]	validation_0-logloss:0.29636	validation_1-logloss:0.29636
[2]	validation_0-logloss:0.20741	validation_1-logloss:0.20741
[3]	validation_0-logloss:0.14790	validation_1-logloss:0.14790
[4]	validation_0-logloss:0.10672	validation_1-logloss:0.10672
[5]	validation_0-logloss:0.07762	validation_1-logloss:0.07762
[6]	validation_0-logloss:0.05676	validation_1-logloss:0.05677
[7]	validation_0-logloss:0.04168	validation_1-logloss:0.04168
[8]	validation_0-logloss:0.03069	validation_1-logloss:0.03070
[9]	validation_0-logloss:0.02266	validation_1-logloss:0.02267
[10]	validation_0-logloss:0.01676	validation_1-logloss:0.01677
[11]	validation_0-logloss:0.01242	validation_1-logloss:0.01243
[12]	validation_0-logloss:0.00922	validation_1-logloss:0.00923
[13]	validation_0-logloss:0.00685	validation_1-logloss:0.00688
[14]	validation_0-logloss:0.00511	validation_1-logloss:0.00513
[15]	validation_0-logloss:0.00382	validation_1-logloss:0.00385
[1



[0]	validation_0-logloss:0.43754	validation_1-logloss:0.43754
[1]	validation_0-logloss:0.29636	validation_1-logloss:0.29636
[2]	validation_0-logloss:0.20740	validation_1-logloss:0.20741
[3]	validation_0-logloss:0.14789	validation_1-logloss:0.14789
[4]	validation_0-logloss:0.10671	validation_1-logloss:0.10671
[5]	validation_0-logloss:0.07761	validation_1-logloss:0.07761
[6]	validation_0-logloss:0.05675	validation_1-logloss:0.05676
[7]	validation_0-logloss:0.04167	validation_1-logloss:0.04168
[8]	validation_0-logloss:0.03068	validation_1-logloss:0.03069
[9]	validation_0-logloss:0.02265	validation_1-logloss:0.02266
[10]	validation_0-logloss:0.01675	validation_1-logloss:0.01677
[11]	validation_0-logloss:0.01241	validation_1-logloss:0.01243
[12]	validation_0-logloss:0.00921	validation_1-logloss:0.00923
[13]	validation_0-logloss:0.00685	validation_1-logloss:0.00687
[14]	validation_0-logloss:0.00511	validation_1-logloss:0.00513
[15]	validation_0-logloss:0.00382	validation_1-logloss:0.00384
[1



[0]	validation_0-logloss:0.43759	validation_1-logloss:0.43760
[1]	validation_0-logloss:0.29645	validation_1-logloss:0.29646
[2]	validation_0-logloss:0.20751	validation_1-logloss:0.20754
[3]	validation_0-logloss:0.14802	validation_1-logloss:0.14805
[4]	validation_0-logloss:0.10685	validation_1-logloss:0.10689
[5]	validation_0-logloss:0.07777	validation_1-logloss:0.07781
[6]	validation_0-logloss:0.05693	validation_1-logloss:0.05698
[7]	validation_0-logloss:0.04185	validation_1-logloss:0.04190
[8]	validation_0-logloss:0.03087	validation_1-logloss:0.03093
[9]	validation_0-logloss:0.02285	validation_1-logloss:0.02290
[10]	validation_0-logloss:0.01695	validation_1-logloss:0.01701
[11]	validation_0-logloss:0.01255	validation_1-logloss:0.01260
[12]	validation_0-logloss:0.00928	validation_1-logloss:0.00932
[13]	validation_0-logloss:0.00692	validation_1-logloss:0.00695
[14]	validation_0-logloss:0.00517	validation_1-logloss:0.00521
[15]	validation_0-logloss:0.00388	validation_1-logloss:0.00392
[1



[0]	validation_0-logloss:0.43759	validation_1-logloss:0.43760
[1]	validation_0-logloss:0.29644	validation_1-logloss:0.29645
[2]	validation_0-logloss:0.20750	validation_1-logloss:0.20752
[3]	validation_0-logloss:0.14801	validation_1-logloss:0.14803
[4]	validation_0-logloss:0.10684	validation_1-logloss:0.10686
[5]	validation_0-logloss:0.07775	validation_1-logloss:0.07778
[6]	validation_0-logloss:0.05678	validation_1-logloss:0.05682
[7]	validation_0-logloss:0.04170	validation_1-logloss:0.04174
[8]	validation_0-logloss:0.03072	validation_1-logloss:0.03076
[9]	validation_0-logloss:0.02269	validation_1-logloss:0.02273
[10]	validation_0-logloss:0.01680	validation_1-logloss:0.01684
[11]	validation_0-logloss:0.01246	validation_1-logloss:0.01250
[12]	validation_0-logloss:0.00926	validation_1-logloss:0.00931
[13]	validation_0-logloss:0.00688	validation_1-logloss:0.00694
[14]	validation_0-logloss:0.00514	validation_1-logloss:0.00519
[15]	validation_0-logloss:0.00385	validation_1-logloss:0.00391
[1



[0]	validation_0-logloss:0.43759	validation_1-logloss:0.43759
[1]	validation_0-logloss:0.29643	validation_1-logloss:0.29644
[2]	validation_0-logloss:0.20750	validation_1-logloss:0.20750
[3]	validation_0-logloss:0.14800	validation_1-logloss:0.14801
[4]	validation_0-logloss:0.10683	validation_1-logloss:0.10684
[5]	validation_0-logloss:0.07774	validation_1-logloss:0.07775
[6]	validation_0-logloss:0.05690	validation_1-logloss:0.05692
[7]	validation_0-logloss:0.04183	validation_1-logloss:0.04185
[8]	validation_0-logloss:0.03085	validation_1-logloss:0.03087
[9]	validation_0-logloss:0.02282	validation_1-logloss:0.02284
[10]	validation_0-logloss:0.01693	validation_1-logloss:0.01695
[11]	validation_0-logloss:0.01258	validation_1-logloss:0.01260
[12]	validation_0-logloss:0.00938	validation_1-logloss:0.00940
[13]	validation_0-logloss:0.00701	validation_1-logloss:0.00703
[14]	validation_0-logloss:0.00527	validation_1-logloss:0.00529
[15]	validation_0-logloss:0.00397	validation_1-logloss:0.00400
[1



[0]	validation_0-logloss:0.43757	validation_1-logloss:0.43757
[1]	validation_0-logloss:0.29641	validation_1-logloss:0.29641
[2]	validation_0-logloss:0.20746	validation_1-logloss:0.20747
[3]	validation_0-logloss:0.14796	validation_1-logloss:0.14797
[4]	validation_0-logloss:0.10679	validation_1-logloss:0.10680
[5]	validation_0-logloss:0.07769	validation_1-logloss:0.07771
[6]	validation_0-logloss:0.05685	validation_1-logloss:0.05686
[7]	validation_0-logloss:0.04177	validation_1-logloss:0.04179
[8]	validation_0-logloss:0.03080	validation_1-logloss:0.03082
[9]	validation_0-logloss:0.02277	validation_1-logloss:0.02279
[10]	validation_0-logloss:0.01687	validation_1-logloss:0.01690
[11]	validation_0-logloss:0.01254	validation_1-logloss:0.01256
[12]	validation_0-logloss:0.00934	validation_1-logloss:0.00937
[13]	validation_0-logloss:0.00698	validation_1-logloss:0.00701
[14]	validation_0-logloss:0.00524	validation_1-logloss:0.00527
[15]	validation_0-logloss:0.00395	validation_1-logloss:0.00398
[1



[0]	validation_0-logloss:0.43775	validation_1-logloss:0.43775
[1]	validation_0-logloss:0.29669	validation_1-logloss:0.29668
[2]	validation_0-logloss:0.20781	validation_1-logloss:0.20781
[3]	validation_0-logloss:0.14836	validation_1-logloss:0.14835
[4]	validation_0-logloss:0.10723	validation_1-logloss:0.10722
[5]	validation_0-logloss:0.07816	validation_1-logloss:0.07816
[6]	validation_0-logloss:0.05735	validation_1-logloss:0.05735
[7]	validation_0-logloss:0.04229	validation_1-logloss:0.04229
[8]	validation_0-logloss:0.03133	validation_1-logloss:0.03134
[9]	validation_0-logloss:0.02332	validation_1-logloss:0.02333
[10]	validation_0-logloss:0.01744	validation_1-logloss:0.01746
[11]	validation_0-logloss:0.01312	validation_1-logloss:0.01313
[12]	validation_0-logloss:0.00993	validation_1-logloss:0.00995
[13]	validation_0-logloss:0.00758	validation_1-logloss:0.00760
[14]	validation_0-logloss:0.00584	validation_1-logloss:0.00587
[15]	validation_0-logloss:0.00455	validation_1-logloss:0.00458
[1



[0]	validation_0-logloss:0.43781	validation_1-logloss:0.43781
[1]	validation_0-logloss:0.29679	validation_1-logloss:0.29678
[2]	validation_0-logloss:0.20794	validation_1-logloss:0.20793
[3]	validation_0-logloss:0.14851	validation_1-logloss:0.14850
[4]	validation_0-logloss:0.10739	validation_1-logloss:0.10738
[5]	validation_0-logloss:0.07835	validation_1-logloss:0.07834
[6]	validation_0-logloss:0.05754	validation_1-logloss:0.05753
[7]	validation_0-logloss:0.04249	validation_1-logloss:0.04248
[8]	validation_0-logloss:0.03154	validation_1-logloss:0.03154
[9]	validation_0-logloss:0.02353	validation_1-logloss:0.02353
[10]	validation_0-logloss:0.01766	validation_1-logloss:0.01766
[11]	validation_0-logloss:0.01333	validation_1-logloss:0.01334
[12]	validation_0-logloss:0.01014	validation_1-logloss:0.01016
[13]	validation_0-logloss:0.00779	validation_1-logloss:0.00781
[14]	validation_0-logloss:0.00605	validation_1-logloss:0.00608
[15]	validation_0-logloss:0.00477	validation_1-logloss:0.00480
[1



[0]	validation_0-logloss:0.43786	validation_1-logloss:0.43786
[1]	validation_0-logloss:0.29686	validation_1-logloss:0.29687
[2]	validation_0-logloss:0.20803	validation_1-logloss:0.20804
[3]	validation_0-logloss:0.14861	validation_1-logloss:0.14863
[4]	validation_0-logloss:0.10750	validation_1-logloss:0.10753
[5]	validation_0-logloss:0.07846	validation_1-logloss:0.07849
[6]	validation_0-logloss:0.05766	validation_1-logloss:0.05770
[7]	validation_0-logloss:0.04261	validation_1-logloss:0.04266
[8]	validation_0-logloss:0.03167	validation_1-logloss:0.03172
[9]	validation_0-logloss:0.02366	validation_1-logloss:0.02373
[10]	validation_0-logloss:0.01779	validation_1-logloss:0.01785
[11]	validation_0-logloss:0.01346	validation_1-logloss:0.01354
[12]	validation_0-logloss:0.01028	validation_1-logloss:0.01036
[13]	validation_0-logloss:0.00793	validation_1-logloss:0.00801
[14]	validation_0-logloss:0.00619	validation_1-logloss:0.00628
[15]	validation_0-logloss:0.00491	validation_1-logloss:0.00500
[1



[0]	validation_0-logloss:0.43793	validation_1-logloss:0.43793
[1]	validation_0-logloss:0.29697	validation_1-logloss:0.29698
[2]	validation_0-logloss:0.20818	validation_1-logloss:0.20818
[3]	validation_0-logloss:0.14878	validation_1-logloss:0.14879
[4]	validation_0-logloss:0.10769	validation_1-logloss:0.10770
[5]	validation_0-logloss:0.07867	validation_1-logloss:0.07868
[6]	validation_0-logloss:0.05788	validation_1-logloss:0.05789
[7]	validation_0-logloss:0.04285	validation_1-logloss:0.04286
[8]	validation_0-logloss:0.03191	validation_1-logloss:0.03193
[9]	validation_0-logloss:0.02391	validation_1-logloss:0.02394
[10]	validation_0-logloss:0.01804	validation_1-logloss:0.01807
[11]	validation_0-logloss:0.01372	validation_1-logloss:0.01375
[12]	validation_0-logloss:0.01054	validation_1-logloss:0.01057
[13]	validation_0-logloss:0.00819	validation_1-logloss:0.00822
[14]	validation_0-logloss:0.00645	validation_1-logloss:0.00649
[15]	validation_0-logloss:0.00506	validation_1-logloss:0.00509
[1



[0]	validation_0-logloss:0.43777	validation_1-logloss:0.43775
[1]	validation_0-logloss:0.29672	validation_1-logloss:0.29670
[2]	validation_0-logloss:0.20785	validation_1-logloss:0.20783
[3]	validation_0-logloss:0.14841	validation_1-logloss:0.14838
[4]	validation_0-logloss:0.10728	validation_1-logloss:0.10726
[5]	validation_0-logloss:0.07823	validation_1-logloss:0.07821
[6]	validation_0-logloss:0.05742	validation_1-logloss:0.05739
[7]	validation_0-logloss:0.04237	validation_1-logloss:0.04235
[8]	validation_0-logloss:0.03142	validation_1-logloss:0.03140
[9]	validation_0-logloss:0.02341	validation_1-logloss:0.02339
[10]	validation_0-logloss:0.01754	validation_1-logloss:0.01752
[11]	validation_0-logloss:0.01322	validation_1-logloss:0.01320
[12]	validation_0-logloss:0.00999	validation_1-logloss:0.00998
[13]	validation_0-logloss:0.00750	validation_1-logloss:0.00751
[14]	validation_0-logloss:0.00569	validation_1-logloss:0.00570
[15]	validation_0-logloss:0.00436	validation_1-logloss:0.00437
[1



[0]	validation_0-logloss:0.43780	validation_1-logloss:0.43781
[1]	validation_0-logloss:0.29678	validation_1-logloss:0.29680
[2]	validation_0-logloss:0.20794	validation_1-logloss:0.20796
[3]	validation_0-logloss:0.14851	validation_1-logloss:0.14854
[4]	validation_0-logloss:0.10739	validation_1-logloss:0.10743
[5]	validation_0-logloss:0.07835	validation_1-logloss:0.07839
[6]	validation_0-logloss:0.05755	validation_1-logloss:0.05759
[7]	validation_0-logloss:0.04250	validation_1-logloss:0.04255
[8]	validation_0-logloss:0.03156	validation_1-logloss:0.03162
[9]	validation_0-logloss:0.02355	validation_1-logloss:0.02362
[10]	validation_0-logloss:0.01768	validation_1-logloss:0.01775
[11]	validation_0-logloss:0.01335	validation_1-logloss:0.01343
[12]	validation_0-logloss:0.01013	validation_1-logloss:0.01021
[13]	validation_0-logloss:0.00766	validation_1-logloss:0.00771
[14]	validation_0-logloss:0.00584	validation_1-logloss:0.00588
[15]	validation_0-logloss:0.00454	validation_1-logloss:0.00459
[1



[0]	validation_0-logloss:0.43786	validation_1-logloss:0.43788
[1]	validation_0-logloss:0.29686	validation_1-logloss:0.29689
[2]	validation_0-logloss:0.20803	validation_1-logloss:0.20807
[3]	validation_0-logloss:0.14861	validation_1-logloss:0.14867
[4]	validation_0-logloss:0.10751	validation_1-logloss:0.10757
[5]	validation_0-logloss:0.07847	validation_1-logloss:0.07854
[6]	validation_0-logloss:0.05768	validation_1-logloss:0.05775
[7]	validation_0-logloss:0.04264	validation_1-logloss:0.04272
[8]	validation_0-logloss:0.03169	validation_1-logloss:0.03178
[9]	validation_0-logloss:0.02370	validation_1-logloss:0.02379
[10]	validation_0-logloss:0.01783	validation_1-logloss:0.01793
[11]	validation_0-logloss:0.01351	validation_1-logloss:0.01362
[12]	validation_0-logloss:0.01031	validation_1-logloss:0.01043
[13]	validation_0-logloss:0.00796	validation_1-logloss:0.00807
[14]	validation_0-logloss:0.00618	validation_1-logloss:0.00630
[15]	validation_0-logloss:0.00477	validation_1-logloss:0.00487
[1



[0]	validation_0-logloss:0.43792	validation_1-logloss:0.43792
[1]	validation_0-logloss:0.29696	validation_1-logloss:0.29697
[2]	validation_0-logloss:0.20816	validation_1-logloss:0.20816
[3]	validation_0-logloss:0.14876	validation_1-logloss:0.14877
[4]	validation_0-logloss:0.10767	validation_1-logloss:0.10768
[5]	validation_0-logloss:0.07865	validation_1-logloss:0.07866
[6]	validation_0-logloss:0.05786	validation_1-logloss:0.05787
[7]	validation_0-logloss:0.04283	validation_1-logloss:0.04283
[8]	validation_0-logloss:0.03189	validation_1-logloss:0.03189
[9]	validation_0-logloss:0.02390	validation_1-logloss:0.02390
[10]	validation_0-logloss:0.01802	validation_1-logloss:0.01803
[11]	validation_0-logloss:0.01370	validation_1-logloss:0.01371
[12]	validation_0-logloss:0.01051	validation_1-logloss:0.01052
[13]	validation_0-logloss:0.00815	validation_1-logloss:0.00817
[14]	validation_0-logloss:0.00637	validation_1-logloss:0.00640
[15]	validation_0-logloss:0.00498	validation_1-logloss:0.00502
[1

In [None]:
all_results2 = pd.concat([results2[i] for i in results2], axis = 0)

all_results2
#all_results2.to_csv('data_set_test/all_models_result2.csv')