In [1]:
!pip install wget
!pip install catboost



In [2]:
import os,sys

from google.colab import drive
drive.mount('/content/drive')
sys.path.append("/content/drive/My Drive/Collab/collab_v1")
%cd "/content/drive/My Drive/Collab/collab_v1"

ModuleNotFoundError: No module named 'google.colab'

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

#some plotting routines
import plotting

# data processing
import data_higgs as dh

from catboost import CatBoostClassifier, Pool
# from google.colab import output
# output.enable_custom_widget_manager()

In [2]:
#--------- globals

# Need to see a large portion of the data before we can build a layer, for
# example half of data n_batches_per_layer =  NBATCH_FRAC * NUM_EXAMPLES / BATCH_SIZE
BATCH_SIZE = 1000

# Seed value
# Apparently you may use different seed values at each stage
SEED_VALUE= 10001
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(SEED_VALUE)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(SEED_VALUE)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(SEED_VALUE)
# # 4. Set `tensorflow` pseudo-random generator at a fixed value
# tf.random.set_seed(SEED_VALUE)

In [3]:
#-------- routines

def split_xy(rawdata):
    #split features and labels from data
    #prepare the data => normalizations !

    # split
    data_y=rawdata['hlabel'] # labels only: 0.=bkg, 1.=sig
    data_x=rawdata.drop(['hlabel'], axis=1) # features only

    #now prepare the data
    mu = data_x.mean()
    s = data_x.std()
    dmax = data_x.max()
    dmin = data_x.min()

    # normal/standard rescaling
    data_x = (data_x - mu)/s

    #scaling to [-1,1] range
    #data_x = -1. + 2.*(data_x - dmin)/(dmax-dmin)

    # scaling to [0,1] range
    #data_x = (data_x - dmin)/(dmax-dmin)


    return data_x,data_y

In [4]:
def main(nepoch=100):
        # Training progress and eval results are shown as logging.INFO; so enables it.

    #needs to be done once if the h5 files need to be recreated...
    # dh.download_and_make_data()

    hdata=dh.load_data()
    data_fnames=hdata['feature_names'].to_numpy()[1:] #labels not needed
    n_dims=data_fnames.shape[0]
    print ("Entries read {} with feature names {}".format(n_dims,data_fnames))

    x_trn,y_trn=split_xy(hdata['train']) # training sample, should split a fraction for testing
    x_train, x_test,y_train, y_test = train_test_split(x_trn,y_trn,test_size=0.1) # 10% split
    x_val,y_val=split_xy(hdata['valid']) # independent cross-valid sample

    print("Shapes train:{} and test:{}".format(x_train.shape,x_test.shape))

    #plot distributions
    plotting.plot_sig_bkg_from_np_arrays(x_train.to_numpy(),y_train.to_numpy(),data_fnames,logy=False)

    # ready the data for Catboost
    pool_train = Pool(data=x_train.to_numpy(),label=y_train.to_numpy(),feature_names=data_fnames.tolist())
    pool_test = Pool(data=x_test.to_numpy(),label=y_test.to_numpy(),feature_names=data_fnames.tolist())

    #NaKratko .... !

    # CatBoost parameters
    eval_metric = 'AUC' # see https://catboost.ai/docs/concepts/loss-functions-classification.html
    task_type = 'CPU'  # if GPU else 'CPU'
    max_number_of_trees = nepoch
    #eval_metric = 'Accuracy'
    bdt=CatBoostClassifier(
        verbose=True,
        task_type=task_type,
        loss_function='Logloss', # see values same as in eval_metric, e.g. CrossEntropy, Logloss is default
        iterations=max_number_of_trees,
        eval_metric=eval_metric,
        learning_rate=0.01,
        max_depth=6,
        use_best_model=False,
        random_seed=SEED_VALUE,
        )

    #now train the model
    bdt.fit(
            pool_train,
            early_stopping_rounds=20,
            #is early stopping relevant?
            eval_set=pool_test,
            plot=True # works when called through Ipython/jupyter
        )


    print()
    #plot & print results like ROC and score distribution etc...
    y_score=bdt.predict_proba(x_val.to_numpy())[:,1]
    print("score shape {}",y_score.shape)
    plotting.plot_roc(y_val,y_score)
    plotting.plot_score(y_val,y_score)
    auc=roc_auc_score(y_val,y_score)
    print("AUC score: {}".format(auc))

In [6]:
main(50)

Loading /home/luka/Documents/Programming/school/mp/ml/collab_v1/data/higgs-parsed/higgs-parsed.h5...
Loaded.
Entries read 28 with feature names ['lepton-pT' 'lepton-eta' 'lepton-phi' 'missing-energy'
 'missing-energy-phi' 'jet_1-pt' 'jet_1-eta' 'jet_1-phi' 'jet_1-b-tag'
 'jet_2-pt' 'jet_2-eta' 'jet_2-phi' 'jet_2-b-tag' 'jet_3-pt' 'jet_3-eta'
 'jet_3-phi' 'jet_3-b-tag' 'jet_4-pt' 'jet_4-eta' 'jet_4-phi'
 'jet_4-b-tag' 'm_jj' 'm_jjj' 'm_lv' 'm_jlv' 'm_bb' 'm_wbb' 'm_wwbb']
Shapes train:(360000, 28) and test:(40000, 28)


ValueError: num must be an integer with 1 <= num <= 18, not 19

Error in callback <function _draw_all_if_interactive at 0x7f46aadf3380> (for post_execute), with arguments args (),kwargs {}:


ValueError: Image size of 36000000x1000 pixels is too large. It must be less than 2^23 in each direction.

ValueError: Image size of 36000000x1000 pixels is too large. It must be less than 2^23 in each direction.

<Figure size 3.6e+07x1000 with 18 Axes>