# Pointwise Classification

In this notebook, we examine the pointwise classification approach, including tuning and trying different estimators. First, we load the required dependencies and the data.

In [4]:
import os
import sys
import warnings

# Silence warnings in this kernel. PYTHONWARNINGS is additionally exported so
# that Python subprocesses started from this kernel inherit the same setting.
warnings.filterwarnings('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'

# Put the directory two levels above the working directory on the import path;
# the project-local `src` package imported further down is expected to live there.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from datetime import timedelta
from sklearn.compose import ColumnTransformer
from time import time

from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *


# Load the training data for the pointwise approach via the project configuration module.
train_df = config.load_traindata_for_pointwise()
# Factory that the following cells use to build the individual pipelines.
pipelineFactory = PipelineFactory()

In [5]:
# Count how often each rank value occurs for every (model, tuning, scoring)
# combination. The result is a pandas Series indexed by
# (model, tuning, scoring, rank), despite the `_df` suffix in its name.
grouped_df = train_df.groupby(["model", "tuning", "scoring"])["rank"].value_counts()
print(grouped_df)

# Save the counts as CSV in the figure directory. `to_csv` does not create
# missing parent directories, so make sure the target folder exists first.
path = os.path.join(config.FIGURE_DIR, "pointwise_ranking.csv")
os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
grouped_df.to_csv(path)

model  tuning  scoring  rank
DTC    full    ACC      0.0     135
                        1.0      71
                        2.0      50
                        3.0      43
                        5.0      39
                               ... 
SVC    no      F1       27.0      9
                        28.0      7
                        29.0      6
                        30.0      5
                        31.0      2
Name: count, Length: 1146, dtype: int64


## Pointwise Classification

In [6]:
# Time one plain run of the pointwise classifier: no parameter tuning,
# evaluated with 5-fold cross validation.
start = time()

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    evaluation=EvaluationType.CROSS_VALIDATION,
    verbose_level=1,
    n_folds=5,
    workers=16,
    target="rank",
)

# Debugging hint: an extra step can be inserted to print the intermediate frame, e.g.
#   pipeline.add_new_step(PrintDataframe(verbose=1), "print_dataframe_1")

# Show the assembled steps before running so the configuration is part of the output.
print(pipeline.get_pipeline().named_steps)
pipeline.run()

# Report the elapsed wall-clock time both as H:MM:SS and in whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
{'keeper': ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder']), 'encoder_transformer': PoincareEmbedding(batch_size=50, encoder=OneHotEncoder(), epochs=500,
                  graph=<networkx.classes.graph.Graph object at 0x00000216B2600C40>,
                  size=3), 'dataset_transformer': OpenMLMetaFeatureTransformer(encoder=None, expected_pca_variance=0.6,
                             nan_ratio_feature_drop_threshold=0.25), 'general_transformer': GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=TargetEncoder(),
                                 tuning_encoder=TargetEncoder()), 'estimator': DecisionTreeClassifier()}
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:46<00:00,  9.34s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.6486
    validation_average_spearman_fold_1: 0.6103
    validation_average_spearman_fold_2: 0.6211
    validation_average_spearman_fold_3: 0.6254
    validation_average_spearman_fold_4: 0.636
    average of all folds: 0.6283 [std=0.0131]

runtime: 0:00:54 [54s]





### Tuning with Bayes Search

In [3]:
start = time()

# Bayes-search budget. The number of optimization rounds is n_iter / n_points
# (e.g. 200 / 4 = 50 rounds with the values below).
#   n_iter   - how many unique parameters to examine                    (our default: 200)
#   n_points - how many unique parameter combinations per round         (our default: 4)
#   cv       - how many fits for each unique parameter combination      (our default: 4)
#   n_jobs   - how many fits in parallel (only parallelizable per round) (our default: -1)
n_iter, n_points, cv, n_jobs = 200, 4, 4, -1

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_BAYES_SEARCH,
    verbose_level=1,
    target="rank",
    bayes_n_iter=n_iter,
    bayes_n_points=n_points,
    bayes_cv=cv,
    bayes_n_jobs=n_jobs,
)

pipeline.run()

# Report the elapsed wall-clock time both as H:MM:SS and in whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
Starting pipeline using method: EvaluationType.BAYES_SEARCH
Performing bayes search
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:50<00:00, 10.16s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.6463 [std=0.]
    validation_average_spearman_fold_1: 0.6021 [std=0.]
    validation_average_spearman_fold_2: 0.6093 [std=0.]
    validation_average_spearman_fold_3: 0.6285 [std=0.]
    validation_average_spearman_fold_4: 0.6012 [std=0.]
    average_spearman (5-fold): 0.6175 [std=0.0174]

runtime: 1:24:25 [5065s]





## Runtime on Server Infrastructure

In [None]:
import os
import sys
import warnings

# Silence warnings in this process. PYTHONWARNINGS is additionally exported so
# that Python subprocesses started from here inherit the same setting.
warnings.filterwarnings('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'

# Put the directory two levels above the working directory on the import path;
# the project-local `src` package imported further down is expected to live there.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from datetime import timedelta
from sklearn.compose import ColumnTransformer
from time import time

from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *

# Load the pointwise training data and create the factory used to build the pipeline.
train_df = config.load_traindata_for_pointwise()
pipelineFactory = PipelineFactory()

# Time one plain run of the pointwise classifier: no parameter tuning,
# evaluated with 5-fold cross validation.
start = time()

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    evaluation=EvaluationType.CROSS_VALIDATION,
    verbose_level=1,
    n_folds=5,
    workers=16,
    target="rank",
)
pipeline.run()

# Report the elapsed wall-clock time both as H:MM:SS and in whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

```
Creating pipeline ...
Starting pipeline using method: EvaluationType.CROSS_VALIDATION
Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.6354
    validation_average_spearman_fold_1: 0.6061
    validation_average_spearman_fold_2: 0.618
    validation_average_spearman_fold_3: 0.6298
    validation_average_spearman_fold_4: 0.6323
    average of all folds: 0.6243 [std=0.0108]

runtime: 0:00:51 [51s]

```