# Pointwise Methods

In this notebook, we compare pointwise methods (regression, classification, and ordinal regression) and apply search methods to find the best-performing one. First, we load the required dependencies and the data.

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *
from sklearn.compose import ColumnTransformer


# Load the training data; `train_df` is passed as the first argument to every
# `create_pipeline` call in the cells below.
train_df = config.load_traindata_for_pointwise()
# A single factory instance is shared by all pipeline cells in this notebook.
pipelineFactory = PipelineFactory()

## Pointwise Regression

In [9]:
# Baseline: pointwise regression pipeline as configured by the factory,
# scored with 5-fold cross-validation on the "rank" target.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    verbose_level=1,
)

# Print the configured steps first so the output records exactly what was evaluated.
print(pipeline.get_pipeline().named_steps)
pipeline.run()

Creating pipeline ...
{'keeper': ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder']), 'encoder_transformer': PoincareEmbedding(batch_size=50, encoder=OneHotEncoder(), epochs=500,
                  graph=<networkx.classes.graph.Graph object at 0x00000286AC3EF610>,
                  size=3), 'dataset_transformer': OpenMLMetaFeatureTransformer(encoder=None, expected_pca_variance=0.6,
                             nan_ratio_feature_drop_threshold=0.25), 'general_transformer': GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=TargetEncoder(),
                                 tuning_encoder=TargetEncoder()), 'estimator': DecisionTreeRegressor()}
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████| 5/5 [00:30<00:00,  6.14s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.7653
    validation_average_spearman_fold_1: 0.7621
    validation_average_spearman_fold_2: 0.7324
    validation_average_spearman_fold_3: 0.7901
    validation_average_spearman_fold_4: 0.7757
    average of all folds: 0.7651 [std=0.0191]





In [None]:
# Candidate values for the grid search. The "<step>__<parameter>" key targets
# the `batch_size` parameter of the pipeline's `encoder_transformer` step.
param_grid = {"encoder_transformer__batch_size": [50, 500]}

# Same regression pipeline as the baseline, but evaluated via grid search
# over `param_grid` using 2 folds.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

## Pointwise Classification

In [8]:
# Baseline: run the pointwise classification pipeline without parameter tuning,
# scored with 5-fold cross-validation on the "rank" target.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=16,
    verbose_level=1,
)

# Optional debugging aid (disabled):
# pipeline.add_new_step(PrintDataframe(verbose=1), "print_dataframe_1")

# Print the configured steps first so the output records exactly what was evaluated.
print(pipeline.get_pipeline().named_steps)
pipeline.run()

Creating pipeline ...
{'keeper': ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder']), 'encoder_transformer': PoincareEmbedding(batch_size=50, encoder=OneHotEncoder(), epochs=500,
                  graph=<networkx.classes.graph.Graph object at 0x00000286A934FA90>,
                  size=3), 'dataset_transformer': OpenMLMetaFeatureTransformer(encoder=None, expected_pca_variance=0.6,
                             nan_ratio_feature_drop_threshold=0.25), 'general_transformer': GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=TargetEncoder(),
                                 tuning_encoder=TargetEncoder()), 'estimator': DecisionTreeClassifier()}
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████| 5/5 [00:30<00:00,  6.11s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.7669
    validation_average_spearman_fold_1: 0.7061
    validation_average_spearman_fold_2: 0.7036
    validation_average_spearman_fold_3: 0.7566
    validation_average_spearman_fold_4: 0.7458
    average of all folds: 0.7358 [std=0.0262]





We get a very good score of > 0.7 (average Spearman correlation across the five folds). Let's try to optimize it with a grid search.

In [None]:
# Candidate values for the grid search. The "<step>__<parameter>" key targets
# the `batch_size` parameter of the pipeline's `encoder_transformer` step.
param_grid = {"encoder_transformer__batch_size": [50, 500]}

# Same classification pipeline as the baseline, but evaluated via grid search
# over `param_grid` using 2 folds.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

## Pointwise Ordinal Regression

In [None]:
# Baseline: pointwise ordinal regression pipeline on the "rank" target.
# The evaluation mode is stated explicitly (5-fold cross-validation) so this
# baseline is set up the same way as the regression and classification
# baselines above; `n_folds=5` was already passed here, but the evaluation
# type was left to the factory's default.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_ORDINAL_REGRESSION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank"
)

pipeline.run()

In [None]:
# Candidate values for the grid search. The "<step>__<parameter>" key targets
# the `batch_size` parameter of the pipeline's `encoder_transformer` step.
param_grid = {"encoder_transformer__batch_size": [50, 500]}

# Ordinal regression pipeline evaluated via grid search over `param_grid`
# using 2 folds.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_ORDINAL_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

## Comparison