# Finding the right Estimator

In this notebook, we examine how different estimators perform in our prediction pipeline. First, we load the required dependencies and the data.

In [None]:
import os
import sys
# Put the parent directory of the notebook's working directory on the import
# path so that the `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders.one_hot import OneHotEncoder


from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

In [None]:
# load the data
# Training data comes from the project's configuration helper for the pointwise setting.
train_df = config.load_traindata_for_pointwise()
# One factory instance is shared by all three sections below to build their pipelines.
pipelineFactory = PipelineFactory()
# NOTE(review): `results` is not read anywhere in the visible notebook — confirm whether it is still needed.
results = {}

***
## 1) Pointwise Regression

In [None]:
# Settings handed to the factory for every regression experiment in this section:
# cross-validation evaluation with n_folds=5, a single worker, and "rank" as target.
pipeline_options = dict(
    evaluation=EvaluationType.CROSS_VALIDATION,
    verbose_level=1,
    n_folds=5,
    workers=1,
    target="rank",
)
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_NORMALIZED_REGRESSION_NO_SEARCH,
    **pipeline_options,
)

# Fresh collector: estimator name -> validation scores of that estimator's run.
scores = dict()

### 1.1) Baseline

In [None]:
from sklearn.dummy import DummyRegressor

# Baseline: plug a DummyRegressor (default arguments) into the shared pipeline and run it.
estimator = DummyRegressor()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'baseline' for the comparison plot.
scores.update(baseline=pipeline._validation_performance_scores)

### 1.2) Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Plug a LinearRegression (default arguments) into the shared pipeline and run it.
estimator = LinearRegression()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'linear_regression' for the comparison plot.
scores.update(linear_regression=pipeline._validation_performance_scores)

### 1.3) SVMs

In [None]:
from sklearn.svm import SVR

# Plug a support vector regressor (SVR, default arguments) into the shared pipeline and run it.
estimator = SVR()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'svm' for the comparison plot.
scores.update(svm=pipeline._validation_performance_scores)

### 1.4) K-Nearest-Neighbours

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Plug a KNeighborsRegressor (default arguments) into the shared pipeline and run it.
estimator = KNeighborsRegressor()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'knn' for the comparison plot.
scores.update(knn=pipeline._validation_performance_scores)

### 1.5) Decision Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Plug a DecisionTreeRegressor (default arguments, no fixed random_state) into the
# shared pipeline and run it.
estimator = DecisionTreeRegressor()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'decision_tree' for the comparison plot.
scores.update(decision_tree=pipeline._validation_performance_scores)

### 1.6) Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Plug a RandomForestRegressor (default arguments, no fixed random_state) into the
# shared pipeline and run it.
estimator = RandomForestRegressor()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'random_forest' for the comparison plot.
scores.update(random_forest=pipeline._validation_performance_scores)

### 1.7) Light Gradient Boosting Machine

In [None]:
from lightgbm import LGBMRegressor

# Plug an LGBMRegressor (default arguments) into the shared pipeline and run it.
estimator = LGBMRegressor()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'lgbm' for the comparison plot.
scores.update(lgbm=pipeline._validation_performance_scores)

### 1.8) Neural Networks

In [None]:
from sklearn.neural_network import MLPRegressor

# Plug an MLPRegressor (default arguments, no fixed random_state) into the shared
# pipeline and run it.
estimator = MLPRegressor()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'neural_network' for the comparison plot.
scores.update(neural_network=pipeline._validation_performance_scores)

In [None]:
# Summarise the regression experiments: one column per estimator (the keys of
# `scores`), one row per entry of the collected validation scores.
scores_df = pd.DataFrame(scores)

# Cells that hold a list are reduced to their first element.
# NOTE(review): any further list entries are ignored here — confirm that each
# list is expected to contain a single value.
# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
# `DataFrame.map`; use `map` where it exists and fall back on older pandas.
elementwise = scores_df.map if hasattr(scores_df, 'map') else scores_df.applymap
scores_df = elementwise(lambda x: x[0] if isinstance(x, list) else x)
scores_df.index = scores_df.index.str.replace('validation_', '')
scores_df = scores_df.rename_axis('metric')

# Average over the rows: the result is a Series indexed by estimator name.
scores_df = scores_df.mean()
display(scores_df.sort_values(ascending=False))

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
sns.barplot(x=scores_df, y=scores_df.index, ax=ax)
# This section evaluates the pointwise regression pipeline, so label it as such
# (the title previously read "pairwise_regression").
ax.set_title('pointwise_regression')
ax.set_xlabel('average_spearman')
# Extra left margin so the estimator names on the y axis are not cut off.
fig.subplots_adjust(left=0.25)
# NOTE(review): the file name still says "pairwise"; it is kept unchanged so that
# existing references to the SVG keep working — rename together with its consumers.
fig.savefig('pairwise_regression_estimator_comparison.svg')
plt.show()

***
## 2) Pointwise Classification

In [None]:
# Settings handed to the factory for every classification experiment in this section:
# cross-validation evaluation with n_folds=5, a single worker, and "rank" as target.
pipeline_options = dict(
    evaluation=EvaluationType.CROSS_VALIDATION,
    verbose_level=1,
    n_folds=5,
    workers=1,
    target="rank",
)
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    **pipeline_options,
)

# Fresh collector: estimator name -> validation scores of that estimator's run.
scores = dict()

### 2.1) Baseline

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: plug a DummyClassifier (default arguments) into the classification pipeline and run it.
estimator = DummyClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'baseline' for the comparison plot.
scores.update(baseline=pipeline._validation_performance_scores)

### 2.2) SVMs

In [None]:
from sklearn.svm import SVC

# Plug a support vector classifier (SVC, default arguments) into the classification pipeline and run it.
estimator = SVC()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'svm' for the comparison plot.
scores.update(svm=pipeline._validation_performance_scores)

### 2.3) K-Nearest-Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Plug a KNeighborsClassifier (default arguments) into the classification pipeline and run it.
estimator = KNeighborsClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'knn' for the comparison plot.
scores.update(knn=pipeline._validation_performance_scores)

### 2.4) Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Plug a DecisionTreeClassifier (default arguments, no fixed random_state) into the
# classification pipeline and run it.
estimator = DecisionTreeClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'decision_tree' for the comparison plot.
scores.update(decision_tree=pipeline._validation_performance_scores)

### 2.5) Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Plug a RandomForestClassifier (default arguments, no fixed random_state) into the
# classification pipeline and run it.
estimator = RandomForestClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'random_forest' for the comparison plot.
scores.update(random_forest=pipeline._validation_performance_scores)

### 2.6) Light Gradient Boosting Machine

In [None]:
from lightgbm import LGBMClassifier

# Plug an LGBMClassifier (default arguments) into the classification pipeline and run it.
estimator = LGBMClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'lgbm' for the comparison plot.
scores.update(lgbm=pipeline._validation_performance_scores)

### 2.7) Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

# Plug an MLPClassifier (default arguments, no fixed random_state) into the
# classification pipeline and run it.
estimator = MLPClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'neural_network' for the comparison plot.
scores.update(neural_network=pipeline._validation_performance_scores)

In [None]:
# Summarise the classification experiments: one column per estimator (the keys
# of `scores`), one row per entry of the collected validation scores.
scores_df = pd.DataFrame(scores)

# Cells that hold a list are reduced to their first element.
# NOTE(review): any further list entries are ignored here — confirm that each
# list is expected to contain a single value.
# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
# `DataFrame.map`; use `map` where it exists and fall back on older pandas.
elementwise = scores_df.map if hasattr(scores_df, 'map') else scores_df.applymap
scores_df = elementwise(lambda x: x[0] if isinstance(x, list) else x)
scores_df.index = scores_df.index.str.replace('validation_', '')
scores_df = scores_df.rename_axis('metric')

# Average over the rows: the result is a Series indexed by estimator name.
scores_df = scores_df.mean()
display(scores_df.sort_values(ascending=False))

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
sns.barplot(x=scores_df, y=scores_df.index, ax=ax)
# This section evaluates the pointwise classification pipeline, so label it as
# such (the title previously read "pairwise_classification").
ax.set_title('pointwise_classification')
ax.set_xlabel('average_spearman')
# Extra left margin so the estimator names on the y axis are not cut off.
fig.subplots_adjust(left=0.25)
# NOTE(review): the file name still says "pairwise"; it is kept unchanged so that
# existing references to the SVG keep working — rename together with its consumers.
fig.savefig('pairwise_classification_estimator_comparison.svg')
plt.show()

***
## 3) Pointwise Ordinal Regression

In [None]:
# Settings handed to the factory for every ordinal-regression experiment in this section:
# cross-validation evaluation with n_folds=5, a single worker, and "rank" as target.
pipeline_options = dict(
    evaluation=EvaluationType.CROSS_VALIDATION,
    verbose_level=1,
    n_folds=5,
    workers=1,
    target="rank",
)
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_ORDINAL_REGRESSION_NO_SEARCH,
    **pipeline_options,
)

# Fresh collector: estimator name -> validation scores of that estimator's run.
scores = dict()

### 3.1) Baseline

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: plug a DummyClassifier (default arguments) into the ordinal-regression pipeline and run it.
estimator = DummyClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'baseline' for the comparison plot.
scores.update(baseline=pipeline._validation_performance_scores)

### 3.2) SVMs

In [None]:
from sklearn.svm import SVC

# Plug a support vector classifier (SVC, default arguments) into the ordinal-regression pipeline and run it.
estimator = SVC()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'svm' for the comparison plot.
scores.update(svm=pipeline._validation_performance_scores)

### 3.3) K-Nearest-Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Plug a KNeighborsClassifier (default arguments) into the ordinal-regression pipeline and run it.
estimator = KNeighborsClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'knn' for the comparison plot.
scores.update(knn=pipeline._validation_performance_scores)

### 3.4) Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Plug a DecisionTreeClassifier (default arguments, no fixed random_state) into the
# ordinal-regression pipeline and run it.
estimator = DecisionTreeClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'decision_tree' for the comparison plot.
scores.update(decision_tree=pipeline._validation_performance_scores)

### 3.5) Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Plug a RandomForestClassifier (default arguments, no fixed random_state) into the
# ordinal-regression pipeline and run it.
estimator = RandomForestClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'random_forest' for the comparison plot.
scores.update(random_forest=pipeline._validation_performance_scores)

### 3.6) Light Gradient Boosting Machine

In [None]:
from lightgbm import LGBMClassifier

# Plug an LGBMClassifier (default arguments) into the ordinal-regression pipeline and run it.
estimator = LGBMClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'lgbm' for the comparison plot.
scores.update(lgbm=pipeline._validation_performance_scores)

### 3.7) Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

# Plug an MLPClassifier (default arguments, no fixed random_state) into the
# ordinal-regression pipeline and run it.
estimator = MLPClassifier()
pipeline.change_estimator(estimator)
pipeline.run()

# Keep the validation scores of this run under 'neural_network' for the comparison plot.
scores.update(neural_network=pipeline._validation_performance_scores)

In [None]:
# Summarise the ordinal-regression experiments: one column per estimator (the
# keys of `scores`), one row per entry of the collected validation scores.
scores_df = pd.DataFrame(scores)

# Cells that hold a list are reduced to their first element.
# NOTE(review): any further list entries are ignored here — confirm that each
# list is expected to contain a single value.
# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
# `DataFrame.map`; use `map` where it exists and fall back on older pandas.
elementwise = scores_df.map if hasattr(scores_df, 'map') else scores_df.applymap
scores_df = elementwise(lambda x: x[0] if isinstance(x, list) else x)
scores_df.index = scores_df.index.str.replace('validation_', '')
scores_df = scores_df.rename_axis('metric')

# Average over the rows: the result is a Series indexed by estimator name.
scores_df = scores_df.mean()
display(scores_df.sort_values(ascending=False))

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots()
sns.barplot(x=scores_df, y=scores_df.index, ax=ax)
# This section evaluates the pointwise ordinal-regression pipeline, so label it
# as such (the title previously read "pairwise_ordinal_regression").
ax.set_title('pointwise_ordinal_regression')
ax.set_xlabel('average_spearman')
# Extra left margin so the estimator names on the y axis are not cut off.
fig.subplots_adjust(left=0.25)
# NOTE(review): the file name still says "pairwise"; it is kept unchanged so that
# existing references to the SVG keep working — rename together with its consumers.
fig.savefig('pairwise_ordinal_regression_estimator_comparison.svg')
plt.show()