# Building pipeline

In [None]:
import numpy as np
import pandas as pd

# Load the monitoring dataset and replace the numeric EngagementLevel codes
# with human-readable labels.
engagement_labels = {1: "Disengaged", 2: "Moderately Engaged", 3: "Highly Engaged"}
dataset = pd.read_csv("../api/data/emotional_monitoring_dataset_with_target.csv")
dataset["EngagementLevel"] = dataset["EngagementLevel"].map(engagement_labels)

## PreProcessor

It fills missing (NA) values, if any occur, with the column means learned during fitting.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreProcessor(BaseEstimator, TransformerMixin):
    """Impute missing values with the per-column means learned in ``fit``.

    ``fit`` stores one mean per column listed in ``_MEAN_ATTRIBUTES`` on the
    instance (for example ``mean_heart_rate``); ``transform`` uses those
    stored means to fill NA values in the same columns.
    """

    # Column name -> instance attribute holding that column's training mean.
    _MEAN_ATTRIBUTES = {
        "HeartRate": "mean_heart_rate",
        "SkinConductance": "mean_skin_conductance",
        "EEG": "mean_eeg",
        "Temperature": "mean_temperature",
        "PupilDiameter": "mean_pupil_diameter",
        "SmileIntensity": "mean_smile_intensity",
        "FrownIntensity": "mean_frown_intensity",
        "CortisolLevel": "mean_cortisol_level",
        "ActivityLevel": "mean_activity_level",
        "AmbientNoiseLevel": "mean_noise_level",
        "LightingLevel": "mean_light_level",
    }

    def __init__(self):
        pass

    def fit(self, X, y = None, **fit_params):
        """Learn the mean of every imputed column.

        Args:
            X: DataFrame containing all columns named in ``_MEAN_ATTRIBUTES``.
            y: Ignored; accepted for scikit-learn API compatibility.
            **fit_params: Ignored.

        Returns:
            The fitted transformer (``self``).
        """
        for column, attribute in self._MEAN_ATTRIBUTES.items():
            setattr(self, attribute, X[column].mean())
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NA values replaced by the fitted means.

        Args:
            X: DataFrame containing all columns named in ``_MEAN_ATTRIBUTES``.

        Returns:
            A new DataFrame; the caller's frame is left untouched.
        """
        # Work on a copy so the caller's frame (often a slice handed out by
        # cross-validation) is not modified as a side effect.
        X = X.copy()
        # The option is reached through the ``pd`` namespace: a bare
        # ``option_context`` is not imported anywhere in this file.
        with pd.option_context("future.no_silent_downcasting", True):
            for column, attribute in self._MEAN_ATTRIBUTES.items():
                fill_value = getattr(self, attribute)
                X[column] = X[column].fillna(fill_value).infer_objects(copy = False)
        return X

## Feature Selection

We will use the Beam Search algorithm for feature selection.

Inside, each candidate feature subset is evaluated with a gradient boosting classifier (`GradientBoostingClassifier`).

Candidates are ranked by their K-fold cross-validation score (macro-averaged F1).

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

class BeamSearch(BaseEstimator, TransformerMixin):
    """Forward beam-search feature selector scored by cross-validation.

    Starting from single features, the search repeatedly keeps the ``beam``
    best-scoring subsets and grows each of them by one unused feature.
    Subsets are scored with a ``GradientBoostingClassifier`` using the sum of
    the per-fold macro-F1 scores.

    Args:
        beam: Number of best subsets kept after every round.
        folds: Value passed as ``cv`` to ``cross_val_score``.
        gb_estimators: ``n_estimators`` of the scoring classifier.
        stop_criterion: The search stops once this many rounds have passed
            without improving the best score.

    Attributes set by ``fit``:
        best_score: Best summed cross-validation score that was found.
        best_features: Column positions (in the fit-time frame) of the best
            subset, in ascending order.
        feature_names_: Column names of the frame passed to ``fit``.
    """

    def __init__(self, beam = 5, folds = 5, gb_estimators = 25, stop_criterion = 2):
        self.beam = beam
        self.folds = folds
        self.gb_estimators = gb_estimators
        self.stop_criterion = stop_criterion

    def fit(self, X, y):
        """Search for the best-scoring feature subset of ``X``.

        Args:
            X: DataFrame of candidate features.
            y: Target values passed on to the scoring classifier.

        Returns:
            The fitted selector (``self``).
        """
        # Remember the names so transform() can select by name later on.
        self.feature_names_ = X.columns.tolist()
        n_features = len(self.feature_names_)
        candidates = [[i] for i in range(n_features)]
        # -inf (rather than 0) guarantees that a subset is recorded even when
        # every candidate scores exactly 0.
        best_score = float("-inf")
        best_features = []
        best_dim = 0
        for dim in range(n_features):
            if not candidates:
                break
            ranked = sorted(
                ((self.evaluate_score(X, y, subset), subset) for subset in candidates),
                reverse = True,
            )
            top_score, top_subset = ranked[0]
            if top_score > best_score:
                best_score = top_score
                best_features = top_subset
                best_dim = dim
            if dim - best_dim >= self.stop_criterion:
                break
            survivors = [subset for _, subset in ranked[:self.beam]]
            candidates = self._expand(survivors, n_features)
        self.best_score = best_score
        self.best_features = best_features
        return self

    @staticmethod
    def _expand(subsets, n_features):
        """Grow every subset by one unused feature, skipping duplicate sets."""
        seen = set()
        grown = []
        for subset in subsets:
            used = set(subset)
            for feature in range(n_features):
                if feature in used:
                    continue
                # Sorting makes [a, b] and [b, a] identical, so the same
                # feature set is never scored twice nor given two beam slots.
                candidate = tuple(sorted((*subset, feature)))
                if candidate not in seen:
                    seen.add(candidate)
                    grown.append(list(candidate))
        return grown

    def evaluate_score(self, X, y, J):
        """Return the summed cross-validated macro-F1 of the columns at positions ``J``.

        Args:
            X: DataFrame of candidate features.
            y: Target values.
            J: Iterable of column positions in ``X``.

        Returns:
            Sum of the per-fold ``f1_macro`` scores.
        """
        columns = X.columns.values.tolist()
        X_sub = X[[columns[i] for i in J]]
        gb = GradientBoostingClassifier(n_estimators = self.gb_estimators)
        return sum(cross_val_score(gb, X_sub, y, cv = self.folds, scoring = "f1_macro"))

    def transform(self, X):
        """Return only the columns selected during ``fit``.

        Positions in ``best_features`` are resolved against the column names
        seen at fit time, so a frame whose columns arrive in a different order
        still yields the intended features.

        Args:
            X: DataFrame containing the selected columns.

        Returns:
            DataFrame restricted to the selected columns.
        """
        names = getattr(self, "feature_names_", None)
        if names is None:
            # Fall back to positional lookup when no fit-time names are stored
            # (e.g. ``best_features`` was assigned by hand).
            names = X.columns.values.tolist()
        return X[[names[i] for i in self.best_features]]

## Making Pipeline

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Chain imputation, feature selection and the final classifier.
# make_pipeline names each step after its lowercased class name, which is what
# the "beamsearch__..." / "gradientboostingclassifier__..." grid keys rely on.
pipeline_steps = (PreProcessor(), BeamSearch(), GradientBoostingClassifier())
pipe = make_pipeline(*pipeline_steps)
pipe

In [None]:
# Features are every column except the three dropped below;
# EngagementLevel is the prediction target.
excluded_columns = ["EmotionalState", "CognitiveState", "EngagementLevel"]
X = dataset.drop(columns = excluded_columns)
y = dataset["EngagementLevel"]

# A fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 777)

In [None]:
from sklearn.model_selection import GridSearchCV

# Wider search grid, currently commented out:
#boosting_params = {
#        "gradientboostingclassifier__n_estimators": [20, 25, 30],
#        "beamsearch__beam": [4, 5, 6],
#        "beamsearch__gb_estimators": [20, 25, 30]
#    }

# Active grid: a single parameter combination.
boosting_params = dict(
    gradientboostingclassifier__n_estimators = [20],
    beamsearch__beam = [2],
    beamsearch__gb_estimators = [20],
)

grid = GridSearchCV(estimator = pipe, param_grid = boosting_params, scoring = "f1_macro")
grid.fit(X_train, y_train)

In [None]:
# Macro-averaged F1 of the grid search's predictions on the held-out test split.
y_pred = grid.predict(X_test)
f1_score(y_test, y_pred, average = "macro")

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Cross-validated score (f1_macro, per the `scoring` argument) of the selected combination.
grid.best_score_

In [None]:
# Translate the selected feature positions back into column names and list them.
features_indexes = grid.best_estimator_["beamsearch"].best_features
cols = X.columns.values.tolist()
selected_names = [cols[position] for position in features_indexes]
for name in selected_names:
    print(name)