# Ordinal and Monotonic Classification

Alumno: Ignacio Sánchez Herrera

## Ejercicio 1
### Multiple Model for Ordinal Classification

En este apartado compararemos tres modelos distintos empleando
el enfoque Múltiple Ordinal y sin emplearlo.

In [5]:
# Imports
from sklearn.base import ClassifierMixin, BaseEstimator, clone
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
# Clase para transformar cualquier modelo en un clasificador ordinal.
class MultipleModelOrdinalClassifier(BaseEstimator,ClassifierMixin):
    # I inherit from these classes so that the class is compatible with the sklearn API.
    def __init__(self, classifier):
        """
        Parameters
        ----------
        :classifier: classifier from which to build the model
        """
        self.classifier = classifier
        self.fitted_classifiers = []

    def _fit(self,X,y):
        #We keep the labels and put them in order
        self.labels = np.sort(y.unique())

        #Iterate through all the labels except the last one
        for i,label in enumerate(self.labels[:-1]):
            smaller_labels = self.labels[:i+1] #We make the set of labels <= than the current one
            greater_labels = self.labels[i+1:] #Makes the set of labels > than the current one

            #We build two dictionaries to replace the label values in the dataset.
            smaller_replacements = { label:0 for label in smaller_labels} 
            greater_replacements = { label:1 for label in greater_labels} 
            smaller_replacements.update(greater_replacements)

            y_i = y.replace(smaller_replacements)

            #We create a classifier and fit it to the dataset on the replaced labels.
            classifier = clone(self.classifier)
            classifier.fit(X,y_i)
            #We put the classifier in the fitter_classifiers
            self.fitted_classifiers.append(classifier)
    
    def _predict(self,X):
        predictions_greater = []
        
        #For each classifier we draw predictions in the form of probabilities.
        for cl in self.fitted_classifiers:
            prediction = cl.predict_proba(X)[:,1] #We will stick with the second probability, i.e. with P(target > Vi)
            predictions_greater.append(prediction)

        #We compute the probabilities of the first and last class
        primera = 1 - predictions_greater[0] #first
        ultima = predictions_greater[-1] # last
        probabilidades_clase = [primera]
        #We calculate the remaining probabilities 
        for i,pred in enumerate(predictions_greater):
            if i != 0 and i != len(predictions_greater): #We avoid going through first and last
                prob_i = predictions_greater[i-1]*(1-predictions_greater[i])
                probabilidades_clase.append(prob_i)

        #We insert the last probability so that they are in order in the array.
        probabilidades_clase.append(ultima)

        #We take the index of the probability with the highest value
        predictions = np.argmax(probabilidades_clase,axis=0)
        #We replace the indexes with the actual value of the label
        final_preds = pd.Series(predictions).replace({i:label for i,label in enumerate(self.labels)})

        return final_preds

    def get_modelos(self):
        return self.fitted_classifiers

    def fit(self, X, y):
        """
        Fit the model to the data
        Parameters
        ----------
        X predictor variables
        y predictor variable
        """
        self._fit(X,y)
    def predict(self, X) -> pd.Series:
        """
        Makes a prediction on a set of predictor variables
        Parameters
        ----------
        X predictor variables

        Returns
        ----------
        labels labels predicted by the model
        """
        return self._predict(X)

#### Carga de datos

In [7]:
data = pd.read_csv("rating.csv", delimiter=";", decimal=",")

In [8]:
X = data.iloc[:, :-1]
y = data["RATE"]

#### Comparación

In [11]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_absolute_error, zero_one_loss 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

results = {
    "model": [],
    "MZE": [],
    "MAE": []
}

# Create instances of the classifiers
classifier1 = RandomForestClassifier()
classifier2 = SVC(probability=True)  # SVC needs probability estimates
classifier3 = MLPClassifier((200, 100, 39))

# Create instances of MultipleModelOrdinalClassifier
ordinal_classifier1 = MultipleModelOrdinalClassifier(classifier1)
ordinal_classifier2 = MultipleModelOrdinalClassifier(classifier2)
ordinal_classifier3 = MultipleModelOrdinalClassifier(classifier3)

# Compare results without using MultipleModelOrdinalClassifier
predictions1 = cross_val_predict(classifier1, X, y, cv=10)
mze1 = zero_one_loss(y, predictions1)
mae1 = mean_absolute_error(y, predictions1)

results['model'].append('RandomForest')
results['MZE'].append(mze1)
results['MAE'].append(mae1)

predictions2 = cross_val_predict(classifier2, X, y, cv=10)
mze2 = zero_one_loss(y, predictions2)
mae2 = mean_absolute_error(y, predictions2)
results['model'].append('SVC')
results['MZE'].append(mze2)
results['MAE'].append(mae2)

predictions3 = cross_val_predict(classifier3, X, y, cv=10)
mze3 = zero_one_loss(y, predictions3)
mae3 = mean_absolute_error(y, predictions3)
results['model'].append('MLP')
results['MZE'].append(mze3)
results['MAE'].append(mae3)

# Compare results using MultipleModelOrdinalClassifier
ordinal_predictions1 = cross_val_predict(ordinal_classifier1, X, y, cv=10)
ordinal_mze1 = zero_one_loss(y, ordinal_predictions1)
ordinal_mae1 = mean_absolute_error(y, ordinal_predictions1)
results['model'].append('MMOC - RandomForest')
results['MZE'].append(ordinal_mze1)
results['MAE'].append(ordinal_mae1)

ordinal_predictions2 = cross_val_predict(ordinal_classifier2, X, y, cv=10)
ordinal_mze2 = zero_one_loss(y, ordinal_predictions2)
ordinal_mae2 = mean_absolute_error(y, ordinal_predictions2)
results['model'].append('MMOC - SVC')
results['MZE'].append(ordinal_mze2)
results['MAE'].append(ordinal_mae2)

ordinal_predictions3 = cross_val_predict(ordinal_classifier3, X, y, cv=10)
ordinal_mze3 = zero_one_loss(y, ordinal_predictions3)
ordinal_mae3 = mean_absolute_error(y, ordinal_predictions3)
results['model'].append('MMOC - MLP')
results['MZE'].append(ordinal_mze3)
results['MAE'].append(ordinal_mae3)

df = pd.DataFrame(results)

In [12]:
print(df.to_latex(index=False))

\begin{tabular}{lrr}
\toprule
              model &      MZE &      MAE \\
\midrule
       RandomForest & 0.330502 & 0.363707 \\
                SVC & 0.558301 & 0.795367 \\
                MLP & 0.566795 & 0.736680 \\
MMOC - RandomForest & 0.314286 & 0.335135 \\
         MMOC - SVC & 0.579923 & 0.641699 \\
         MMOC - MLP & 0.547490 & 0.681853 \\
\bottomrule
\end{tabular}



Podemos ver que en los tres modelos usados, el MAE mejora al emplear el enfoque 
Multiple Ordinal, sin embargo el MZE (1-accuracy) es ligeramente menor en Random 
Forest y SVC.

Esto se debe a que el error disminuye en todos los modelos debido a que las
predicciones erroneas se acercan más a la predición real teniendo en cuenta
el orden, por lo que se obtiene un valor de MAE menor. Sin embargo, el número de 
elementos bien clasificados es similar (ligeramente inferior en RF y SVC y superiro
en MLP), por lo que no hay grandes variaciones en el MZE.

En este caso concreto el resultado es beneficioso debido a que, a pesar de que
falle/acierte en un número similar de instancias, el error que comete es menor,
es decir, la distancia entre la clase real y la predicha es menor,
y no es lo mismo clasificar un crédito bancario con valor 1 cuando realmente es 
4 que clasificarlo como 3.


## Ejercicio 2
### Monotonic Regression with XGBoost

In [50]:
results = {
    'model': [],
    'MZE': [],
    'MAE': []
}

### Standard XGBoost
Aplicamos regresión monotónica usando la versión estándar de XGBoost

In [51]:
from xgboost import XGBClassifier

Transformamos las clases porque XGBClassifier espera que las clases
sean de 0 a n-1 donde n es el número de clases, pero nuestras
clases toman valores de 1 a n.

In [52]:
y_xgb = y - 1

Puesto que se especifica que todos los valores de entrada tienen una relación
monótona inversa al valor de la clase, lo especificamos en las restricciones de
monotonicidad

In [53]:

# Monotonicity
# 0: without constrains, 1: positive, -1: negative
feature_monotones = [-1] * (X.shape[1])

In [54]:

# Hiperparámetros de XGBoost
params = {'max_depth': 2,
          'eta': 0.1,
          'nthread': 2,
          'seed': 0,
          'monotone_constraints': '(' + ','.join([str(m) for m in feature_monotones]) + ')'
         }

clf = XGBClassifier(**params)

# Validación cruzada 10-fold
predictions_xgb = cross_val_predict(clf, X, y_xgb, cv=10)

# Cálculo de las métricas de error
xgb_mze = zero_one_loss(y_xgb, predictions_xgb)
xgb_mae = mean_absolute_error(y_xgb, predictions_xgb)

results['model'].append('XGBClassifier')
results['MZE'].append(xgb_mze)
results['MAE'].append(xgb_mae)

pd.DataFrame(results)

Unnamed: 0,model,MZE,MAE
0,XGBClassifier,0.318919,0.349035


### Monotonic Regression with OVA XGBoost

Implementación de la versión OVA de XGBoost. 

Se ha usado la implementación 
proporcionada y se ha adaptado ligeramente para permitir el paso de parámetros al 
clasificador.

In [55]:
# Implementación de la versión OVA de XGBoost
class XGBMonotonicClassifier(BaseEstimator,ClassifierMixin): 
    # I inherit from these classes so that the class is compatible with the sklearn API.
    def __init__(self, *, xgb_params=None):
        self.xgboosters = []
        self.xgb_params = xgb_params 

    def _fit(self,X,y):
        #We keep the labels and put them in order
        self.labels = np.sort(y.unique())
        #Iterate through all the labels except the last one
        for i,label in enumerate(self.labels[:-1]):
            smaller_labels = self.labels[:i+1] #We make the set of tags <= than the current one
            greater_labels = self.labels[i+1:] #Makes the set of labels > than the current one

            #We build two dictionaries to replace the label values in the dataset.
            smaller_replacements = { label:0 for label in smaller_labels} 
            greater_replacements = { label:1 for label in greater_labels} 
            smaller_replacements.update(greater_replacements)

            y_i = y.replace(smaller_replacements)

            #We create an xgb classifier and fit it to the dataset on the replaced tags.
            #We place monotonic constraints on all variables with monotone_constraints = "1"
            classifier = XGBClassifier(**self.xgb_params)
            classifier.fit(X,y_i)
            #We put the classifier in xgboosters
            self.xgboosters.append(classifier)

    def _predict(self,X):
        predictions = np.zeros(X.shape[0]) #We initialise the array of predictions with everything 0s
        for xgboost in self.xgboosters:
            #For each xgboost we make a prediction that will give 0 or 1
            predictions += xgboost.predict(X) #We add up the prediction of the examples which will be 0 or 1
            
        #Replace the indexes of the labels with the actual labels
        final_preds = pd.Series(predictions).replace({i:label for i,label in enumerate(self.labels)})

        return final_preds


    def fit(self,X, y):
        """
        Fit the model to the data
        Parameters
        ----------
        X predictor variables
        y predictor variable
        """
        self._fit(X,y)

    def predict(self,X):
        """
        Makes a prediction on a set of predictor variables
        Parameters
        ----------
        X predictor variables

        Returns
        ----------
        labels labels predicted by the model
        """
        return self._predict(X)

In [56]:

# Hiperparámetros de XGBoost
params = {'max_depth': 2,
          'eta': 0.1,
          'nthread': 2,
          'seed': 0,
          'monotone_constraints': '(' + ','.join([str(m) for m in feature_monotones]) + ')'
         }

clf = XGBMonotonicClassifier(xgb_params=params)

# Validación cruzada 10-fold
predictions_xgb = cross_val_predict(clf, X, y_xgb, cv=10)

# Cálculo de las métricas de error
xgb_mze = zero_one_loss(y_xgb, predictions_xgb)
xgb_mae = mean_absolute_error(y_xgb, predictions_xgb)

results['model'].append('XGBClassifier - OVA')
results['MZE'].append(xgb_mze)
results['MAE'].append(xgb_mae)

pd.DataFrame(results)


Unnamed: 0,model,MZE,MAE
0,XGBClassifier,0.318919,0.349035
1,XGBClassifier - OVA,0.366023,0.392278


En este caso vemos que el algoritmo XGBoost estándar nos da un mejor 
resultado que la aproximación One-Versus-All en ambas métricas.

## Monotonic Regression with LightGBM

In [57]:
results = {
    'model': [],
    'MZE': [],
    'MAE': []
}

### Standard LightGBM

In [58]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_predict

In [59]:
clf = LGBMClassifier(monotone_constraints=feature_monotones, verbosity=-1)
predictions_lgbm = cross_val_predict(clf, X, y, cv=10)

lgbm_mze = zero_one_loss(y, predictions_lgbm)
lgbm_mae = mean_absolute_error(y, predictions_lgbm)

results['model'].append('LightGBM')
results['MZE'].append(lgbm_mze)
results['MAE'].append(lgbm_mae)

pd.DataFrame(results)

Unnamed: 0,model,MZE,MAE
0,LightGBM,0.30888,0.337452


### OVA LightGBM 

In [60]:

# Implementación de la versión OVA de LightGBM
class LightGBMonotonicClassifier(BaseEstimator,ClassifierMixin): 
    # I inherit from these classes so that the class is compatible with the sklearn API.
    def __init__(self, *, lgbm_params=None):
        self.lightgbms = []
        self.lgbm_params = lgbm_params 

    def _fit(self,X,y):
        #We keep the labels and put them in order
        self.labels = np.sort(y.unique())
        #Iterate through all the labels except the last one
        for i,label in enumerate(self.labels[:-1]):
            smaller_labels = self.labels[:i+1] #We make the set of tags <= than the current one
            greater_labels = self.labels[i+1:] #Makes the set of labels > than the current one

            #We build two dictionaries to replace the label values in the dataset.
            smaller_replacements = { label:0 for label in smaller_labels} 
            greater_replacements = { label:1 for label in greater_labels} 
            smaller_replacements.update(greater_replacements)

            y_i = y.replace(smaller_replacements)

            #We create an lgbm classifier and fit it to the dataset on the replaced tags.
            #We place monotonic constraints on all variables with monotone_constraints = "1"
            classifier = LGBMClassifier(**self.lgbm_params)
            classifier.fit(X,y_i)
            #We put the classifier in xgboosters
            self.lightgbms.append(classifier)

    def _predict(self,X):
        predictions = np.zeros(X.shape[0]) #We initialise the array of predictions with everything 0s
        for lgbm in self.lightgbms:
            #For each lightgbm we make a prediction that will give 0 or 1
            predictions += lgbm.predict(X) #We add up the prediction of the examples which will be 0 or 1
            
        #Replace the indexes of the labels with the actual labels
        final_preds = pd.Series(predictions).replace({i:label for i,label in enumerate(self.labels)})

        return final_preds


    def fit(self,X, y):
        """
        Fit the model to the data
        Parameters
        ----------
        X predictor variables
        y predictor variable
        """
        self._fit(X,y)

    def predict(self,X):
        """
        Makes a prediction on a set of predictor variables
        Parameters
        ----------
        X predictor variables

        Returns
        ----------
        labels labels predicted by the model
        """
        return self._predict(X)

In [61]:
lgbm_params = {
    'monotene_constraints': feature_monotones,
    'verbosity': -1
}

clf = LightGBMonotonicClassifier(lgbm_params=lgbm_params)
predictions_lgbm = cross_val_predict(clf, X, y, cv=10)

lgbm_mze = zero_one_loss(y, predictions_lgbm)
lgbm_mae = mean_absolute_error(y, predictions_lgbm)

results['model'].append('LightGBM - OVA')
results['MZE'].append(lgbm_mze)
results['MAE'].append(lgbm_mae)

pd.DataFrame(results)

Unnamed: 0,model,MZE,MAE
0,LightGBM,0.30888,0.337452
1,LightGBM - OVA,0.272587,0.288803


En este caso, al contrario que con XGBoost vemos una mejora significativa
de la versión One-Versus-All frente a la estándar aplicando las mismas
restricciones de monotonía en ambas. 