In [4]:
import pandas as pd
import numpy as np
import warnings
import sklearn
from tpot import TPOTClassifier, TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, zero_one_loss, mean_absolute_error
from sklearn.base import ClassifierMixin, BaseEstimator, clone
warnings.simplefilter(action='ignore', category=FutureWarning)

# Clasificación

## Lectura y preparación de los datos

In [5]:
data = pd.read_csv("data/rating.csv", delimiter=";", decimal=",")
X = data.iloc[:, :-1]
y = data["RATE"]
y=y-1

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

## Algunos métodos y variables

In [7]:
results = {
    "model": [],
    "MZE": [],
    "MAE": []
}

# Clase para transformar cualquier modelo en un clasificador ordinal.
class MultipleModelOrdinalClassifier(BaseEstimator,ClassifierMixin):

    # I inherit from these classes so that the class is compatible with the sklearn API.
    def __init__(self, classifier):
        """
        Parameters
        ----------
        :classifier: classifier from which to build the model
        """
        self.classifier = classifier
        self.fitted_classifiers = []

    def _fit(self,X,y):
        #We keep the labels and put them in order
        self.labels = np.sort(y.unique())

        #Iterate through all the labels except the last one
        for i,label in enumerate(self.labels[:-1]):
            smaller_labels = self.labels[:i+1] #We make the set of labels <= than the current one
            greater_labels = self.labels[i+1:] #Makes the set of labels > than the current one

            #We build two dictionaries to replace the label values in the dataset.
            smaller_replacements = { label:0 for label in smaller_labels} 
            greater_replacements = { label:1 for label in greater_labels} 
            smaller_replacements.update(greater_replacements)

            y_i = y.replace(smaller_replacements)

            #We create a classifier and fit it to the dataset on the replaced labels.
            classifier = clone(self.classifier)
            classifier.fit(X,y_i)
            #We put the classifier in the fitter_classifiers
            self.fitted_classifiers.append(classifier)
    
    def _predict(self,X):
        predictions_greater = []
        
        #For each classifier we draw predictions in the form of probabilities.
        for cl in self.fitted_classifiers:
            prediction = cl.predict_proba(X)[:,1] #We will stick with the second probability, i.e. with P(target > Vi)
            predictions_greater.append(prediction)

        #We compute the probabilities of the first and last class
        primera = 1 - predictions_greater[0] #first
        ultima = predictions_greater[-1] # last
        probabilidades_clase = [primera]
        #We calculate the remaining probabilities 
        for i,pred in enumerate(predictions_greater):
            if i != 0 and i != len(predictions_greater): #We avoid going through first and last
                prob_i = predictions_greater[i-1]*(1-predictions_greater[i])
                probabilidades_clase.append(prob_i)

        #We insert the last probability so that they are in order in the array.
        probabilidades_clase.append(ultima)

        #We take the index of the probability with the highest value
        predictions = np.argmax(probabilidades_clase,axis=0)
        #We replace the indexes with the actual value of the label
        final_preds = pd.Series(predictions).replace({i:label for i,label in enumerate(self.labels)})

        return final_preds

    def get_modelos(self):
        return self.fitted_classifiers

    def fit(self, X, y):
        """
        Fit the model to the data
        Parameters
        ----------
        X predictor variables
        y predictor variable
        """
        self._fit(X,y)
    def predict(self, X) -> pd.Series:
        """
        Makes a prediction on a set of predictor variables
        Parameters
        ----------
        X predictor variables

        Returns
        ----------
        labels labels predicted by the model
        """
        return self._predict(X)
    
def evaluate(model, name=""):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mze = zero_one_loss(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results['model'].append(name)
    results['MZE'].append(mze)
    results['MAE'].append(mae)

## Prueba de diferentes algoritmos de clasificación

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


# Create instances of the classifiers
rf = RandomForestClassifier()
svc = SVC(probability=True)  # SVC needs probability estimates
mlp = MLPClassifier((200, 100, 39))

# Create instances of MultipleModelOrdinalClassifier
mmoc_rf = MultipleModelOrdinalClassifier(rf)
mmoc_svc = MultipleModelOrdinalClassifier(svc)
mmoc_mlp = MultipleModelOrdinalClassifier(mlp)

classifiers = [
    {'model': rf, 'name': "Random Forest"},
    {'model': svc, 'name': "Support Vector Machine"},
    {'model': mlp, 'name': "Multi-Layer Perceptron"},
    {'model': mmoc_rf, 'name': "MMOC - Random Forerst"},
    {'model': mmoc_svc, 'name': "MMOC - Support Vector Machine"},
    {'model': mmoc_mlp, 'name': "MMOC - Multi-Layer Perceptron"}
]

for c in classifiers:
    evaluate(**c)

# Clasificación con TPOT

In [9]:
tpot = TPOTClassifier(generations=10, population_size=30, verbosity=2, random_state=42)

In [10]:
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/330 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.6797409463388846

Generation 2 - Current best internal CV score: 0.6910705789056305

Generation 3 - Current best internal CV score: 0.6910705789056305

Generation 4 - Current best internal CV score: 0.692101506740682

Generation 5 - Current best internal CV score: 0.6972508591065292

Generation 6 - Current best internal CV score: 0.7003383558022733

Generation 7 - Current best internal CV score: 0.7023896378535553

Generation 8 - Current best internal CV score: 0.7023896378535553

Generation 9 - Current best internal CV score: 0.7023896378535553

Generation 10 - Current best internal CV score: 0.7023896378535553

Best pipeline: GradientBoostingClassifier(MaxAbsScaler(input_matrix), learning_rate=0.01, max_depth=7, max_features=1.0, min_samples_leaf=20, min_samples_split=16, n_estimators=100, subsample=0.7500000000000001)


In [11]:
y_pred = tpot.predict(X_test)

In [12]:
mze_tpot = zero_one_loss(y_test, y_pred)
mae_tpot = mean_absolute_error(y_test, y_pred)

In [13]:
results['model'].append('TPOT')
results['MZE'].append(mze_tpot)
results['MAE'].append(mae_tpot)

## Resultados

In [14]:
pd.DataFrame(results)

Unnamed: 0,model,MZE,MAE
0,Random Forest,0.320988,0.339506
1,Support Vector Machine,0.518519,0.740741
2,Multi-Layer Perceptron,0.601852,0.820988
3,MMOC - Random Forerst,0.330247,0.348765
4,MMOC - Support Vector Machine,0.592593,0.641975
5,MMOC - Multi-Layer Perceptron,0.589506,0.669753
6,TPOT,0.274691,0.311728


# Regresión

## Lectura y preparación de datos

In [9]:
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data/housing.csv')
X = df.iloc[:, 1:]
y = df['price']

def yesno_to_binary(df, cols):
    for c in cols:
        df[c] = df[c].replace({
            'yes': 1,
            'no': 0
        })
    return df

X = yesno_to_binary(X, [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea'
])

X = pd.concat([X, pd.get_dummies(X['furnishingstatus'], drop_first=True)], axis=1)
X = X.drop('furnishingstatus', axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Prueba de diferentes métodos de regresión

In [12]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import xgboost as xgb

In [19]:
model = {
    'Random Forest Regressor' : RandomForestRegressor(),
    'Gradient Boost Regressor' : GradientBoostingRegressor(),
    'XGBoost' : xgb.XGBRegressor(),            
    'XGRF Regressor' : xgb.XGBRFRegressor(),   
    'Support Vector regressor' : SVR(),
    'Lasso Reg' : Lasso(),
    'Ridge Reg' : Ridge(),
    'LGBM Reg' : LGBMRegressor(verbosity=-1),
    'Cat Boost' : CatBoostRegressor(verbose=0)
}

In [20]:
pred ={}

for name, model in model.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pred[name]=y_pred

In [25]:
from sklearn.metrics import mean_squared_error
results = {
    "model": [],
    "MSE": []
}
for name, y_pred in pred.items():
    mse = mean_squared_error(y_test, y_pred)
    results["model"].append(name)
    results["MSE"].append(mse)
pd.DataFrame(results)

Unnamed: 0,model,MSE
0,Random Forest Regressor,1468585000000.0
1,Gradient Boost Regressor,1301186000000.0
2,XGBoost,1694614000000.0
3,XGRF Regressor,1584954000000.0
4,Support Vector regressor,2888849000000.0
5,Lasso Reg,1276226000000.0
6,Ridge Reg,1274851000000.0
7,LGBM Reg,1559223000000.0
8,Cat Boost,1278838000000.0


## Uso de TPOT para regresión

In [27]:
tpot = TPOTRegressor(generations=10, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/550 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -1147032108010.7031

Generation 2 - Current best internal CV score: -1147032108010.7031

Generation 3 - Current best internal CV score: -1139591114326.7454

Generation 4 - Current best internal CV score: -1139591114326.7454

Generation 5 - Current best internal CV score: -1139591114326.7454

Generation 6 - Current best internal CV score: -1139591114326.7454

Generation 7 - Current best internal CV score: -1113749470450.175

Generation 8 - Current best internal CV score: -1113749470450.175

Generation 9 - Current best internal CV score: -1113749470450.175

Generation 10 - Current best internal CV score: -1108149754809.8132

Best pipeline: RandomForestRegressor(RidgeCV(input_matrix), bootstrap=True, max_features=0.7500000000000001, min_samples_leaf=7, min_samples_split=13, n_estimators=100)


In [28]:
y_pred = tpot.predict(X_test)

In [29]:
results["model"].append('TPOT')
results["MSE"].append(mean_squared_error(y_test, y_pred))

## Resultados obtenidos

In [30]:
pd.DataFrame(results)

Unnamed: 0,model,MSE
0,Random Forest Regressor,1468585000000.0
1,Gradient Boost Regressor,1301186000000.0
2,XGBoost,1694614000000.0
3,XGRF Regressor,1584954000000.0
4,Support Vector regressor,2888849000000.0
5,Lasso Reg,1276226000000.0
6,Ridge Reg,1274851000000.0
7,LGBM Reg,1559223000000.0
8,Cat Boost,1278838000000.0
9,TPOT,1272120000000.0
