In [1]:
# Importacion de paquetes necesarios
import pandas as pd
import numpy as np
import csv
import os
from pathlib import Path
import datetime
from datetime import date
from imblearn.over_sampling import SMOTE
import random


from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import Trials
from timeit import default_timer as timer
import ml_metrics   

pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


#### Exportamos el dataset limpio

In [11]:
df = pd.read_csv("../data_predictive/clean_sold.csv", index_col=0)


#### Seteamos la Seed

In [16]:
#### Apparently you may use different seed values at each stage
seed_value= 0
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
os.environ['PYTHONHASHSEED']=str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# 4. Set the `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed_value)

#### Partición al 70%, previamente aplicamos SMOTE a todos el dataset para crear registros sinteticos de las clases minoritarias y balancear las clases a predecir. Luego tomamos una muestra del 30% con 70000 registros 

In [13]:
y = df["sold_quantity"]
X = df.drop(["sold_quantity"],axis=1)

smote = SMOTE()
X_sm, y_sm = smote.fit_sample(X, y)

X_sm_sample = X_sm.sample(frac=0.3)
y_sm_sample = y_sm[X_sm_sample.index]

X_train, X_test, y_train, y_test = train_test_split(X_sm_sample, y_sm_sample, train_size=0.7,random_state = seed_value)

In [14]:
exp_name = 'test_exp'
folder = 'Resultados/' + exp_name
my_file = Path(folder)
if os.path.exists(my_file):
    print('already exists')
else:
    os.makedirs(folder)

out_file = folder + '/overxgb_results.csv'
of_connection = open(out_file, 'w')

already exists


#### Espacio de busqueda de los hiperparametros del XGB

In [15]:
space = {
    'n_estimators' : hp.choice('n_estimators', range(50,1000,25)),
    'max_depth' : hp.choice('max_depth', range(5, 300, 5)),
    'alpha' : hp.choice('alpha', range(0,20)),
    'gamma' : hp.quniform('gamma', 0, 0.5, 0.01),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.7, 0.01),
    'objective' : 'multi:softmax'
    }


#### Función que utilizara la hiperoptimización bayesiana aplicando un 5fold CV

In [7]:
#Objective function that hyperopt will minimize



def objective(params):
    start = timer()
    print ('Params testing: ', params)
    print ('\n ')

    model = XGBClassifier(params)
    model.fit(X_train,y_train)
    #predict the test set 
#     predictions = model.predict(X_test)
#     acc = accuracy_score(y_test, predictions)
    skf = StratifiedKFold(n_splits=5, shuffle=True,random_state = seed_value)
    fscore = cross_val_score(estimator = model, X = X_train, y = y_train, cv = skf,scoring="f1_weighted")
    CrossValMean = fscore.mean()
    print("Final CrossValMean: ", CrossValMean)
    return{'loss':1-CrossValMean, 'status': STATUS_OK }
    
    score = 1-CrossValMean
    run_time = timer() - start
    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([CrossValMean, params, score, run_time])
    of_connection.close()
    print('Test accuracy:', CrossValMean)
 
    return {'loss': score,  'status': STATUS_OK, 'train_time': run_time,}



#### Corre Optimización con 20 itereaciones

In [8]:
tpe_algorithm = tpe.suggest
bayes_trials = Trials()
trials = Trials()
# Run optimization
best = fmin(fn = objective, space = space, algo = tpe.suggest, 
            max_evals = 20, trials = bayes_trials,
            verbose = 1, rstate= np.random.RandomState(50))


Params testing:                                                                                                        
{'alpha': 6, 'colsample_bytree': 0.38, 'gamma': 0.4, 'learning_rate': 0.42, 'max_depth': 55, 'n_estimators': 375, 'objective': 'multi:softmax', 'subsample': 1.0}
Final CrossValMean:                                                                                                    
0.7722048637838638                                                                                                     
Params testing:                                                                                                        
{'alpha': 15, 'colsample_bytree': 0.15, 'gamma': 0.45, 'learning_rate': 0.36, 'max_depth': 275, 'n_estimators': 275, 'objective': 'multi:softmax', 'subsample': 0.87}
Final CrossValMean:                                                                                                    
0.7723563170169495                                                      

#### Los mejores hiperparametros

In [18]:
best_overxgb={'alpha': 4, 'colsample_bytree': 0.84, 'gamma': 0.43, 'learning_rate': 0.13, 'max_depth': 125, 'n_estimators': 700, 'objective': 'multi:softmax', 'subsample': 0.6}