In [26]:
# Importacion de paquetes necesarios
import pandas as pd
import numpy as np
import csv
import os
from pathlib import Path
import datetime
from datetime import date
from imblearn.over_sampling import SMOTE
import random


from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from hyperopt import hp, fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import Trials
from timeit import default_timer as timer
import ml_metrics   

pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


#### Exportamos el dataset limpio

In [27]:
df = pd.read_csv("../data_predictive/clean_sold.csv", index_col=0)


#### Seteamos la Seed

In [28]:
#### Apparently you may use different seed values at each stage
seed_value= 0
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
os.environ['PYTHONHASHSEED']=str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# 4. Set the `tensorflow` pseudo-random generator at a fixed value

#### Hacemos un subsampleo de la clase mayoritaria para balancear las proporciones de cada categoria a predecir.

In [4]:
y = df["sold_quantity"]
X = df.drop(["sold_quantity"],axis=1)

smote = SMOTE()
X_sm, y_sm = smote.fit_sample(X, y)

X_sm_sample = X_sm.sample(frac=0.3)
y_sm_sample = y_sm[X_sm_sample.index]

X_train, X_test, y_train, y_test = train_test_split(X_sm_sample, y_sm_sample, train_size=0.7)

In [5]:
exp_name = 'test_exp'
folder = 'Resultados/' + exp_name
my_file = Path(folder)
if os.path.exists(my_file):
    print('already exists')
else:
    os.makedirs(folder)

out_file = folder + '/overrf_results.csv'
of_connection = open(out_file, 'w')

already exists


In [22]:
space = {"criterion" : hp.choice('criterion', ["gini", "entropy"]),
     "max_features"  : hp.choice('max_features', ['sqrt','log2',0.2,0.5,0.8]),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'n_estimators' : hp.choice('n_estimators', range(50,700,50)),
    'bootstrap' : hp.choice('bootstrap',[True,False]),
    'gamma' : hp.quniform('gamma', 0, 0.5, 0.01),
    'max_depth' : hp.choice('max_depth', range(5, 300, 30)),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01)
    }


In [23]:
#Objective function that hyperopt will minimize
from timeit import default_timer as timer
import ml_metrics   

def objective(params):
    start = timer()
    print ('Params testing: ', params)
    print ('\n ')
    
    n_estimators = params["n_estimators"]
    max_depth = params["max_depth"]
    bootstrap = params["bootstrap"]
    criterion = params["criterion"]
    max_features = params["max_features"]

    model = RandomForestClassifier(n_estimators= n_estimators, max_depth=max_depth,bootstrap=bootstrap,
                                 criterion=criterion,max_features=max_features)
#     model.fit(X_train,y_train)
    #predict the test set 
#     predictions = model.predict(X_test)
#     acc = accuracy_score(y_test, predictions)
    skf = StratifiedKFold(n_splits=3, shuffle=True)
    fscore = cross_val_score(estimator = model, X = X_train, y = y_train, cv = skf)
    CrossValMean = fscore.mean()
    print("Final CrossValMean: ", CrossValMean)
    return{'loss':1-CrossValMean, 'status': STATUS_OK }
    
    score = 1-CrossValMean
    run_time = timer() - start
    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([CrossValMean, params, score, run_time])
    of_connection.close()
    print('Test accuracy:', CrossValMean)
 
    return {'loss': score,  'status': STATUS_OK, 'train_time': run_time,}



In [25]:
tpe_algorithm = tpe.suggest
bayes_trials = Trials()
trials = Trials()

# Run optimization
best = fmin(fn = objective, space = space, algo = tpe.suggest, 
            max_evals = 5, trials = bayes_trials,
            verbose = 1, rstate= np.random.RandomState(50))


Params testing:                                                                                                        
{'bootstrap': True, 'colsample_bytree': 0.55, 'criterion': 'gini', 'gamma': 0.4, 'max_depth': 275, 'max_features': 'log2', 'n_estimators': 550, 'subsample': 1.0}
Final CrossValMean:                                                                                                    
0.849655605990642                                                                                                      
Params testing:                                                                                                        
{'bootstrap': True, 'colsample_bytree': 0.48, 'criterion': 'gini', 'gamma': 0.06, 'max_depth': 215, 'max_features': 'log2', 'n_estimators': 500, 'subsample': 0.87}
Final CrossValMean:                                                                                                    
0.849277673920009                                                         

In [30]:
best_overf= {'bootstrap': False,'criterion': 'gini','max_depth': 35, 'max_features': 0.2, 'n_estimators': 550}