In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from df_utils import get_companies_list, get_X_y
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, \
                            precision_score, accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_selection import SelectKBest

#### Load the dataframe and companies with not too many nans

In [2]:
df, companies = get_companies_list(2)
# learner history parameters
nhist = 10
nfut = 2
totHist = int(3*365)

comp_dict = {}
for i, comp in enumerate(companies):
    comp_dict[comp] = get_X_y(df, comp, nfut, nhist, totHist)

#### Drop some columns:

In [3]:
X_orig, y, ysim = comp_dict[comp]
print(X_orig.columns.values)
print(y.columns.values)
print(ysim.columns.values)

include = ['c_oend_%', 'c_slow_%', 'c_shigh_%', 'change_Me', 'offer_sell_', 'offer_buy_'] #['c_shigh_', 'c_slow_', 'offer_sell_', 'offer_buy_', 'sales_low_', 'sales_high_']
cols_keep = [col for col in X_orig.columns if col[:-4] in include] 
y_cols = ['offer_end_change', 'sale_low_change', 'sale_high_change']

['offer_end_-01' 'offer_end_-02' 'offer_end_-03' 'offer_end_-04'
 'offer_end_-05' 'offer_end_-06' 'offer_end_-07' 'offer_end_-08'
 'offer_end_-09' 'offer_end_-10' 'offer_buy_-01' 'offer_buy_-02'
 'offer_buy_-03' 'offer_buy_-04' 'offer_buy_-05' 'offer_buy_-06'
 'offer_buy_-07' 'offer_buy_-08' 'offer_buy_-09' 'offer_buy_-10'
 'offer_sell_-01' 'offer_sell_-02' 'offer_sell_-03' 'offer_sell_-04'
 'offer_sell_-05' 'offer_sell_-06' 'offer_sell_-07' 'offer_sell_-08'
 'offer_sell_-09' 'offer_sell_-10' 'sales_low_-01' 'sales_low_-02'
 'sales_low_-03' 'sales_low_-04' 'sales_low_-05' 'sales_low_-06'
 'sales_low_-07' 'sales_low_-08' 'sales_low_-09' 'sales_low_-10'
 'sales_high_-01' 'sales_high_-02' 'sales_high_-03' 'sales_high_-04'
 'sales_high_-05' 'sales_high_-06' 'sales_high_-07' 'sales_high_-08'
 'sales_high_-09' 'sales_high_-10' 'change_Me_-01' 'change_Me_-02'
 'change_Me_-03' 'change_Me_-04' 'change_Me_-05' 'change_Me_-06'
 'change_Me_-07' 'change_Me_-08' 'change_Me_-09' 'change_Me_-10'
 'c_s

### Plot pictures:

In [4]:
def plot_sim(comp, ysim, ndays, *args):
    
    
    plt.figure(figsize = (12,6))
    x = pd.to_datetime(ysim.index, format = '%d.%m.%Y')[:ndays]
    yl = ysim['sales_low_000'].values[:ndays]
    yh = ysim['sales_high_000'].values[:ndays]
    #y = ysim['offer_end_prev'].values[:ndays]
    
    #plt.plot_date(x,y, linestyle = '-', marker = None)
    plt.fill_between(x,yl,yh, linestyle = '-')
    plt.ylabel('Sales lowest to highest filled')
    #[plt.gca().axvline(xi, alpha = .1) for xi in x]
    
    if len(args) == 3:
        pred, truth, col = [arg[:ndays] for arg in args]
        
        correct = pred & truth
        fp = pred & np.logical_not(truth)
        
        plt.scatter(x[correct], yh[correct], s = 40, alpha = .7, 
                    c = 'green', label = col + ' correct')
        plt.scatter(x[truth], yh[truth], s = 20, c = 'red', alpha = .2,
                    label = col + ' true')
        
    else:    
        plt.twinx()
        y2 = ysim['offer_end_change']*100
        plt.plot_date(x,y2, linestyle = '-', marker = None, color = 'red')
        plt.ylabel('Offer end percentage change')
    
    plt.legend(frameon = False)
    plt.title(comp, fontsize = 14)
    plt.show()
   

### Make estimator pipe

In [5]:
pipe_rf = Pipeline([('pol', PolynomialFeatures(degree = 2, interaction_only = True)),
                    ('var', VarianceThreshold()),
                    ('sel', SelectKBest()),
                    ('rf', RandomForestClassifier())]) 

params_rf = [{'sel__k': np.arange(40,100,10),
              'rf__max_features': np.arange(5,30,5),
              'rf__max_depth': [5,10,20],
              'rf__n_estimators': [20,50,100]}]

pipe_gbm = Pipeline([('pol', PolynomialFeatures(degree = 2, interaction_only = True)),
                     ('var', VarianceThreshold()),
                     ('sel', SelectKBest()),
                     ('gbm', GradientBoostingClassifier())]) 

params_gbm = [{'sel__k': np.arange(20,50,10),
               'gbm__learning_rate': np.linspace(.05,.4, 8),
               'gbm__max_depth': [3,5],
               'gbm__n_estimators': np.arange(50, 200, 25) }]


def get_pipe(key):
    if key == 'rf':
        return pipe_rf, params_rf
    elif key == 'gbm':
        return pipe_gbm, params_gbm


### Make custom fit for each company:

In [6]:
def comp_estimator(comp, y, threshold, pipe, params, ntest = 50, metric = 'roc_auc'):
    
    splitter = StratifiedKFold(n_splits = 5, shuffle = True) #, random_state = 0)
    X,_,ysim = comp_dict[comp]
    X = X[cols_keep]
    
    y_bin = threshold < y
    
    X_test, y_test = X[:ntest], y_bin[:ntest]
    X_train, y_train = X[ntest:], y_bin[ntest:]
    Xy = [X_train, X_test, y_train, y_test, ysim, y_bin]
    
    
    grid = GridSearchCV(pipe, params, scoring = metric, n_jobs = 5, 
                        cv = splitter, verbose = 1)
    grid.fit(X_train, y_train)
    
    return Xy, grid.best_estimator_, grid.best_params_, grid.best_score_


def get_scores(y_pred_train, y_pred_test, y_pred_train_p, 
               y_pred_test_p, y_train, y_test, show_report = False):
    try:
        roc_auc_test = roc_auc_score(y_test, y_pred_test_p)
    except ValueError as e:
        print(e)
        roc_auc_test = -1
    
    precision_test = precision_score(y_test, y_pred_test)
    
    if show_report:
        print('Train set classification')
        print(classification_report(y_train, y_pred_train, target_names = ['not rise', 'rise']))
        print(confusion_matrix(y_train, y_pred_train))
        print('Train roc_auc = {:.3f}'.format(roc_auc_score(y_train, y_pred_train_p)))
        print('\nTest set classification')
        print(classification_report(y_test, y_pred_test, target_names = ['not rise', 'rise']))
        print(confusion_matrix(y_test, y_pred_test))
        print('Test roc_auc = {:.3f}\n'.format(roc_auc_test))
    
    return roc_auc_test, precision_test

def get_prediction(Xy, estimator, show_report = False):
    
    X_train, X_test, y_train, y_test = Xy[:4]
    y_pred_train = estimator.predict(X_train)
    y_pred_test = estimator.predict(X_test)
    y_pred_train_p = estimator.predict_proba(X_train)[:,1]
    y_pred_test_p = estimator.predict_proba(X_test)[:,1]
    
    
    return y_pred_train, y_pred_test, y_pred_train_p, y_pred_test_p


### Feed the pipe to fit for each company and collect the results

In [7]:
def fit_and_report(companies, key = 'rf', ntest = 100, show = False):
    
    thres = {'offer_end_change':.0, 
             'sale_low_change+1':.0,
             'sale_low_change':.0,
             'sale_high_change':.0}
    
    pipe, params = get_pipe(key)
    
    try:
        results_dict = np.load('opm_params_{}.npy'.format(key)).item()        
    except FileNotFoundError:
        results_dict = {}
    
    for comp in companies:
        y = comp_dict[comp][1]
        for col in y.columns.values:
            try:
                results_dict[comp][col]
            except KeyError:
                
                
                print(comp, ' ', col)

                X,_,ysim = comp_dict[comp]
                X = X[cols_keep]

                Xy, estimator_r, params_opm_r, score_r \
                    = comp_estimator(comp, y[col].values,
                                     thres[col], pipe, params,
                                     ntest = ntest,
                                     metric = 'roc_auc')

                y_pred_train, y_pred_test, y_pred_train_p, y_pred_test_p \
                    = get_prediction(Xy, estimator_r)

                roc_auc, precision = get_scores(y_pred_train, 
                                                y_pred_test, 
                                                y_pred_train_p, 
                                                y_pred_test_p, 
                                                Xy[2], Xy[3], 
                                                show_report = show)

                if show:
                    print(params_opm_r)
                    y_pred_tot = np.concatenate((y_pred_test, 
                                                 y_pred_train))
                    plot_sim(comp, Xy[4], ntest*4, y_pred_tot, 
                             Xy[5], col)


                col_dict = {col: {'roc_auc':roc_auc, 
                                  'precision':precision,
                                  'threshold':thres[col],
                                  'opm_params':params_opm_r}}
                try:
                    results_dict[comp].update(col_dict)
                except KeyError:
                    results_dict[comp] = col_dict
                 
                np.save('opm_params_{}'.format(key), results_dict)
    return results_dict


    

In [8]:
method_key = 'gbm'
results_dict = fit_and_report(companies, method_key, ntest = 70, show = True)

## Then test the obtained results:
### First make the prediction dataframe

In [9]:
# optimal parameters:
opm_params = np.load('opm_params_{}.npy'.format(method_key)).item()


def get_predictions(ntest, comp, col):
    
    X, y, ysim = comp_dict[comp]
    X = X[cols_keep]
    
    y_bin = opm_params[comp][col]['threshold'] < y[col]    
    X_test, y_test = X[:ntest], y_bin[:ntest]
    X_train, y_train = X[ntest:], y_bin[ntest:]
    
    pipe, _ = get_pipe(method_key)
    pipe.set_params(**opm_params[comp][col]['opm_params']) 
    pipe.fit(X_train, y_train)
    
    return pipe.predict(X_test)

def get_prediction_df(ntest):
    idx_tuples = [(comp, col) for comp in companies for col in y.columns]
    index = pd.MultiIndex.from_tuples(idx_tuples)
    
    
    pred_df = pd.DataFrame(index = comp_dict['Alma Media'][0].index[:ntest], columns = index)
    
    for comp in companies:
        ysim = comp_dict[comp][-1]

        for col in y.columns:
            pred_df[(comp, col)] = get_predictions(ntest, comp, col) #np.zeros(ntest)
        
        pred_df[(comp, 'sales_low_prev')] = ysim['sales_low_prev']
        pred_df[(comp, 'sales_high_prev')] = ysim['sales_high_prev']
        
        pred_df[(comp, 'sales_low_000'.format(nfut-1))] = ysim['sales_low_000'.format(nfut-1)]
        pred_df[(comp, 'sales_high_{:03d}'.format(nfut-1))] = ysim['sales_high_{:03d}'.format(nfut-1)]
        pred_df[(comp, 'offer_end_{:03d}'.format(nfut-1))] = ysim['offer_end_{:03d}'.format(nfut-1)]
    pred_df.index = comp_dict[comp][0].index.values[:ntest]
    return pred_df

df_pred = get_prediction_df(100) 


In [106]:
def simulate(pred_df, marginal = 1.5, roc_thres = .6, dealer = 3.):
    
    shop_df = pd.DataFrame(columns = ['index','day','comp','nstock', 'buy', 'sell'])
    shop_df.set_index(['index','day', 'comp'], inplace = True)
    
    # Defines the condition when stock is bought
    buy_condition = {'sale_low_change+1':False, 'sale_high_change':True}
    free_money = 1e4
    action = False
    for i, day in enumerate(df_pred.index[::-1]):
        basket = []
        for comp in companies:
            can_shop = False
            # Condition to buy:
            if all([pred_df.loc[day, comp][key] == val for key,val in buy_condition.items()]) and \
                i+2 < len(df_pred):
                # Make sure today sale low is smaller than yesterday:
                if pred_df.loc[day, comp]['sales_low_000'] < pred_df.loc[day, comp]['sales_low_prev']:
                    buy_price = pred_df.loc[day, comp]['sales_low_prev']
                    
                    if buy_price*(1+marginal/100.) \
                    < pred_df.loc[day, comp]['sales_high_{:03d}'.format(nfut-1)]: 
                        sell_price = buy_price*(1+marginal/100.)
                    else:
                        sell_price = pred_df.loc[day, comp]['offer_end_{:03d}'.format(nfut-1)]
                        #print('Fell', buy_price, sell_price)
                    
                    can_shop = True
            if can_shop:
                basket.append((comp, buy_price, sell_price))
        
        for scomp, buy, sell in basket:
            low_roc = opm_params[scomp]['sale_low_change+1']['roc_auc']
            high_roc = opm_params[scomp]['sale_high_change']['roc_auc']
            if (roc_thres < np.array([low_roc, high_roc])).all():
                n_stock = int(free_money/2/buy)
                free_money -= n_stock*buy  + dealer
                shop_df.loc[(i, day, scomp), ['nstock', 'buy', 'sell']] = n_stock, buy, sell 
                action = True
                print('Bought {} price={:.02f}'.format(scomp, buy))
        if action:
            # Time to sell:
            if i-1 in list(zip(*shop_df.index.values))[0]:
                for idx in [idx for idx in shop_df.index.values if idx[0] == i-1]:
                    n, price = shop_df.loc[idx][['nstock', 'sell']]
                    free_money += n*price - dealer
                    print('Sold {} price={:.02f}'.format(idx[2], price))
                print()
    print('Money in end: {:0f}'.format(free_money))
    
simulate(df_pred, 5, .615)

Bought Kemira price=11.20
Sold Kemira price=11.36

Bought Amer Sports A price=21.23
Sold Amer Sports A price=20.97

Bought Elisa price=34.76
Bought Technopolis price=3.66
Sold Elisa price=34.95
Sold Technopolis price=3.73

Bought Technopolis price=3.67
Bought Kemira price=11.57
Sold Technopolis price=3.64

Bought Kemira price=11.54
Sold Kemira price=11.48

Sold Kemira price=11.43

Bought Amer Sports A price=22.12
Bought Citycon price=2.29
Sold Amer Sports A price=22.06
Sold Citycon price=2.27

Bought Kemira price=11.09
Bought Neste price=34.27
Sold Kemira price=11.20
Sold Neste price=34.75

Bought Elisa price=33.72
Bought Neste price=34.45
Bought Technopolis price=3.69
Sold Elisa price=33.74
Sold Neste price=34.96
Sold Technopolis price=3.68

Bought Amer Sports A price=22.55
Sold Amer Sports A price=23.35

Bought Kemira price=11.31
Sold Kemira price=11.45

Bought Amer Sports A price=23.13
Bought Technopolis price=3.68
Bought Elisa price=34.91
Sold Amer Sports A price=23.26
Sold Technop