In [94]:
# Scratch cell: prints the RandomizedSearchCV docstring for reference only.
# NOTE(review): RandomizedSearchCV is not used by any other cell visible here;
# consider dropping this cell from the final notebook.
from sklearn.model_selection import RandomizedSearchCV

help(RandomizedSearchCV)

Help on class RandomizedSearchCV in module sklearn.model_selection._search:

class RandomizedSearchCV(BaseSearchCV)
 |  RandomizedSearchCV(estimator, param_distributions, *, n_iter=10, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=nan, return_train_score=False)
 |  
 |  Randomized search on hyper parameters.
 |  
 |  RandomizedSearchCV implements a "fit" and a "score" method.
 |  It also implements "score_samples", "predict", "predict_proba",
 |  "decision_function", "transform" and "inverse_transform" if they are
 |  implemented in the estimator used.
 |  
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated search over parameter settings.
 |  
 |  In contrast to GridSearchCV, not all parameter values are tried out, but
 |  rather a fixed number of parameter settings is sampled from the specified
 |  distributions. The number of parameter settings that are tried is
 |  

In [133]:
#%%writefile train_model.py
from datetime import datetime
from typing import List, Dict, Optional, NamedTuple

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import precision_recall_curve

# Column names shared by the functions defined in this cell.
USER_ID_COL = 'ID'
DATE_COL = 'ActiveDate'
TARGET_COL = 'target'
# Seed; reused below as MODEL_PARAMS['random_state'].
RANDOM_STATE = 0

# Keyword arguments unpacked into lgb.LGBMClassifier(...) by train_model.
# NOTE(review): a tree limited to max_depth=5 can hold at most 2**5 = 32 leaves,
# so num_leaves=50 is presumably never the binding constraint - confirm intent.
MODEL_PARAMS = {
    'max_depth': 5,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'min_data_in_leaf': 30,
    'learning_rate': 0.02,
    'num_leaves': 50,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.65,
    'num_boost_round': 500,
    'random_state': RANDOM_STATE
}

# Keyword arguments passed to estimator.fit(...) by train_model, which adds
# 'eval_set', 'eval_metric' and (optionally) 'sample_weight' at call time.
# NOTE(review): newer LightGBM releases appear to have moved early stopping and
# log verbosity from fit() keywords to callbacks - confirm against the pinned version.
FIT_PARAMS = {
    'early_stopping_rounds':35,
    'verbose': 50
}



# Custom data containers
class SplitedData(NamedTuple):
    """The three data partitions produced by ``data_spliting``.

    Every field is a dict with two keys, as built by ``data_spliting``:
    ``'X'`` holds the partition's columns other than ``TARGET_COL`` and
    ``'y'`` holds the partition's ``TARGET_COL`` column.

    NOTE(review): ``'y'`` is selected with a single column label, so it is
    presumably a ``pd.Series`` rather than a ``pd.DataFrame`` - the value
    annotation below is looser than it reads.
    """
    # Partition used to fit the estimator.
    train: Dict[str, pd.DataFrame]
    # Partition used as the fit-time eval set and for threshold selection.
    test: Dict[str, pd.DataFrame]
    # Remaining rows; not read by any function in this cell.
    validation: Dict[str, pd.DataFrame]
        

class ModelArtifacts(NamedTuple):
    """Everything ``model_exe`` returns from one training run."""
    # The fitted LightGBM classifier.
    estimator: lgb.LGBMClassifier
    # The train / test / validation partitions the run used.
    splited_data: SplitedData
    # Feature columns the estimator was fitted on.
    train_cols: List[str]
    # Probability cut-off chosen by extract_optimal_threshold (max F1 on the test partition).
    optimal_threshold: float
        
        
def data_spliting(full_data: pd.DataFrame, train_ratio_size: float = 0.80,
                  test_ratio_size: float = 0.10):
    """Split ``full_data`` chronologically into train / test / validation parts.

    The non-null ``USER_ID_COL`` values are counted per ``DATE_COL`` value,
    accumulated over the grouped dates and divided by the total number of
    rows. The train cut-off is the date holding the largest cumulative share
    that is still ``<= train_ratio_size``; the test cut-off is the date holding
    the largest cumulative share that is still
    ``<= train_ratio_size + test_ratio_size``.

    Args:
        full_data: Frame containing ``DATE_COL``, ``USER_ID_COL`` and
            ``TARGET_COL``.
        train_ratio_size: Upper bound on the cumulative row share assigned to
            the train partition.
        test_ratio_size: Additional cumulative row share assigned to the test
            partition.

    Returns:
        SplitedData whose ``train`` rows have dates up to the train cut-off,
        ``test`` rows lie after it and up to the test cut-off, and
        ``validation`` rows lie after the test cut-off. In every partition
        ``'X'`` is all columns except ``TARGET_COL`` and ``'y'`` is
        ``TARGET_COL``.

    NOTE(review): if no date has a cumulative share within a bound, the
    selection is empty and ``idxmax`` presumably raises - confirm whether the
    caller should be guarded against that.
    """
    rows_per_date = full_data.groupby(DATE_COL)[USER_ID_COL].count()
    cumulative_share = (rows_per_date.cumsum() / full_data.index.size).rename('accumlate_percetage')

    train_limit = train_ratio_size
    test_limit = train_ratio_size + test_ratio_size
    max_train_date = cumulative_share[cumulative_share <= train_limit].idxmax()
    max_test_date = cumulative_share[cumulative_share <= test_limit].idxmax()

    # One boolean selection per partition, all derived from the date column.
    dates = full_data[DATE_COL]
    partitions = {
        'train': full_data[dates <= max_train_date],
        'test': full_data[(dates > max_train_date) & (dates <= max_test_date)],
        'validation': full_data[dates > max_test_date],
    }

    feature_cols = partitions['train'].columns.difference([TARGET_COL])

    return SplitedData(**{
        part_name: {'X': part[feature_cols], 'y': part[TARGET_COL]}
        for part_name, part in partitions.items()
    })
    
    
def train_model(splited_data: SplitedData, train_cols: List[str],
                train_weigts_vector: Optional[List[float]] = None):
    """Fit a LightGBM classifier on the train partition.

    The estimator is built from ``MODEL_PARAMS`` and fitted with
    ``FIT_PARAMS`` plus an eval set made of the test partition (metric
    ``'logloss'``).

    Args:
        splited_data: Partitions produced by ``data_spliting``.
        train_cols: Feature columns selected from each partition's ``'X'``.
        train_weigts_vector: Optional per-row weights for the train partition,
            forwarded to ``fit`` as ``sample_weight``. Any non-``None`` value
            is forwarded (not only ``list`` instances).

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    # Build the classifier.
    estimator = lgb.LGBMClassifier(**MODEL_PARAMS)

    # Work on a per-call copy: writing into the module-level FIT_PARAMS would
    # leak this call's eval set and 'sample_weight' into every later call
    # (e.g. a weighted run followed by an unweighted one would stay weighted).
    fit_params = dict(FIT_PARAMS)
    fit_params['eval_set'] = [[splited_data.test['X'][train_cols], splited_data.test['y']]]
    fit_params['eval_metric'] = ['logloss']

    # `is not None` rather than isinstance(list): weights given as an ndarray
    # or Series used to be silently dropped.
    if train_weigts_vector is not None:
        fit_params['sample_weight'] = train_weigts_vector

    estimator.fit(splited_data.train['X'][train_cols], splited_data.train['y'], **fit_params)

    return estimator


def extract_optimal_threshold(estimator: lgb.LGBMClassifier, splited_data: SplitedData,
                              train_cols: List[str]):
    """Pick the probability threshold with the highest F1 on the test partition.

    Args:
        estimator: Fitted classifier exposing ``predict_proba``.
        splited_data: Partitions produced by ``data_spliting``; only ``test``
            is read.
        train_cols: Feature columns passed to ``predict_proba``.

    Returns:
        The entry of the precision-recall curve's ``thresholds`` at the index
        where F1 is largest. F1 is treated as 0 wherever precision + recall
        is 0.

    NOTE(review): scikit-learn documents ``precision``/``recall`` as one
    element longer than ``thresholds``; the index is taken over the longer
    array - confirm the final curve point can never be the maximum.
    """
    positive_proba = estimator.predict_proba(splited_data.test['X'][train_cols])[:, 1]
    precision, recall, thresholds = precision_recall_curve(splited_data.test['y'], positive_proba)

    pr_sum = recall + precision
    f1_scores = np.divide(
        2 * recall * precision,
        pr_sum,
        out=np.zeros_like(pr_sum),
        where=(pr_sum != 0),
    )
    best_idx = np.argmax(f1_scores)

    return thresholds[best_idx]


def model_exe(full_data: pd.DataFrame, train_cols: List[str],
              positve_weight: Optional[float] = None) -> ModelArtifacts:
    """Run the whole pipeline: split, train, and choose a decision threshold.

    Args:
        full_data: Frame handed to ``data_spliting`` (default ratios).
        train_cols: Feature columns used for fitting and prediction.
        positve_weight: When truthy, train rows with target ``1`` get this
            sample weight and all other rows get ``1``; when falsy (``None``
            or ``0``), no sample weights are passed.

    Returns:
        ModelArtifacts bundling the fitted estimator, the partitions, the
        feature columns and the chosen threshold.
    """
    splited_data = data_spliting(full_data)

    train_weigts_vector = (
        list(np.where(splited_data.train['y'] == 1, positve_weight, 1))
        if positve_weight
        else None
    )

    fitted = train_model(splited_data, train_cols, train_weigts_vector=train_weigts_vector)
    threshold = extract_optimal_threshold(fitted, splited_data, train_cols)

    return ModelArtifacts(
        estimator=fitted,
        splited_data=splited_data,
        train_cols=train_cols,
        optimal_threshold=threshold,
    )


In [129]:
%%time
# NOTE(review): `preprocess` and `feature_engineering` are presumably
# project-local modules; their source is not part of this notebook.
import preprocess
import feature_engineering
# Build the raw frame, then the pre-split features. Later cells read
# `.enriched_data` and `.lag_features_organizer.all_new_features` from the response.
rawdata = preprocess.preprocess()
feature_engineering_response = feature_engineering.presplit_feature_engineering(rawdata)

  call = lambda f, *a, **k: f(*a, **k)


CPU times: user 11.5 s, sys: 3.99 s, total: 15.5 s
Wall time: 17.3 s


In [None]:
# Train with target == 1 rows weighted 2.5 and every other train row weighted 1.
model_artifacts = model_exe(
    full_data=feature_engineering_response.enriched_data,
    train_cols=feature_engineering_response.lag_features_organizer.all_new_features,
    positve_weight=2.5,
)



Training until validation scores don't improve for 35 rounds
[50]	valid_0's binary_logloss: 0.186015
[100]	valid_0's binary_logloss: 0.17094
[150]	valid_0's binary_logloss: 0.165488
[200]	valid_0's binary_logloss: 0.163311
[250]	valid_0's binary_logloss: 0.162361


In [122]:
model_artifacts.optimal_threshold

0.15872793629329765

In [None]:
#     max_train_date = accumlate_percetage[
#         min(accumlate_percetage, key=lambda data_ratio: abs(data_ratio-train_ratio_size))==accumlate_percetage].index[0]
    
#     max_test_date = accumlate_percetage[
#         min(accumlate_percetage, key=lambda data_ratio: abs(data_ratio-(train_ratio_size+test_ratio_size)))==accumlate_percetage].index[0]

In [12]:
# Use the response object created by the feature-engineering cell; the name
# previously read here is never defined in this notebook, so the cell failed
# on a fresh kernel (Restart & Run All).
full_data = feature_engineering_response.enriched_data

In [19]:
# Rows where EOM_Equity_lag_1 < 25, restricted to the equity and target columns.
full_data.loc[
    full_data['EOM_Equity_lag_1'] < 25,
    ['EOM_Equity', 'EOM_Equity_lag_1', 'next_EOM_Equity', 'target'],
]

Unnamed: 0,EOM_Equity,EOM_Equity_lag_1,next_EOM_Equity,target
252142,98.26,5.00,5.07,1
74000,5048.90,0.00,0.00,1
74001,1678.80,0.00,2032.37,0
14241,1853.44,0.00,2152.32,0
244560,401.13,2.12,392.45,0
...,...,...,...,...
223120,376.42,0.00,385.77,0
30443,919.88,0.00,70.22,0
108665,204.72,0.53,850.92,0
154066,50.22,0.00,24.98,1


In [None]:
# TODO(review): unfinished cell - `by=''` names no sort key and presumably raises
# KeyError unless a column is literally named ''. Fill in the intended column
# or delete the cell.
full_data.sort_values(by='')